/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
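
/*
 * A minimal sketch of the field arithmetic described above (illustrative
 * only; the driver uses the VDEV_RAIDZ_MUL_* macros below and the
 * vectorized implementations in vdev_raidz_math.c). gf_mul2() is the C
 * expression from the comment; gf_p() and gf_q() compute P and Q for a toy
 * stripe of data bytes using XOR and Horner's rule, respectively:
 *
 *	static uint8_t
 *	gf_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
 *	}
 *
 *	static uint8_t
 *	gf_p(const uint8_t *d, int n)
 *	{
 *		uint8_t p = 0;
 *		for (int i = 0; i < n; i++)
 *			p ^= d[i];
 *		return (p);
 *	}
 *
 *	static uint8_t
 *	gf_q(const uint8_t *d, int n)
 *	{
 *		uint8_t q = 0;
 *		for (int i = 0; i < n; i++)
 *			q = gf_mul2(q) ^ d[i];
 *		return (q);
 *	}
 *
 * For example, for the two data bytes {0x01, 0x80}: P = 0x01 ^ 0x80 = 0x81,
 * and Q = 2 * D_0 + D_1 = gf_mul2(0x01) ^ 0x80 = 0x02 ^ 0x80 = 0x82.
 */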

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
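
/*
 * Illustrative sanity check (not part of the build): VDEV_RAIDZ_64MUL_2()
 * applied to a packed word must match VDEV_RAIDZ_MUL_2() applied to each
 * byte. The mask trick works because ((mask) << 1) - ((mask) >> 7) expands
 * each 0x80 top bit into a full 0xff byte, selecting the 0x1d reduction
 * only for bytes whose high bit was set:
 *
 *	uint64_t x = 0x8001400210020801ULL, mask;
 *	uint64_t y = x;
 *	VDEV_RAIDZ_64MUL_2(y, mask);
 *	for (int i = 0; i < 8; i++) {
 *		uint8_t b = (x >> (8 * i)) & 0xff;
 *		ASSERT3U((y >> (8 * i)) & 0xff, ==,
 *		    VDEV_RAIDZ_MUL_2(b) & 0xff);
 *	}
 */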


/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs
 * that have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc). The exceptions are zpool
 * checkpoint, zpool trim, and zpool initialize, which can't be run during an
 * expansion. Following a reboot or export/import, the expansion resumes
 * where it left off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn't change but the location of
 * the text changes to accommodate the new width. An example reflow result for
 * a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                          Reflow End State
 *          Each letter indicates a parity group (logical stripe)
 *
 *      Before expansion                          After Expansion
 * D1     D2     D3     D4                 D1     D2     D3     D4     D5
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | A    | A    | A    | A    |           | A    | A    | A    | A    | B    |
 * |     1|     2|     3|     4|           |     1|     2|     3|     4|     5|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | B    | B    | C    | C    |           | B    | C    | C    | C    | C    |
 * |     5|     6|     7|     8|           |     6|     7|     8|     9|    10|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | C    | C    | D    | D    |           | D    | D    | E    | E    | E    |
 * |     9|    10|    11|    12|           |    11|    12|    13|    14|    15|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | E    | E    | E    | E    |   -->     | E    | F    | F    | G    | G    |
 * |    13|    14|    15|    16|           |    16|    17|    18|p   19|    20|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | F    | F    | G    | G    |           | G    | G    | H    | H    | H    |
 * |    17|    18|    19|    20|           |    21|    22|    23|    24|    25|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | G    | G    | H    | H    |           | H    | I    | I    | J    | J    |
 * |    21|    22|    23|    24|           |    26|    27|    28|    29|    30|
 * +------+------+------+------+           +------+------+------+------+------+
 * |      |      |      |      |           |      |      |      |      |      |
 * | H    | H    | I    | I    |           | J    | J    |      |      | K    |
 * |    25|    26|    27|    28|           |    31|    32|    33|    34|    35|
 * +------+------+------+------+           +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn't need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks). This means we
 * cannot overwrite data during the reflow that would be needed if a disk is
 * lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in memory.
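 *
 * The remapping itself is simple arithmetic. As a sketch (assuming one
 * sector per cell, as in the diagram above), a logical sector index b lives
 * on child (b % width) at row (b / width), so the reflow only recomputes
 * that pair with the new width:
 *
 *	(hypothetical helper; 0-based logical sector index 'b')
 *	static void
 *	reflow_location(uint64_t b, uint64_t width,
 *	    uint64_t *child, uint64_t *row)
 *	{
 *		*child = b % width;
 *		*row = b / width;
 *	}
 *
 * With b = 5 (sector "6" in the 1-based diagram), width 4 gives child 1,
 * row 1 (D2) while width 5 gives child 0, row 1 (D1), which is exactly why
 * writing B6's new location overwrites B5's old location (see the Scratch
 * Area section below).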
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
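 *
 * To check the arithmetic on the quoted maximum: 2 * 255^2 = 130050 sectors,
 * which at 4K per sector is ~508MB in total, i.e. ~2MB per disk when spread
 * across 255 children. For a more typical new_width of 6, the same formula
 * needs only 2 * 6^2 = 72 sectors (288K in total), far below what the
 * 3.5 MiB per-disk boot area described below can hold.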
 *
 * The scratch area is persisted to disk, and it holds a large amount of
 * reflowed state. We can always read the partially written stripes when a
 * disk fails or the copy is interrupted (crash) during the initial copying
 * phase, and we can also get past a small chunk size restriction. At a
 * minimum, the scratch space must be large enough to get us to the point
 * that one row does not overlap itself when moved (i.e., new_width^2). But
 * going larger is even better. We use the 3.5 MiB reserved "boot" space that
 * resides after the ZFS disk labels as our scratch space to handle
 * overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-----------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |      (Metaslabs)
 *	+------+------+-----------------------+-----------------------------
 *	                Scratch Area
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV. This time-dependent geometry
 * uses the block's birth time (+ the time expansion ended) to establish the
 * correct width for a given block. After an expansion completes, we record
 * the time for blocks written with a particular width (geometry).
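 *
 * A sketch of the lookup this implies (names here are illustrative
 * assumptions; the driver keeps reflow_node_t entries sorted by re_txg in an
 * AVL tree, compared by vdev_raidz_reflow_compare() below). One plausible
 * scheme: each tree entry records the first txg at which a given logical
 * width was in effect, so a block's width is the entry with the greatest
 * re_txg <= the block's birth txg, falling back to the original width:
 *
 *	(hypothetical lookup; re_logical_width is an assumed field name)
 *	static uint64_t
 *	logical_width_at(avl_tree_t *tree, uint64_t birth_txg,
 *	    uint64_t orig_width)
 *	{
 *		reflow_node_t search = { .re_txg = birth_txg };
 *		avl_index_t where;
 *		reflow_node_t *re = avl_find(tree, &search, &where);
 *		if (re == NULL)
 *			re = avl_nearest(tree, where, AVL_BEFORE);
 *		return (re == NULL ? orig_width : re->re_logical_width);
 *	}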
 *
 * == On Disk Format Changes ==
 *
 * New pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been expanded yet, we need to
 * know which way a block is laid out before reading it. This info is the
 * next offset that needs to be reflowed and we persist that in the
 * uberblock, in the new ub_raidz_reflow_info field, as opposed to the MOS
 * or the vdev label. After the expansion is complete, we then use the
 * raidz_expand_txgs array (see below) to determine how to read a block and
 * the ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
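 * One plausible packing of both values into a single uint64_t (purely
 * illustrative; the actual on-disk encoding is defined alongside
 * ub_raidz_reflow_info in uberblock_impl.h) reserves a few low bits for the
 * scratch-space state and keeps the reflow offset in the remaining bits:
 *
 *	(hypothetical packing, not the on-disk encoding)
 *	#define	RRI_STATE_BITS		3
 *	#define	RRI_GET_STATE(info)	\
 *		((info) & ((1ULL << RRI_STATE_BITS) - 1))
 *	#define	RRI_GET_OFFSET(info)	((info) >> RRI_STATE_BITS)
 *	#define	RRI_SET(offset, state)	\
 *		(((offset) << RRI_STATE_BITS) | (state))
 *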
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top ZAP adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */
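
/*
 * For illustration, the 'raidz_expand_txgs' array described above can be
 * read back from a vdev's config nvlist with the stock libnvpair accessor
 * (a sketch; the bare name string and the surrounding code are assumptions,
 * not the driver's import logic):
 *
 *	uint64_t *expand_txgs;
 *	uint_t ntxgs;
 *	if (nvlist_lookup_uint64_array(config, "raidz_expand_txgs",
 *	    &expand_txgs, &ntxgs) != 0)
 *		ntxgs = 0;
 *
 * On success the array holds one entry per completed expansion, in txg
 * order.
 */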

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef	_KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O that can be outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
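
	/*
	 * Worked example (illustrative numbers): for a 5-child RAIDZ1
	 * (dcols = 5, nparity = 1) and a 7-sector write (s = 7), we get
	 * q = 7 / 4 = 1 full row, r = 7 - 4 * 1 = 3 remainder sectors,
	 * bc = 3 + 1 = 4 "big columns", and tot = 7 + 1 * (1 + 1) = 9
	 * sectors of data + parity, which leaves roundup(9, 2) - 9 = 1
	 * skip sector (rm_nskip, computed below).
	 */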

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row. Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync). Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors). This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

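				/*
				 * Worked example of the offset math below
				 * (illustrative numbers): the data portion
				 * of the zio's ABD is laid out column by
				 * column (see "Time Dependent Geometry"
				 * above), so when r == 0 data column dc of
				 * this row starts at ABD sector
				 * (dc * rows + row): with rows = 2, dc = 0
				 * covers sectors 0..1, dc = 1 covers 2..3,
				 * and so on. When r != 0, columns past the
				 * "big columns" have only (rows - 1)
				 * sectors, which the second formula
				 * accounts for.
				 */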
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
9327877fdebSMatt Macy ASSERT3U(asize, ==, tot << ashift);
933eda14cbcSMatt Macy
934eda14cbcSMatt Macy /*
935eda14cbcSMatt Macy * Determine if the block is contiguous, in which case we can use
936eda14cbcSMatt Macy * an aggregation.
937eda14cbcSMatt Macy */
938eda14cbcSMatt Macy if (rows >= raidz_io_aggregate_rows) {
939eda14cbcSMatt Macy rm->rm_nphys_cols = physical_cols;
940eda14cbcSMatt Macy rm->rm_phys_col =
941eda14cbcSMatt Macy kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
942eda14cbcSMatt Macy KM_SLEEP);
9437877fdebSMatt Macy
9447877fdebSMatt Macy /*
945eda14cbcSMatt Macy * Determine the aggregate io's offset and size, and check
9467877fdebSMatt Macy * that the io is contiguous.
947eda14cbcSMatt Macy */
948eda14cbcSMatt Macy for (int i = 0;
949eda14cbcSMatt Macy i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
950eda14cbcSMatt Macy raidz_row_t *rr = rm->rm_row[i];
951eda14cbcSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
952eda14cbcSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
953eda14cbcSMatt Macy raidz_col_t *prc =
954eda14cbcSMatt Macy &rm->rm_phys_col[rc->rc_devidx];
9557877fdebSMatt Macy
9567877fdebSMatt Macy if (rc->rc_size == 0)
9577877fdebSMatt Macy continue;
9587877fdebSMatt Macy
959eda14cbcSMatt Macy if (prc->rc_size == 0) {
9607877fdebSMatt Macy ASSERT0(prc->rc_offset);
9617877fdebSMatt Macy prc->rc_offset = rc->rc_offset;
9627877fdebSMatt Macy } else if (prc->rc_offset + prc->rc_size !=
9637877fdebSMatt Macy rc->rc_offset) {
9647877fdebSMatt Macy /*
9657877fdebSMatt Macy * This block is not contiguous and
966eda14cbcSMatt Macy * therefore can't be aggregated.
9677877fdebSMatt Macy * This is expected to be rare, so
968eda14cbcSMatt Macy * the cost of allocating and then
9697877fdebSMatt Macy * freeing rm_phys_col is not
9707877fdebSMatt Macy * significant.
971eda14cbcSMatt Macy */
972eda14cbcSMatt Macy kmem_free(rm->rm_phys_col,
973eda14cbcSMatt Macy sizeof (raidz_col_t) *
9747877fdebSMatt Macy rm->rm_nphys_cols);
9757877fdebSMatt Macy rm->rm_phys_col = NULL;
9767877fdebSMatt Macy rm->rm_nphys_cols = 0;
9777877fdebSMatt Macy break;
978eda14cbcSMatt Macy }
979eda14cbcSMatt Macy prc->rc_size += rc->rc_size;
980eda14cbcSMatt Macy }
981eda14cbcSMatt Macy }
982eda14cbcSMatt Macy }
983eda14cbcSMatt Macy if (rm->rm_phys_col != NULL) {
984eda14cbcSMatt Macy /*
985eda14cbcSMatt Macy * Allocate aggregate ABD's.
986eda14cbcSMatt Macy */
987eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nphys_cols; i++) {
988eda14cbcSMatt Macy raidz_col_t *prc = &rm->rm_phys_col[i];
989eda14cbcSMatt Macy
990eda14cbcSMatt Macy prc->rc_devidx = i;
991eda14cbcSMatt Macy
992eda14cbcSMatt Macy if (prc->rc_size == 0)
993eda14cbcSMatt Macy continue;
994eda14cbcSMatt Macy
9957877fdebSMatt Macy prc->rc_abd =
996eda14cbcSMatt Macy abd_alloc_linear(rm->rm_phys_col[i].rc_size,
997eda14cbcSMatt Macy B_FALSE);
998eda14cbcSMatt Macy }
999eda14cbcSMatt Macy
1000eda14cbcSMatt Macy /*
1001eda14cbcSMatt Macy * Point the parity abd's into the aggregate abd's.
1002eda14cbcSMatt Macy */
1003eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
1004eda14cbcSMatt Macy raidz_row_t *rr = rm->rm_row[i];
1005eda14cbcSMatt Macy for (int c = 0; c < rr->rr_firstdatacol; c++) {
1006eda14cbcSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
1007eda14cbcSMatt Macy raidz_col_t *prc =
1008eda14cbcSMatt Macy &rm->rm_phys_col[rc->rc_devidx];
10097877fdebSMatt Macy rc->rc_abd =
10107877fdebSMatt Macy abd_get_offset_struct(&rc->rc_abdstruct,
1011eda14cbcSMatt Macy prc->rc_abd,
1012eda14cbcSMatt Macy rc->rc_offset - prc->rc_offset,
1013eda14cbcSMatt Macy rc->rc_size);
1014eda14cbcSMatt Macy }
10157877fdebSMatt Macy }
10167877fdebSMatt Macy } else {
1017eda14cbcSMatt Macy /*
1018eda14cbcSMatt Macy * Allocate new abd's for the parity sectors.
1019eda14cbcSMatt Macy */
1020eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
1021eda14cbcSMatt Macy raidz_row_t *rr = rm->rm_row[i];
1022eda14cbcSMatt Macy for (int c = 0; c < rr->rr_firstdatacol; c++) {
1023eda14cbcSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
1024eda14cbcSMatt Macy rc->rc_abd =
1025eda14cbcSMatt Macy abd_alloc_linear(rc->rc_size,
1026eda14cbcSMatt Macy B_TRUE);
1027eda14cbcSMatt Macy }
1028eda14cbcSMatt Macy }
1029eda14cbcSMatt Macy }
1030eda14cbcSMatt Macy /* init RAIDZ parity ops */
1031eda14cbcSMatt Macy rm->rm_ops = vdev_raidz_math_get_ops();
1032eda14cbcSMatt Macy
1033eda14cbcSMatt Macy return (rm);
1034eda14cbcSMatt Macy }
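
/*
 * Aggregation sketch (illustrative offsets, not taken from a real pool):
 * if two rows land contiguously on device d, e.g.
 *
 *	row 0: rc_offset 0x1000, rc_size 0x800
 *	row 1: rc_offset 0x1800, rc_size 0x800
 *
 * then rm_phys_col[d] spans offset 0x1000 for 0x1000 bytes and can be
 * issued as one child I/O, and each row's rc_abd becomes a window into
 * the aggregate ABD at (rc_offset - prc->rc_offset), which is what the
 * abd_get_offset_struct() calls above set up.
 */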
1035eda14cbcSMatt Macy
1036eda14cbcSMatt Macy struct pqr_struct {
1037eda14cbcSMatt Macy uint64_t *p;
1038eda14cbcSMatt Macy uint64_t *q;
1039eda14cbcSMatt Macy uint64_t *r;
1040eda14cbcSMatt Macy };
1041eda14cbcSMatt Macy
1042eda14cbcSMatt Macy static int
1043eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
1044eda14cbcSMatt Macy {
1045eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1046eda14cbcSMatt Macy const uint64_t *src = buf;
1047eda14cbcSMatt Macy int cnt = size / sizeof (src[0]);
1048eda14cbcSMatt Macy
1049eda14cbcSMatt Macy ASSERT(pqr->p && !pqr->q && !pqr->r);
1050eda14cbcSMatt Macy
1051eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, src++, pqr->p++)
1052eda14cbcSMatt Macy *pqr->p ^= *src;
1053eda14cbcSMatt Macy
1054eda14cbcSMatt Macy return (0);
1055eda14cbcSMatt Macy }
1056eda14cbcSMatt Macy
1057eda14cbcSMatt Macy static int
1058eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
1059eda14cbcSMatt Macy {
1060eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1061eda14cbcSMatt Macy const uint64_t *src = buf;
1062eda14cbcSMatt Macy uint64_t mask;
1063eda14cbcSMatt Macy int cnt = size / sizeof (src[0]);
1064eda14cbcSMatt Macy
1065eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && !pqr->r);
1066eda14cbcSMatt Macy
1067eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
1068eda14cbcSMatt Macy *pqr->p ^= *src;
1069eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1070eda14cbcSMatt Macy *pqr->q ^= *src;
1071eda14cbcSMatt Macy }
1072eda14cbcSMatt Macy
1073eda14cbcSMatt Macy return (0);
1074eda14cbcSMatt Macy }
1075eda14cbcSMatt Macy
1076eda14cbcSMatt Macy static int
1077eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1078eda14cbcSMatt Macy {
1079eda14cbcSMatt Macy struct pqr_struct *pqr = private;
1080eda14cbcSMatt Macy const uint64_t *src = buf;
1081eda14cbcSMatt Macy uint64_t mask;
1082eda14cbcSMatt Macy int cnt = size / sizeof (src[0]);
1083eda14cbcSMatt Macy
1084eda14cbcSMatt Macy ASSERT(pqr->p && pqr->q && pqr->r);
1085eda14cbcSMatt Macy
1086eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
1087eda14cbcSMatt Macy *pqr->p ^= *src;
1088eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1089eda14cbcSMatt Macy *pqr->q ^= *src;
1090eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1091eda14cbcSMatt Macy *pqr->r ^= *src;
1092eda14cbcSMatt Macy }
1093eda14cbcSMatt Macy
1094eda14cbcSMatt Macy return (0);
1095eda14cbcSMatt Macy }
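
/*
 * For reference, VDEV_RAIDZ_64MUL_2() above multiplies each of the eight
 * bytes packed into a 64-bit word by 2 in GF(2^8) with the 0x11d
 * reduction polynomial, and VDEV_RAIDZ_64MUL_4() is two such doublings.
 * A byte-at-a-time sketch of the same operation (illustrative only, not
 * used by the parity functions above):
 */
static inline uint8_t
vdev_raidz_sketch_mul2(uint8_t b)
{
	/* Shift in x, then reduce modulo x^8 + x^4 + x^3 + x^2 + 1. */
	return ((b << 1) ^ ((b & 0x80) ? 0x1d : 0));
}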
1096eda14cbcSMatt Macy
1097eda14cbcSMatt Macy static void
1098eda14cbcSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr)
1099eda14cbcSMatt Macy {
1100eda14cbcSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1101eda14cbcSMatt Macy
1102eda14cbcSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1103eda14cbcSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1104eda14cbcSMatt Macy
1105eda14cbcSMatt Macy if (c == rr->rr_firstdatacol) {
1106eda14cbcSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1107eda14cbcSMatt Macy } else {
1108eda14cbcSMatt Macy struct pqr_struct pqr = { p, NULL, NULL };
1109eda14cbcSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1110eda14cbcSMatt Macy vdev_raidz_p_func, &pqr);
1111eda14cbcSMatt Macy }
1112eda14cbcSMatt Macy }
1113eda14cbcSMatt Macy }
1114eda14cbcSMatt Macy
1115eda14cbcSMatt Macy static void
1116eda14cbcSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr)
1117eda14cbcSMatt Macy {
1118eda14cbcSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1119eda14cbcSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1120eda14cbcSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1121eda14cbcSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1122eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1123eda14cbcSMatt Macy
1124eda14cbcSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1125eda14cbcSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1126eda14cbcSMatt Macy
1127eda14cbcSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1128eda14cbcSMatt Macy
1129eda14cbcSMatt Macy if (c == rr->rr_firstdatacol) {
1130eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0);
1131eda14cbcSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1132eda14cbcSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size);
1133eda14cbcSMatt Macy
1134eda14cbcSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1135eda14cbcSMatt Macy p[i] = 0;
1136eda14cbcSMatt Macy q[i] = 0;
1137eda14cbcSMatt Macy }
1138eda14cbcSMatt Macy } else {
1139eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, NULL };
1140eda14cbcSMatt Macy
1141eda14cbcSMatt Macy ASSERT(ccnt <= pcnt);
1142eda14cbcSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1143eda14cbcSMatt Macy vdev_raidz_pq_func, &pqr);
1144eda14cbcSMatt Macy
1145eda14cbcSMatt Macy /*
1146eda14cbcSMatt Macy * Treat short columns as though they are full of 0s.
1147eda14cbcSMatt Macy * Note that there's therefore nothing needed for P.
1148eda14cbcSMatt Macy */
1149eda14cbcSMatt Macy uint64_t mask;
1150eda14cbcSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1151eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask);
1152eda14cbcSMatt Macy }
1153eda14cbcSMatt Macy }
1154eda14cbcSMatt Macy }
1155eda14cbcSMatt Macy }
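
/*
 * The loop above amounts to Horner's rule in GF(2^8): each pass doubles
 * the running Q and then adds (XORs in) the next data column, so with,
 * for example, four data columns the result is
 *
 *	Q = ((D_0 * 2 + D_1) * 2 + D_2) * 2 + D_3
 *	  = D_0 * 2^3 + D_1 * 2^2 + D_2 * 2 + D_3
 *
 * giving every column a distinct power-of-2 coefficient, which is what
 * makes Q linearly independent of the plain XOR parity P.
 */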
1156eda14cbcSMatt Macy
1157eda14cbcSMatt Macy static void
1158eda14cbcSMatt Macy vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
1159eda14cbcSMatt Macy {
1160eda14cbcSMatt Macy uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161eda14cbcSMatt Macy uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162eda14cbcSMatt Macy uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1163eda14cbcSMatt Macy uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1164eda14cbcSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1165eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1166eda14cbcSMatt Macy ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1167eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_R].rc_size);
1168eda14cbcSMatt Macy
1169eda14cbcSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1170eda14cbcSMatt Macy abd_t *src = rr->rr_col[c].rc_abd;
1171eda14cbcSMatt Macy
1172eda14cbcSMatt Macy uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
1173eda14cbcSMatt Macy
1174eda14cbcSMatt Macy if (c == rr->rr_firstdatacol) {
1175eda14cbcSMatt Macy ASSERT(ccnt == pcnt || ccnt == 0);
11767877fdebSMatt Macy abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1177eda14cbcSMatt Macy (void) memcpy(q, p, rr->rr_col[c].rc_size);
1178eda14cbcSMatt Macy (void) memcpy(r, p, rr->rr_col[c].rc_size);
1179eda14cbcSMatt Macy
1180eda14cbcSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1181eda14cbcSMatt Macy p[i] = 0;
11827877fdebSMatt Macy q[i] = 0;
1183eda14cbcSMatt Macy r[i] = 0;
1184eda14cbcSMatt Macy }
1185eda14cbcSMatt Macy } else {
1186eda14cbcSMatt Macy struct pqr_struct pqr = { p, q, r };
1187eda14cbcSMatt Macy
1188eda14cbcSMatt Macy ASSERT(ccnt <= pcnt);
1189eda14cbcSMatt Macy (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
1190eda14cbcSMatt Macy vdev_raidz_pqr_func, &pqr);
1191eda14cbcSMatt Macy
1192eda14cbcSMatt Macy /*
1193eda14cbcSMatt Macy * Treat short columns as though they are full of 0s.
1194eda14cbcSMatt Macy * Note that there's therefore nothing needed for P.
1195eda14cbcSMatt Macy */
1196eda14cbcSMatt Macy uint64_t mask;
1197eda14cbcSMatt Macy for (uint64_t i = ccnt; i < pcnt; i++) {
1198eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(q[i], mask);
1199eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_4(r[i], mask);
1200eda14cbcSMatt Macy }
1201eda14cbcSMatt Macy }
1202eda14cbcSMatt Macy }
1203eda14cbcSMatt Macy }
1204eda14cbcSMatt Macy
1205eda14cbcSMatt Macy /*
12067877fdebSMatt Macy * Generate RAID parity in the first virtual columns according to the number of
1207eda14cbcSMatt Macy * parity columns available.
1208eda14cbcSMatt Macy */
1209eda14cbcSMatt Macy void
1210eda14cbcSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
1211eda14cbcSMatt Macy {
1212eda14cbcSMatt Macy if (rr->rr_cols == 0) {
1213eda14cbcSMatt Macy /*
1214eda14cbcSMatt Macy * We are handling this block one row at a time (because
1215eda14cbcSMatt Macy * this block has a different logical vs physical width,
1216eda14cbcSMatt Macy * due to RAIDZ expansion), and this is a pad-only row,
1217eda14cbcSMatt Macy * which has no parity.
12187877fdebSMatt Macy */
1219eda14cbcSMatt Macy return;
1220eda14cbcSMatt Macy }
12217877fdebSMatt Macy
1222eda14cbcSMatt Macy /* Generate using the new math implementation */
1223eda14cbcSMatt Macy if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
1224eda14cbcSMatt Macy return;
1225eda14cbcSMatt Macy
1226eda14cbcSMatt Macy switch (rr->rr_firstdatacol) {
1227eda14cbcSMatt Macy case 1:
1228eda14cbcSMatt Macy vdev_raidz_generate_parity_p(rr);
1229eda14cbcSMatt Macy break;
1230eda14cbcSMatt Macy case 2:
1231eda14cbcSMatt Macy vdev_raidz_generate_parity_pq(rr);
1232eda14cbcSMatt Macy break;
1233eda14cbcSMatt Macy case 3:
1234eda14cbcSMatt Macy vdev_raidz_generate_parity_pqr(rr);
1235eda14cbcSMatt Macy break;
1236eda14cbcSMatt Macy default:
1237eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration");
12387877fdebSMatt Macy }
12397877fdebSMatt Macy }
1240eda14cbcSMatt Macy
1241eda14cbcSMatt Macy void
1242eda14cbcSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
1243eda14cbcSMatt Macy {
1244eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
1245eda14cbcSMatt Macy raidz_row_t *rr = rm->rm_row[i];
1246eda14cbcSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
1247eda14cbcSMatt Macy }
1248eda14cbcSMatt Macy }
1249eda14cbcSMatt Macy
1250eda14cbcSMatt Macy static int
1251eda14cbcSMatt Macy vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1252eda14cbcSMatt Macy {
1253eda14cbcSMatt Macy (void) private;
1254eda14cbcSMatt Macy uint64_t *dst = dbuf;
1255eda14cbcSMatt Macy uint64_t *src = sbuf;
1256eda14cbcSMatt Macy int cnt = size / sizeof (src[0]);
1257eda14cbcSMatt Macy
1258eda14cbcSMatt Macy for (int i = 0; i < cnt; i++) {
1259eda14cbcSMatt Macy dst[i] ^= src[i];
1260eda14cbcSMatt Macy }
1261eda14cbcSMatt Macy
1262eda14cbcSMatt Macy return (0);
1263eda14cbcSMatt Macy }
1264eda14cbcSMatt Macy
1265eda14cbcSMatt Macy static int
1266eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1267eda14cbcSMatt Macy void *private)
1268eda14cbcSMatt Macy {
1269eda14cbcSMatt Macy (void) private;
1270eda14cbcSMatt Macy uint64_t *dst = dbuf;
1271eda14cbcSMatt Macy uint64_t *src = sbuf;
1272eda14cbcSMatt Macy uint64_t mask;
1273eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]);
1274eda14cbcSMatt Macy
1275eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, src++) {
1276eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask);
1277eda14cbcSMatt Macy *dst ^= *src;
1278eda14cbcSMatt Macy }
1279eda14cbcSMatt Macy
1280eda14cbcSMatt Macy return (0);
1281eda14cbcSMatt Macy }
1282eda14cbcSMatt Macy
1283eda14cbcSMatt Macy static int
1284eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1285eda14cbcSMatt Macy {
1286eda14cbcSMatt Macy (void) private;
1287eda14cbcSMatt Macy uint64_t *dst = buf;
1288eda14cbcSMatt Macy uint64_t mask;
1289eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]);
1290eda14cbcSMatt Macy
1291eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++) {
1292eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1293eda14cbcSMatt Macy VDEV_RAIDZ_64MUL_2(*dst, mask);
1294eda14cbcSMatt Macy }
1295eda14cbcSMatt Macy
1296eda14cbcSMatt Macy return (0);
1297eda14cbcSMatt Macy }
1298eda14cbcSMatt Macy
1299eda14cbcSMatt Macy struct reconst_q_struct {
13007877fdebSMatt Macy uint64_t *q;
1301eda14cbcSMatt Macy int exp;
1302eda14cbcSMatt Macy };
1303eda14cbcSMatt Macy
1304eda14cbcSMatt Macy static int
1305eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1306eda14cbcSMatt Macy {
1307eda14cbcSMatt Macy struct reconst_q_struct *rq = private;
1308eda14cbcSMatt Macy uint64_t *dst = buf;
1309eda14cbcSMatt Macy int cnt = size / sizeof (dst[0]);
1310eda14cbcSMatt Macy
1311eda14cbcSMatt Macy for (int i = 0; i < cnt; i++, dst++, rq->q++) {
1312eda14cbcSMatt Macy int j;
1313eda14cbcSMatt Macy uint8_t *b;
1314eda14cbcSMatt Macy
1315eda14cbcSMatt Macy *dst ^= *rq->q;
1316eda14cbcSMatt Macy for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1317eda14cbcSMatt Macy *b = vdev_raidz_exp2(*b, rq->exp);
1318eda14cbcSMatt Macy }
1319eda14cbcSMatt Macy }
1320eda14cbcSMatt Macy
1321eda14cbcSMatt Macy return (0);
1322eda14cbcSMatt Macy }
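
/*
 * vdev_raidz_exp2() used above multiplies a byte by 2^exp via the
 * log/antilog tables. A general GF(2^8) multiply works the same way; the
 * sketch below is illustrative only (the code above keeps one operand in
 * log form instead, saving a table lookup per byte):
 */
static inline uint8_t
vdev_raidz_sketch_gfmul(uint8_t a, uint8_t b)
{
	int l;

	/* Zero has no logarithm, and 0 * b = 0. */
	if (a == 0 || b == 0)
		return (0);

	/* Multiplication is addition of logs, modulo the group order 255. */
	l = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (l > 255)
		l -= 255;
	return (vdev_raidz_pow2[l]);
}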
1323eda14cbcSMatt Macy
1324eda14cbcSMatt Macy struct reconst_pq_struct {
1325eda14cbcSMatt Macy uint8_t *p;
1326eda14cbcSMatt Macy uint8_t *q;
1327eda14cbcSMatt Macy uint8_t *pxy;
1328eda14cbcSMatt Macy uint8_t *qxy;
1329eda14cbcSMatt Macy int aexp;
1330eda14cbcSMatt Macy int bexp;
1331eda14cbcSMatt Macy };
13327877fdebSMatt Macy
1333eda14cbcSMatt Macy static int
13347877fdebSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
13357877fdebSMatt Macy {
13367877fdebSMatt Macy struct reconst_pq_struct *rpq = private;
13377877fdebSMatt Macy uint8_t *xd = xbuf;
13387877fdebSMatt Macy uint8_t *yd = ybuf;
1339eda14cbcSMatt Macy
13407877fdebSMatt Macy for (int i = 0; i < size;
13417877fdebSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
13427877fdebSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1343eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1344eda14cbcSMatt Macy *yd = *rpq->p ^ *rpq->pxy ^ *xd;
13457877fdebSMatt Macy }
13467877fdebSMatt Macy
13477877fdebSMatt Macy return (0);
1348eda14cbcSMatt Macy }
1349eda14cbcSMatt Macy
1350eda14cbcSMatt Macy static int
1351eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1352eda14cbcSMatt Macy {
1353eda14cbcSMatt Macy struct reconst_pq_struct *rpq = private;
1354eda14cbcSMatt Macy uint8_t *xd = xbuf;
1355eda14cbcSMatt Macy
1356eda14cbcSMatt Macy for (int i = 0; i < size;
1357eda14cbcSMatt Macy i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1358eda14cbcSMatt Macy /* same operation as vdev_raidz_reconst_pq_func() on xd */
1359eda14cbcSMatt Macy *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1360eda14cbcSMatt Macy vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1361eda14cbcSMatt Macy }
1362eda14cbcSMatt Macy
1363eda14cbcSMatt Macy return (0);
1364eda14cbcSMatt Macy }
1365eda14cbcSMatt Macy
1366eda14cbcSMatt Macy static void
1367eda14cbcSMatt Macy vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
1368eda14cbcSMatt Macy {
1369eda14cbcSMatt Macy int x = tgts[0];
1370eda14cbcSMatt Macy abd_t *dst, *src;
1371eda14cbcSMatt Macy
1372eda14cbcSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1373eda14cbcSMatt Macy zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1374eda14cbcSMatt Macy
1375eda14cbcSMatt Macy ASSERT3U(ntgts, ==, 1);
1376eda14cbcSMatt Macy ASSERT3U(x, >=, rr->rr_firstdatacol);
1377eda14cbcSMatt Macy ASSERT3U(x, <, rr->rr_cols);
13787877fdebSMatt Macy
1379eda14cbcSMatt Macy ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
1380eda14cbcSMatt Macy
1381eda14cbcSMatt Macy src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1382eda14cbcSMatt Macy dst = rr->rr_col[x].rc_abd;
1383eda14cbcSMatt Macy
1384eda14cbcSMatt Macy abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
1385eda14cbcSMatt Macy
1386eda14cbcSMatt Macy for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1387eda14cbcSMatt Macy uint64_t size = MIN(rr->rr_col[x].rc_size,
1388eda14cbcSMatt Macy rr->rr_col[c].rc_size);
1389eda14cbcSMatt Macy
1390eda14cbcSMatt Macy src = rr->rr_col[c].rc_abd;
1391eda14cbcSMatt Macy
1392eda14cbcSMatt Macy if (c == x)
1393eda14cbcSMatt Macy continue;
1394eda14cbcSMatt Macy
1395eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size,
13967877fdebSMatt Macy vdev_raidz_reconst_p_func, NULL);
1397eda14cbcSMatt Macy }
13987877fdebSMatt Macy }
13997877fdebSMatt Macy
14007877fdebSMatt Macy static void
14017877fdebSMatt Macy vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
1402eda14cbcSMatt Macy {
14037877fdebSMatt Macy int x = tgts[0];
14047877fdebSMatt Macy int c, exp;
1405eda14cbcSMatt Macy abd_t *dst, *src;
1406eda14cbcSMatt Macy
14077877fdebSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
14087877fdebSMatt Macy zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
14097877fdebSMatt Macy
14107877fdebSMatt Macy ASSERT(ntgts == 1);
14117877fdebSMatt Macy
1412eda14cbcSMatt Macy ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1413eda14cbcSMatt Macy
1414eda14cbcSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
14157877fdebSMatt Macy uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
14167877fdebSMatt Macy rr->rr_col[c].rc_size);
14177877fdebSMatt Macy
14187877fdebSMatt Macy src = rr->rr_col[c].rc_abd;
14197877fdebSMatt Macy dst = rr->rr_col[x].rc_abd;
1420eda14cbcSMatt Macy
1421eda14cbcSMatt Macy if (c == rr->rr_firstdatacol) {
1422eda14cbcSMatt Macy abd_copy(dst, src, size);
1423eda14cbcSMatt Macy if (rr->rr_col[x].rc_size > size) {
1424eda14cbcSMatt Macy abd_zero_off(dst, size,
1425eda14cbcSMatt Macy rr->rr_col[x].rc_size - size);
14267877fdebSMatt Macy }
1427eda14cbcSMatt Macy } else {
14287877fdebSMatt Macy ASSERT3U(size, <=, rr->rr_col[x].rc_size);
1429eda14cbcSMatt Macy (void) abd_iterate_func2(dst, src, 0, 0, size,
1430eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_func, NULL);
1431eda14cbcSMatt Macy (void) abd_iterate_func(dst,
1432eda14cbcSMatt Macy size, rr->rr_col[x].rc_size - size,
1433eda14cbcSMatt Macy vdev_raidz_reconst_q_pre_tail_func, NULL);
1434eda14cbcSMatt Macy }
1435eda14cbcSMatt Macy }
1436eda14cbcSMatt Macy
1437eda14cbcSMatt Macy src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14387877fdebSMatt Macy dst = rr->rr_col[x].rc_abd;
1439eda14cbcSMatt Macy exp = 255 - (rr->rr_cols - 1 - x);
1440eda14cbcSMatt Macy
1441eda14cbcSMatt Macy struct reconst_q_struct rq = { abd_to_buf(src), exp };
1442eda14cbcSMatt Macy (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
1443eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func, &rq);
1444eda14cbcSMatt Macy }
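
/*
 * The exponent chosen above follows from the definition of Q: the loop
 * leaves dst holding Qx, i.e. Q recomputed as though column x were zero,
 * and
 *
 *	Q + Qx = 2^(ndevs - 1 - x) * D_x
 *
 * Because the multiplicative group of GF(2^8) has order 255, dividing by
 * 2^(ndevs - 1 - x) is the same as multiplying by
 * 2^(255 - (ndevs - 1 - x)), which is what
 * vdev_raidz_reconst_q_post_func() applies after XORing Q into dst.
 */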
1445eda14cbcSMatt Macy
1446eda14cbcSMatt Macy static void
1447eda14cbcSMatt Macy vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
1448eda14cbcSMatt Macy {
1449eda14cbcSMatt Macy uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1450eda14cbcSMatt Macy abd_t *pdata, *qdata;
1451eda14cbcSMatt Macy uint64_t xsize, ysize;
1452eda14cbcSMatt Macy int x = tgts[0];
1453eda14cbcSMatt Macy int y = tgts[1];
1454eda14cbcSMatt Macy abd_t *xd, *yd;
1455eda14cbcSMatt Macy
1456eda14cbcSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1457eda14cbcSMatt Macy zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1458eda14cbcSMatt Macy
1459eda14cbcSMatt Macy ASSERT(ntgts == 2);
1460eda14cbcSMatt Macy ASSERT(x < y);
1461eda14cbcSMatt Macy ASSERT(x >= rr->rr_firstdatacol);
1462eda14cbcSMatt Macy ASSERT(y < rr->rr_cols);
1463eda14cbcSMatt Macy
1464eda14cbcSMatt Macy ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
1465eda14cbcSMatt Macy
1466eda14cbcSMatt Macy /*
1467eda14cbcSMatt Macy * Move the parity data aside -- we're going to compute parity as
1468eda14cbcSMatt Macy * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1469eda14cbcSMatt Macy * reuse the parity generation mechanism without trashing the actual
1470eda14cbcSMatt Macy * parity so we make those columns appear to be full of zeros by
1471eda14cbcSMatt Macy * setting their lengths to zero.
1472eda14cbcSMatt Macy */
14737877fdebSMatt Macy pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1474eda14cbcSMatt Macy qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
14757877fdebSMatt Macy xsize = rr->rr_col[x].rc_size;
1476eda14cbcSMatt Macy ysize = rr->rr_col[y].rc_size;
1477eda14cbcSMatt Macy
1478eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1479eda14cbcSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1480eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1481eda14cbcSMatt Macy abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1482eda14cbcSMatt Macy rr->rr_col[x].rc_size = 0;
1483eda14cbcSMatt Macy rr->rr_col[y].rc_size = 0;
1484eda14cbcSMatt Macy
1485eda14cbcSMatt Macy vdev_raidz_generate_parity_pq(rr);
1486eda14cbcSMatt Macy
1487eda14cbcSMatt Macy rr->rr_col[x].rc_size = xsize;
14887877fdebSMatt Macy rr->rr_col[y].rc_size = ysize;
1489eda14cbcSMatt Macy
1490eda14cbcSMatt Macy p = abd_to_buf(pdata);
1491eda14cbcSMatt Macy q = abd_to_buf(qdata);
1492eda14cbcSMatt Macy pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
14937877fdebSMatt Macy qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1494eda14cbcSMatt Macy xd = rr->rr_col[x].rc_abd;
1495eda14cbcSMatt Macy yd = rr->rr_col[y].rc_abd;
1496eda14cbcSMatt Macy
1497eda14cbcSMatt Macy /*
1498eda14cbcSMatt Macy * We now have:
14997877fdebSMatt Macy * Pxy = P + D_x + D_y
1500eda14cbcSMatt Macy * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1501eda14cbcSMatt Macy *
1502eda14cbcSMatt Macy * We can then solve for D_x:
1503eda14cbcSMatt Macy * D_x = A * (P + Pxy) + B * (Q + Qxy)
1504eda14cbcSMatt Macy * where
1505eda14cbcSMatt Macy * A = 2^(x - y) * (2^(x - y) + 1)^-1
1506eda14cbcSMatt Macy * B = (2^(ndevs - 1 - x))^-1 * (2^(x - y) + 1)^-1
1507eda14cbcSMatt Macy *
15087877fdebSMatt Macy * With D_x in hand, we can easily solve for D_y:
15097877fdebSMatt Macy * D_y = P + Pxy + D_x
1510eda14cbcSMatt Macy */
15117877fdebSMatt Macy
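	/*
	 * Deriving the constants (a sketch): write a = 2^(x - y) and let
	 * wx = 2^(ndevs - 1 - x) and wy = 2^(ndevs - 1 - y) be the Q
	 * weights of columns x and y. Substituting D_y = (P + Pxy) + D_x
	 * into the Qxy equation gives
	 *
	 *	Q + Qxy = (wx + wy) * D_x + wy * (P + Pxy)
	 *
	 * and since wx + wy = wx * (wy/wx + 1) = wx * (a + 1),
	 *
	 *	A = (wy/wx) * (a + 1)^-1 = a * (a + 1)^-1
	 *	B = wx^-1 * (a + 1)^-1
	 *
	 * In the code below, a ^ 1 is the GF(2^8) sum a + 1, and
	 * 255 - vdev_raidz_log2[v] is the log of v^-1.
	 */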
1512eda14cbcSMatt Macy a = vdev_raidz_pow2[255 + x - y];
1513eda14cbcSMatt Macy b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
15147877fdebSMatt Macy tmp = 255 - vdev_raidz_log2[a ^ 1];
1515eda14cbcSMatt Macy
1516eda14cbcSMatt Macy aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
15177877fdebSMatt Macy bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1518eda14cbcSMatt Macy
1519eda14cbcSMatt Macy ASSERT3U(xsize, >=, ysize);
1520eda14cbcSMatt Macy struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1521eda14cbcSMatt Macy
1522eda14cbcSMatt Macy (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
15237877fdebSMatt Macy vdev_raidz_reconst_pq_func, &rpq);
15247877fdebSMatt Macy (void) abd_iterate_func(xd, ysize, xsize - ysize,
15257877fdebSMatt Macy vdev_raidz_reconst_pq_tail_func, &rpq);
1526eda14cbcSMatt Macy
1527eda14cbcSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1528eda14cbcSMatt Macy abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1529eda14cbcSMatt Macy
1530eda14cbcSMatt Macy /*
1531eda14cbcSMatt Macy * Restore the saved parity data.
1532eda14cbcSMatt Macy */
1533eda14cbcSMatt Macy rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
15347877fdebSMatt Macy rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
15357877fdebSMatt Macy }
1536eda14cbcSMatt Macy
15377877fdebSMatt Macy /*
15387877fdebSMatt Macy * In the general case of reconstruction, we must solve the system of linear
1539eda14cbcSMatt Macy * equations defined by the coefficients used to generate parity as well as
1540eda14cbcSMatt Macy * the contents of the data and parity disks. This can be expressed with
1541eda14cbcSMatt Macy * vectors for the original data (D) and the actual data (d) and parity (p)
1542eda14cbcSMatt Macy * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1543eda14cbcSMatt Macy *
15447877fdebSMatt Macy *            __   __                     __     __
1545eda14cbcSMatt Macy *            |     |       __     __     |  p_0  |
15467877fdebSMatt Macy *            |  V  |       |  D_0  |     | p_m-1 |
1547eda14cbcSMatt Macy *            |     |   x   |   :   |  =  |  d_0  |
1548eda14cbcSMatt Macy *            |  I  |       | D_n-1 |     |   :   |
1549eda14cbcSMatt Macy *            |     |       ~~     ~~     | d_n-1 |
1550eda14cbcSMatt Macy *            ~~   ~~                     ~~     ~~
1551eda14cbcSMatt Macy *
1552eda14cbcSMatt Macy * I is simply a square identity matrix of size n, and V is a Vandermonde
1553eda14cbcSMatt Macy * matrix defined by the coefficients we chose for the various parity columns
1554eda14cbcSMatt Macy * (1, 2, 4). Note that these values were chosen for simplicity and speedy
1555eda14cbcSMatt Macy * computation, as well as for linear separability.
1556eda14cbcSMatt Macy *
1557eda14cbcSMatt Macy *      __               __               __     __
1558eda14cbcSMatt Macy *      |   1   ..  1 1 1 |               |  p_0  |
1559eda14cbcSMatt Macy *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1560eda14cbcSMatt Macy *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
15617877fdebSMatt Macy *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1562eda14cbcSMatt Macy *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1563eda14cbcSMatt Macy *      |   :       : : : |   |   :   |   |  d_2  |
1564eda14cbcSMatt Macy *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1565eda14cbcSMatt Macy *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1566eda14cbcSMatt Macy *      |   0   ..  0 0 1 |               | d_n-1 |
1567eda14cbcSMatt Macy *      ~~               ~~               ~~     ~~
1568eda14cbcSMatt Macy *
1569eda14cbcSMatt Macy * Note that I, V, d, and p are known. To compute D, we must invert the
1570eda14cbcSMatt Macy * matrix and use the known data and parity values to reconstruct the unknown
15717877fdebSMatt Macy * data values. We begin by removing the rows in V|I and d|p that correspond
1572eda14cbcSMatt Macy * to failed or missing columns; we then make V|I square (n x n) and d|p
15737877fdebSMatt Macy * sized n by removing rows corresponding to unused parity from the bottom up
1574eda14cbcSMatt Macy * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1575eda14cbcSMatt Macy * using Gauss-Jordan elimination. In the example below we use m=3 parity
15767877fdebSMatt Macy * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1577eda14cbcSMatt Macy *              __                               __
15787877fdebSMatt Macy *              |   1   1   1   1   1   1   1   1  |
1579eda14cbcSMatt Macy *              | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
1580eda14cbcSMatt Macy *              |  19 205 116  29  64  16   4   1  |      / /
1581eda14cbcSMatt Macy *              |   1   0   0   0   0   0   0   0  |     / /
15827877fdebSMatt Macy *              |   0   1   0   0   0   0   0   0  | <--'  /
1583eda14cbcSMatt Macy *  (V|I)     = |   0   0   1   0   0   0   0   0  | <---'
1584eda14cbcSMatt Macy *              |   0   0   0   1   0   0   0   0  |
1585eda14cbcSMatt Macy *              |   0   0   0   0   1   0   0   0  |
15867877fdebSMatt Macy *              |   0   0   0   0   0   1   0   0  |
1587eda14cbcSMatt Macy *              |   0   0   0   0   0   0   1   0  |
15887877fdebSMatt Macy *              |   0   0   0   0   0   0   0   1  |
1589eda14cbcSMatt Macy *              ~~                               ~~
1590eda14cbcSMatt Macy *              __                               __
1591eda14cbcSMatt Macy *              |   1   1   1   1   1   1   1   1  |
1592eda14cbcSMatt Macy *              |  19 205 116  29  64  16   4   1  |
15937877fdebSMatt Macy *              |   1   0   0   0   0   0   0   0  |
1594eda14cbcSMatt Macy *              |   0   0   0   1   0   0   0   0  |
1595eda14cbcSMatt Macy *  (V|I)'    = |   0   0   0   0   1   0   0   0  |
1596eda14cbcSMatt Macy *              |   0   0   0   0   0   1   0   0  |
1597eda14cbcSMatt Macy *              |   0   0   0   0   0   0   1   0  |
1598eda14cbcSMatt Macy *              |   0   0   0   0   0   0   0   1  |
1599eda14cbcSMatt Macy *              ~~                               ~~
16037877fdebSMatt Macy *
16047877fdebSMatt Macy * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1605eda14cbcSMatt Macy * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1606eda14cbcSMatt Macy * matrix is not singular.
1607eda14cbcSMatt Macy * __                                                                    __
1608eda14cbcSMatt Macy * |   1   1   1   1   1   1   1   1       1   0   0   0   0   0   0   0  |
1609eda14cbcSMatt Macy * |  19 205 116  29  64  16   4   1       0   1   0   0   0   0   0   0  |
1610eda14cbcSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
1611eda14cbcSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
1612eda14cbcSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
1613eda14cbcSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1614eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1615eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1616eda14cbcSMatt Macy * ~~                                                                    ~~
1617eda14cbcSMatt Macy * __                                                                    __
1618eda14cbcSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
1619eda14cbcSMatt Macy * |   1   1   1   1   1   1   1   1       1   0   0   0   0   0   0   0  |
16207877fdebSMatt Macy * |  19 205 116  29  64  16   4   1       0   1   0   0   0   0   0   0  |
1621eda14cbcSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
1622eda14cbcSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
1623eda14cbcSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1624eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1625eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1626eda14cbcSMatt Macy * ~~                                                                    ~~
1627eda14cbcSMatt Macy * __                                                                    __
1628eda14cbcSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
1629eda14cbcSMatt Macy * |   0   1   1   0   0   0   0   0       1   0   1   1   1   1   1   1  |
1630eda14cbcSMatt Macy * |   0 205 116   0   0   0   0   0       0   1  19  29  64  16   4   1  |
1631eda14cbcSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
1632eda14cbcSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
1633eda14cbcSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1634eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1635eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1636eda14cbcSMatt Macy * ~~                                                                    ~~
1637eda14cbcSMatt Macy * __                                                                    __
1638eda14cbcSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
1639eda14cbcSMatt Macy * |   0   1   1   0   0   0   0   0       1   0   1   1   1   1   1   1  |
1640eda14cbcSMatt Macy * |   0   0 185   0   0   0   0   0     205   1 222 208 141 221 201 204  |
1641eda14cbcSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
1642eda14cbcSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
1643eda14cbcSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1644eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1645eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1646eda14cbcSMatt Macy * ~~                                                                    ~~
1647eda14cbcSMatt Macy * __                                                                    __
1648eda14cbcSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
16497877fdebSMatt Macy * |   0   1   1   0   0   0   0   0       1   0   1   1   1   1   1   1  |
16507877fdebSMatt Macy * |   0   0   1   0   0   0   0   0     166 100   4  40 158 168 216 209  |
1651eda14cbcSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
1652eda14cbcSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
16537877fdebSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1654eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1655eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1656eda14cbcSMatt Macy * ~~                                                                    ~~
1657eda14cbcSMatt Macy * __                                                                    __
16587877fdebSMatt Macy * |   1   0   0   0   0   0   0   0       0   0   1   0   0   0   0   0  |
1659eda14cbcSMatt Macy * |   0   1   0   0   0   0   0   0     167 100   5  41 159 169 217 208  |
1660eda14cbcSMatt Macy * |   0   0   1   0   0   0   0   0     166 100   4  40 158 168 216 209  |
16617877fdebSMatt Macy * |   0   0   0   1   0   0   0   0       0   0   0   1   0   0   0   0  |
16627877fdebSMatt Macy * |   0   0   0   0   1   0   0   0       0   0   0   0   1   0   0   0  |
1663eda14cbcSMatt Macy * |   0   0   0   0   0   1   0   0       0   0   0   0   0   1   0   0  |
1664eda14cbcSMatt Macy * |   0   0   0   0   0   0   1   0       0   0   0   0   0   0   1   0  |
1665eda14cbcSMatt Macy * |   0   0   0   0   0   0   0   1       0   0   0   0   0   0   0   1  |
1666eda14cbcSMatt Macy * ~~                                                                    ~~
1667eda14cbcSMatt Macy *              __                               __
1668eda14cbcSMatt Macy *              |   0   0   1   0   0   0   0   0  |
1669eda14cbcSMatt Macy *              | 167 100   5  41 159 169 217 208  |
1670eda14cbcSMatt Macy *              | 166 100   4  40 158 168 216 209  |
16717877fdebSMatt Macy *  (V|I)'^-1 = |   0   0   0   1   0   0   0   0  |
16727877fdebSMatt Macy *              |   0   0   0   0   1   0   0   0  |
16737877fdebSMatt Macy *              |   0   0   0   0   0   1   0   0  |
16747877fdebSMatt Macy *              |   0   0   0   0   0   0   1   0  |
16757877fdebSMatt Macy *              |   0   0   0   0   0   0   0   1  |
16767877fdebSMatt Macy *              ~~                               ~~
16777877fdebSMatt Macy *
16787877fdebSMatt Macy * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
16797877fdebSMatt Macy * of the missing data.
16807877fdebSMatt Macy *
16817877fdebSMatt Macy * As is apparent from the example above, the only non-trivial rows in the
16827877fdebSMatt Macy * inverse matrix correspond to the data disks that we're trying to
1683eda14cbcSMatt Macy * reconstruct. Indeed, those are the only rows we need as the others would
1684eda14cbcSMatt Macy * only be useful for reconstructing data known or assumed to be valid. For
1685eda14cbcSMatt Macy * that reason, we only build the coefficients in the rows that correspond to
1686eda14cbcSMatt Macy * targeted columns.
1687eda14cbcSMatt Macy */
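
/*
 * The elimination below relies on two GF(2^8) facts: addition is XOR, so
 * "subtracting" one row from another is an XOR, and every nonzero element
 * a has the multiplicative inverse 2^(255 - log2(a)) because the
 * multiplicative group has order 255. An illustrative helper (not used by
 * the code below, which works in log form directly):
 */
static inline uint8_t
vdev_raidz_sketch_gfinv(uint8_t a)
{
	/* The log table is only defined for nonzero elements. */
	ASSERT3U(a, !=, 0);
	return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
}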
1688eda14cbcSMatt Macy
1689eda14cbcSMatt Macy static void
1690eda14cbcSMatt Macy vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1691eda14cbcSMatt Macy uint8_t **rows)
1692eda14cbcSMatt Macy {
16937877fdebSMatt Macy int i, j;
1694eda14cbcSMatt Macy int pow;
1695eda14cbcSMatt Macy
1696eda14cbcSMatt Macy ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1697eda14cbcSMatt Macy
16987877fdebSMatt Macy /*
16997877fdebSMatt Macy * Fill in the missing rows of interest.
1700eda14cbcSMatt Macy */
17017877fdebSMatt Macy for (i = 0; i < nmap; i++) {
1702eda14cbcSMatt Macy ASSERT3S(0, <=, map[i]);
17037877fdebSMatt Macy ASSERT3S(map[i], <=, 2);
1704eda14cbcSMatt Macy
1705eda14cbcSMatt Macy pow = map[i] * n;
17067877fdebSMatt Macy if (pow > 255)
17077877fdebSMatt Macy pow -= 255;
1708eda14cbcSMatt Macy ASSERT(pow <= 255);
1709eda14cbcSMatt Macy
1710eda14cbcSMatt Macy for (j = 0; j < n; j++) {
1711eda14cbcSMatt Macy pow -= map[i];
1712eda14cbcSMatt Macy if (pow < 0)
1713eda14cbcSMatt Macy pow += 255;
1714eda14cbcSMatt Macy rows[i][j] = vdev_raidz_pow2[pow];
1715eda14cbcSMatt Macy }
1716eda14cbcSMatt Macy }
1717eda14cbcSMatt Macy }
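
/*
 * For example, with n = 4 data columns the loop above produces the Q row
 * (map[i] = 1) as { 2^3, 2^2, 2^1, 2^0 } = { 8, 4, 2, 1 } and the R row
 * (map[i] = 2) as { 4^3, 4^2, 4^1, 4^0 } = { 64, 16, 4, 1 }, i.e. exactly
 * the Vandermonde rows pictured in the block comment above.
 */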
1718eda14cbcSMatt Macy
1719eda14cbcSMatt Macy static void
1720eda14cbcSMatt Macy vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1721eda14cbcSMatt Macy uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1722eda14cbcSMatt Macy {
1723eda14cbcSMatt Macy int i, j, ii, jj;
1724eda14cbcSMatt Macy uint8_t log;
17257877fdebSMatt Macy
17267877fdebSMatt Macy /*
17277877fdebSMatt Macy * Assert that the first nmissing entries from the array of used
17287877fdebSMatt Macy * columns correspond to parity columns and that subsequent entries
17297877fdebSMatt Macy * correspond to data columns.
17307877fdebSMatt Macy */
17317877fdebSMatt Macy for (i = 0; i < nmissing; i++) {
17327877fdebSMatt Macy ASSERT3S(used[i], <, rr->rr_firstdatacol);
17337877fdebSMatt Macy }
17347877fdebSMatt Macy for (; i < n; i++) {
17357877fdebSMatt Macy ASSERT3S(used[i], >=, rr->rr_firstdatacol);
17367877fdebSMatt Macy }
17377877fdebSMatt Macy
17387877fdebSMatt Macy /*
17397877fdebSMatt Macy * First initialize the storage where we'll compute the inverse rows.
17407877fdebSMatt Macy */
17417877fdebSMatt Macy for (i = 0; i < nmissing; i++) {
17427877fdebSMatt Macy for (j = 0; j < n; j++) {
17437877fdebSMatt Macy invrows[i][j] = (i == j) ? 1 : 0;
17447877fdebSMatt Macy }
17457877fdebSMatt Macy }
17467877fdebSMatt Macy
17477877fdebSMatt Macy /*
17487877fdebSMatt Macy * Subtract all trivial rows from the rows of consequence.
17497877fdebSMatt Macy */
17507877fdebSMatt Macy for (i = 0; i < nmissing; i++) {
17517877fdebSMatt Macy for (j = nmissing; j < n; j++) {
17527877fdebSMatt Macy ASSERT3U(used[j], >=, rr->rr_firstdatacol);
17537877fdebSMatt Macy jj = used[j] - rr->rr_firstdatacol;
17547877fdebSMatt Macy ASSERT3S(jj, <, n);
17557877fdebSMatt Macy invrows[i][j] = rows[i][jj];
17567877fdebSMatt Macy rows[i][jj] = 0;
17577877fdebSMatt Macy }
17587877fdebSMatt Macy }
17597877fdebSMatt Macy
17607877fdebSMatt Macy /*
17617877fdebSMatt Macy * For each of the rows of interest, we must normalize it and subtract
17627877fdebSMatt Macy * a multiple of it from the other rows.
17637877fdebSMatt Macy */
17647877fdebSMatt Macy for (i = 0; i < nmissing; i++) {
17657877fdebSMatt Macy for (j = 0; j < missing[i]; j++) {
17667877fdebSMatt Macy ASSERT0(rows[i][j]);
17677877fdebSMatt Macy }
17687877fdebSMatt Macy ASSERT3U(rows[i][missing[i]], !=, 0);
17697877fdebSMatt Macy
17707877fdebSMatt Macy /*
17717877fdebSMatt Macy * Compute the inverse of the first element and multiply each
17727877fdebSMatt Macy * element in the row by that value.
17737877fdebSMatt Macy */
17747877fdebSMatt Macy log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
17757877fdebSMatt Macy
17767877fdebSMatt Macy for (j = 0; j < n; j++) {
17777877fdebSMatt Macy rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
17787877fdebSMatt Macy invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
17797877fdebSMatt Macy }
17807877fdebSMatt Macy
17817877fdebSMatt Macy for (ii = 0; ii < nmissing; ii++) {
17827877fdebSMatt Macy if (i == ii)
17837877fdebSMatt Macy continue;
17847877fdebSMatt Macy
17857877fdebSMatt Macy ASSERT3U(rows[ii][missing[i]], !=, 0);
17867877fdebSMatt Macy
17877877fdebSMatt Macy log = vdev_raidz_log2[rows[ii][missing[i]]];
17887877fdebSMatt Macy
17897877fdebSMatt Macy for (j = 0; j < n; j++) {
17907877fdebSMatt Macy rows[ii][j] ^=
17917877fdebSMatt Macy vdev_raidz_exp2(rows[i][j], log);
17927877fdebSMatt Macy invrows[ii][j] ^=
17937877fdebSMatt Macy vdev_raidz_exp2(invrows[i][j], log);
17947877fdebSMatt Macy }
17957877fdebSMatt Macy }
17967877fdebSMatt Macy }
17977877fdebSMatt Macy
17987877fdebSMatt Macy /*
17997877fdebSMatt Macy * Verify that the data that is left in the rows is properly part of
18007877fdebSMatt Macy * an identity matrix.
18017877fdebSMatt Macy */
18027877fdebSMatt Macy for (i = 0; i < nmissing; i++) {
18037877fdebSMatt Macy for (j = 0; j < n; j++) {
18047877fdebSMatt Macy if (j == missing[i]) {
18057877fdebSMatt Macy ASSERT3U(rows[i][j], ==, 1);
18067877fdebSMatt Macy } else {
18077877fdebSMatt Macy ASSERT0(rows[i][j]);
18087877fdebSMatt Macy }
18097877fdebSMatt Macy }
1810eda14cbcSMatt Macy }
1811eda14cbcSMatt Macy }
1812eda14cbcSMatt Macy
1813eda14cbcSMatt Macy static void
1814eda14cbcSMatt Macy vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1815eda14cbcSMatt Macy int *missing, uint8_t **invrows, const uint8_t *used)
1816eda14cbcSMatt Macy {
1817eda14cbcSMatt Macy int i, j, x, cc, c;
1818eda14cbcSMatt Macy uint8_t *src;
1819eda14cbcSMatt Macy uint64_t ccount;
1820eda14cbcSMatt Macy uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1821eda14cbcSMatt Macy uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1822eda14cbcSMatt Macy uint8_t log = 0;
1823eda14cbcSMatt Macy uint8_t val;
1824eda14cbcSMatt Macy int ll;
1825eda14cbcSMatt Macy uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1826eda14cbcSMatt Macy uint8_t *p, *pp;
1827eda14cbcSMatt Macy size_t psize;
1828eda14cbcSMatt Macy
1829eda14cbcSMatt Macy psize = sizeof (invlog[0][0]) * n * nmissing;
1830eda14cbcSMatt Macy p = kmem_alloc(psize, KM_SLEEP);
1831eda14cbcSMatt Macy
18327877fdebSMatt Macy for (pp = p, i = 0; i < nmissing; i++) {
1833eda14cbcSMatt Macy invlog[i] = pp;
1834eda14cbcSMatt Macy pp += n;
18357877fdebSMatt Macy }
18367877fdebSMatt Macy
1837eda14cbcSMatt Macy for (i = 0; i < nmissing; i++) {
18387877fdebSMatt Macy for (j = 0; j < n; j++) {
18397877fdebSMatt Macy ASSERT3U(invrows[i][j], !=, 0);
18407877fdebSMatt Macy invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
18417877fdebSMatt Macy }
18427877fdebSMatt Macy }
18437877fdebSMatt Macy
18447877fdebSMatt Macy for (i = 0; i < n; i++) {
18457877fdebSMatt Macy c = used[i];
18467877fdebSMatt Macy ASSERT3U(c, <, rr->rr_cols);
1847eda14cbcSMatt Macy
1848eda14cbcSMatt Macy ccount = rr->rr_col[c].rc_size;
18497877fdebSMatt Macy ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
18507877fdebSMatt Macy if (ccount == 0)
1851eda14cbcSMatt Macy continue;
18527877fdebSMatt Macy src = abd_to_buf(rr->rr_col[c].rc_abd);
1853eda14cbcSMatt Macy for (j = 0; j < nmissing; j++) {
1854eda14cbcSMatt Macy cc = missing[j] + rr->rr_firstdatacol;
1855eda14cbcSMatt Macy ASSERT3U(cc, >=, rr->rr_firstdatacol);
1856eda14cbcSMatt Macy ASSERT3U(cc, <, rr->rr_cols);
1857eda14cbcSMatt Macy ASSERT3U(cc, !=, c);
1858eda14cbcSMatt Macy
1859eda14cbcSMatt Macy dcount[j] = rr->rr_col[cc].rc_size;
1860eda14cbcSMatt Macy if (dcount[j] != 0)
1861eda14cbcSMatt Macy dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1862eda14cbcSMatt Macy }
1863eda14cbcSMatt Macy
1864eda14cbcSMatt Macy for (x = 0; x < ccount; x++, src++) {
1865eda14cbcSMatt Macy if (*src != 0)
18667877fdebSMatt Macy log = vdev_raidz_log2[*src];
18677877fdebSMatt Macy
1868eda14cbcSMatt Macy for (cc = 0; cc < nmissing; cc++) {
1869eda14cbcSMatt Macy if (x >= dcount[cc])
1870eda14cbcSMatt Macy continue;
1871eda14cbcSMatt Macy
1872eda14cbcSMatt Macy if (*src == 0) {
1873eda14cbcSMatt Macy val = 0;
18742c48331dSMatt Macy } else {
1875eda14cbcSMatt Macy if ((ll = log + invlog[cc][i]) >= 255)
1876eda14cbcSMatt Macy ll -= 255;
18772c48331dSMatt Macy val = vdev_raidz_pow2[ll];
18782c48331dSMatt Macy }
18792c48331dSMatt Macy
18802c48331dSMatt Macy if (i == 0)
18812c48331dSMatt Macy dst[cc][x] = val;
1882eda14cbcSMatt Macy else
1883eda14cbcSMatt Macy dst[cc][x] ^= val;
1884eda14cbcSMatt Macy }
1885eda14cbcSMatt Macy }
1886eda14cbcSMatt Macy }
1887eda14cbcSMatt Macy
1888eda14cbcSMatt Macy kmem_free(p, psize);
1889eda14cbcSMatt Macy }
1890eda14cbcSMatt Macy
1891eda14cbcSMatt Macy static void
1892eda14cbcSMatt Macy vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1893eda14cbcSMatt Macy {
1894eda14cbcSMatt Macy int i, c, t, tt;
1895eda14cbcSMatt Macy unsigned int n;
1896eda14cbcSMatt Macy unsigned int nmissing_rows;
1897eda14cbcSMatt Macy int missing_rows[VDEV_RAIDZ_MAXPARITY];
1898eda14cbcSMatt Macy int parity_map[VDEV_RAIDZ_MAXPARITY];
1899eda14cbcSMatt Macy uint8_t *p, *pp;
1900eda14cbcSMatt Macy size_t psize;
1901eda14cbcSMatt Macy uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1902eda14cbcSMatt Macy uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1903eda14cbcSMatt Macy uint8_t *used;
1904eda14cbcSMatt Macy
1905eda14cbcSMatt Macy abd_t **bufs = NULL;
1906eda14cbcSMatt Macy
1907eda14cbcSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
19087877fdebSMatt Macy zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
1909eda14cbcSMatt Macy /*
1910eda14cbcSMatt Macy * Matrix reconstruction can't use scatter ABDs yet, so we allocate
19117877fdebSMatt Macy * temporary linear ABDs if any non-linear ABDs are found.
1912eda14cbcSMatt Macy */
1913eda14cbcSMatt Macy for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1914eda14cbcSMatt Macy ASSERT(rr->rr_col[i].rc_abd != NULL);
19157877fdebSMatt Macy if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1916eda14cbcSMatt Macy bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1917eda14cbcSMatt Macy KM_PUSHPAGE);
1918eda14cbcSMatt Macy
1919eda14cbcSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1920eda14cbcSMatt Macy raidz_col_t *col = &rr->rr_col[c];
1921eda14cbcSMatt Macy
1922eda14cbcSMatt Macy bufs[c] = col->rc_abd;
1923eda14cbcSMatt Macy if (bufs[c] != NULL) {
1924eda14cbcSMatt Macy col->rc_abd = abd_alloc_linear(
19257877fdebSMatt Macy col->rc_size, B_TRUE);
19267877fdebSMatt Macy abd_copy(col->rc_abd, bufs[c],
1927eda14cbcSMatt Macy col->rc_size);
1928eda14cbcSMatt Macy }
1929eda14cbcSMatt Macy }
1930eda14cbcSMatt Macy
1931eda14cbcSMatt Macy break;
1932eda14cbcSMatt Macy }
1933eda14cbcSMatt Macy }
19347877fdebSMatt Macy
19357877fdebSMatt Macy n = rr->rr_cols - rr->rr_firstdatacol;
19367877fdebSMatt Macy
19377877fdebSMatt Macy /*
19387877fdebSMatt Macy * Figure out which data columns are missing.
19397877fdebSMatt Macy */
1940eda14cbcSMatt Macy nmissing_rows = 0;
19417877fdebSMatt Macy for (t = 0; t < ntgts; t++) {
19427877fdebSMatt Macy if (tgts[t] >= rr->rr_firstdatacol) {
19437877fdebSMatt Macy missing_rows[nmissing_rows++] =
1944eda14cbcSMatt Macy tgts[t] - rr->rr_firstdatacol;
1945eda14cbcSMatt Macy }
19467877fdebSMatt Macy }
1947eda14cbcSMatt Macy
1948eda14cbcSMatt Macy /*
1949eda14cbcSMatt Macy * Figure out which parity columns to use to help generate the missing
1950eda14cbcSMatt Macy * data columns.
1951eda14cbcSMatt Macy */
1952eda14cbcSMatt Macy for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1953eda14cbcSMatt Macy ASSERT(tt < ntgts);
1954eda14cbcSMatt Macy ASSERT(c < rr->rr_firstdatacol);
1955eda14cbcSMatt Macy
1956eda14cbcSMatt Macy /*
1957eda14cbcSMatt Macy * Skip any targeted parity columns.
1958eda14cbcSMatt Macy */
19597877fdebSMatt Macy if (c == tgts[tt]) {
1960eda14cbcSMatt Macy tt++;
1961eda14cbcSMatt Macy continue;
1962eda14cbcSMatt Macy }
19637877fdebSMatt Macy
19647877fdebSMatt Macy parity_map[i] = c;
1965eda14cbcSMatt Macy i++;
1966eda14cbcSMatt Macy }
1967eda14cbcSMatt Macy
1968eda14cbcSMatt Macy psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1969eda14cbcSMatt Macy nmissing_rows * n + sizeof (used[0]) * n;
19707877fdebSMatt Macy p = kmem_alloc(psize, KM_SLEEP);
1971eda14cbcSMatt Macy
1972eda14cbcSMatt Macy for (pp = p, i = 0; i < nmissing_rows; i++) {
1973eda14cbcSMatt Macy rows[i] = pp;
1974eda14cbcSMatt Macy pp += n;
1975eda14cbcSMatt Macy invrows[i] = pp;
1976eda14cbcSMatt Macy pp += n;
19777877fdebSMatt Macy }
1978eda14cbcSMatt Macy used = pp;
19797877fdebSMatt Macy
19807877fdebSMatt Macy for (i = 0; i < nmissing_rows; i++) {
1981eda14cbcSMatt Macy used[i] = parity_map[i];
1982eda14cbcSMatt Macy }
19837877fdebSMatt Macy
1984eda14cbcSMatt Macy for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1985eda14cbcSMatt Macy if (tt < nmissing_rows &&
1986eda14cbcSMatt Macy c == missing_rows[tt] + rr->rr_firstdatacol) {
1987eda14cbcSMatt Macy tt++;
1988eda14cbcSMatt Macy continue;
1989eda14cbcSMatt Macy }
19907877fdebSMatt Macy
1991eda14cbcSMatt Macy ASSERT3S(i, <, n);
1992eda14cbcSMatt Macy used[i] = c;
1993eda14cbcSMatt Macy i++;
1994eda14cbcSMatt Macy }
1995eda14cbcSMatt Macy
19967877fdebSMatt Macy /*
19977877fdebSMatt Macy * Initialize the interesting rows of the matrix.
19987877fdebSMatt Macy */
19997877fdebSMatt Macy vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
20007877fdebSMatt Macy
20017877fdebSMatt Macy /*
20027877fdebSMatt Macy * Invert the matrix.
20037877fdebSMatt Macy */
20047877fdebSMatt Macy vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
20057877fdebSMatt Macy invrows, used);
20067877fdebSMatt Macy
20077877fdebSMatt Macy /*
20087877fdebSMatt Macy * Reconstruct the missing data using the generated matrix.
20097877fdebSMatt Macy */
20107877fdebSMatt Macy vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
20117877fdebSMatt Macy invrows, used);
20127877fdebSMatt Macy
20137877fdebSMatt Macy kmem_free(p, psize);
20147877fdebSMatt Macy
20157877fdebSMatt Macy /*
20167877fdebSMatt Macy * Copy back from the temporary linear ABDs and free them.
20177877fdebSMatt Macy */
20187877fdebSMatt Macy if (bufs) {
20197877fdebSMatt Macy for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
20207877fdebSMatt Macy raidz_col_t *col = &rr->rr_col[c];
20217877fdebSMatt Macy
20227877fdebSMatt Macy if (bufs[c] != NULL) {
20237877fdebSMatt Macy abd_copy(bufs[c], col->rc_abd, col->rc_size);
20247877fdebSMatt Macy abd_free(col->rc_abd);
20257877fdebSMatt Macy }
20267877fdebSMatt Macy col->rc_abd = bufs[c];
20277877fdebSMatt Macy }
20287877fdebSMatt Macy kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
20297877fdebSMatt Macy }
20307877fdebSMatt Macy }
20317877fdebSMatt Macy
20327877fdebSMatt Macy static void
20337877fdebSMatt Macy vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
20347877fdebSMatt Macy const int *t, int nt)
20357877fdebSMatt Macy {
20367877fdebSMatt Macy int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
20377877fdebSMatt Macy int ntgts;
20387877fdebSMatt Macy int i, c, ret;
20397877fdebSMatt Macy int nbadparity, nbaddata;
20407877fdebSMatt Macy int parity_valid[VDEV_RAIDZ_MAXPARITY];
20417877fdebSMatt Macy
20427877fdebSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
20437877fdebSMatt Macy zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
20447877fdebSMatt Macy rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
20457877fdebSMatt Macy (int)rr->rr_missingparity);
20467877fdebSMatt Macy }
20477877fdebSMatt Macy
20487877fdebSMatt Macy nbadparity = rr->rr_firstdatacol;
20497877fdebSMatt Macy nbaddata = rr->rr_cols - nbadparity;
20507877fdebSMatt Macy ntgts = 0;
20517877fdebSMatt Macy for (i = 0, c = 0; c < rr->rr_cols; c++) {
20527877fdebSMatt Macy if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
20537877fdebSMatt Macy zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
20547877fdebSMatt Macy "offset=%llx error=%u)",
20557877fdebSMatt Macy rr, c, (int)rr->rr_col[c].rc_devidx,
20567877fdebSMatt Macy (long long)rr->rr_col[c].rc_offset,
20577877fdebSMatt Macy (int)rr->rr_col[c].rc_error);
20587877fdebSMatt Macy }
20597877fdebSMatt Macy if (c < rr->rr_firstdatacol)
20607877fdebSMatt Macy parity_valid[c] = B_FALSE;
20617877fdebSMatt Macy
20627877fdebSMatt Macy if (i < nt && c == t[i]) {
20637877fdebSMatt Macy tgts[ntgts++] = c;
20647877fdebSMatt Macy i++;
20657877fdebSMatt Macy } else if (rr->rr_col[c].rc_error != 0) {
20667877fdebSMatt Macy tgts[ntgts++] = c;
20677877fdebSMatt Macy } else if (c >= rr->rr_firstdatacol) {
20687877fdebSMatt Macy nbaddata--;
20697877fdebSMatt Macy } else {
20707877fdebSMatt Macy parity_valid[c] = B_TRUE;
20717877fdebSMatt Macy nbadparity--;
20727877fdebSMatt Macy }
20737877fdebSMatt Macy }
20747877fdebSMatt Macy
20757877fdebSMatt Macy ASSERT(ntgts >= nt);
20767877fdebSMatt Macy ASSERT(nbaddata >= 0);
20777877fdebSMatt Macy ASSERT(nbaddata + nbadparity == ntgts);
20787877fdebSMatt Macy
20797877fdebSMatt Macy dt = &tgts[nbadparity];
20807877fdebSMatt Macy
20817877fdebSMatt Macy /* Reconstruct using the new math implementation */
20827877fdebSMatt Macy ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
20837877fdebSMatt Macy if (ret != RAIDZ_ORIGINAL_IMPL)
20847877fdebSMatt Macy return;
20857877fdebSMatt Macy
20867877fdebSMatt Macy /*
20877877fdebSMatt Macy * See if we can use any of our optimized reconstruction routines.
20887877fdebSMatt Macy */
20897877fdebSMatt Macy switch (nbaddata) {
20907877fdebSMatt Macy case 1:
20917877fdebSMatt Macy if (parity_valid[VDEV_RAIDZ_P]) {
20927877fdebSMatt Macy vdev_raidz_reconstruct_p(rr, dt, 1);
20937877fdebSMatt Macy return;
20947877fdebSMatt Macy }
20957877fdebSMatt Macy
20967877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1);
20977877fdebSMatt Macy
20987877fdebSMatt Macy if (parity_valid[VDEV_RAIDZ_Q]) {
20997877fdebSMatt Macy vdev_raidz_reconstruct_q(rr, dt, 1);
21007877fdebSMatt Macy return;
21017877fdebSMatt Macy }
21027877fdebSMatt Macy
21037877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2);
21047877fdebSMatt Macy break;
21057877fdebSMatt Macy
21067877fdebSMatt Macy case 2:
21077877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 1);
21087877fdebSMatt Macy
21097877fdebSMatt Macy if (parity_valid[VDEV_RAIDZ_P] &&
21107877fdebSMatt Macy parity_valid[VDEV_RAIDZ_Q]) {
21117877fdebSMatt Macy vdev_raidz_reconstruct_pq(rr, dt, 2);
21127877fdebSMatt Macy return;
21137877fdebSMatt Macy }
21147877fdebSMatt Macy
21157877fdebSMatt Macy ASSERT(rr->rr_firstdatacol > 2);
21167877fdebSMatt Macy
21177877fdebSMatt Macy break;
21187877fdebSMatt Macy }
21197877fdebSMatt Macy
21207877fdebSMatt Macy vdev_raidz_reconstruct_general(rr, tgts, ntgts);
21217877fdebSMatt Macy }
21227877fdebSMatt Macy
21237877fdebSMatt Macy static int
21247877fdebSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
21257877fdebSMatt Macy uint64_t *logical_ashift, uint64_t *physical_ashift)
21267877fdebSMatt Macy {
21277877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
21287877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity;
21297877fdebSMatt Macy int c;
21307877fdebSMatt Macy int lasterror = 0;
21317877fdebSMatt Macy int numerrors = 0;
21327877fdebSMatt Macy
21337877fdebSMatt Macy ASSERT(nparity > 0);
21347877fdebSMatt Macy
21357877fdebSMatt Macy if (nparity > VDEV_RAIDZ_MAXPARITY ||
21367877fdebSMatt Macy vd->vdev_children < nparity + 1) {
21377877fdebSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
21387877fdebSMatt Macy return (SET_ERROR(EINVAL));
21397877fdebSMatt Macy }
21407877fdebSMatt Macy
21417877fdebSMatt Macy vdev_open_children(vd);
21427877fdebSMatt Macy
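	/*
	 * Note the MIN(x - 1, y - 1) + 1 idiom in the loop below: *asize
	 * and *max_asize arrive zeroed from vdev_open(), so the unsigned
	 * wrap to UINT64_MAX lets the first successfully opened child
	 * seed the minimum without a special case.
	 */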
21437877fdebSMatt Macy for (c = 0; c < vd->vdev_children; c++) {
21447877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c];
21457877fdebSMatt Macy
21467877fdebSMatt Macy if (cvd->vdev_open_error != 0) {
21477877fdebSMatt Macy lasterror = cvd->vdev_open_error;
21487877fdebSMatt Macy numerrors++;
21497877fdebSMatt Macy continue;
21507877fdebSMatt Macy }
21517877fdebSMatt Macy
21527877fdebSMatt Macy *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
21537877fdebSMatt Macy *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
21547877fdebSMatt Macy *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
21557877fdebSMatt Macy }
21567877fdebSMatt Macy for (c = 0; c < vd->vdev_children; c++) {
21577877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[c];
21587877fdebSMatt Macy
21597877fdebSMatt Macy if (cvd->vdev_open_error != 0)
21607877fdebSMatt Macy continue;
21617877fdebSMatt Macy *physical_ashift = vdev_best_ashift(*logical_ashift,
21627877fdebSMatt Macy *physical_ashift, cvd->vdev_physical_ashift);
21637877fdebSMatt Macy }
21647877fdebSMatt Macy
21657877fdebSMatt Macy if (vd->vdev_rz_expanding) {
21667877fdebSMatt Macy *asize *= vd->vdev_children - 1;
21677877fdebSMatt Macy *max_asize *= vd->vdev_children - 1;
21687877fdebSMatt Macy
21697877fdebSMatt Macy vd->vdev_min_asize = *asize;
21707877fdebSMatt Macy } else {
21717877fdebSMatt Macy *asize *= vd->vdev_children;
21727877fdebSMatt Macy *max_asize *= vd->vdev_children;
21737877fdebSMatt Macy }
21747877fdebSMatt Macy
21757877fdebSMatt Macy if (numerrors > nparity) {
21767877fdebSMatt Macy vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
21777877fdebSMatt Macy return (lasterror);
21787877fdebSMatt Macy }
21797877fdebSMatt Macy
21807877fdebSMatt Macy return (0);
21817877fdebSMatt Macy }
21827877fdebSMatt Macy
21837877fdebSMatt Macy static void
21847877fdebSMatt Macy vdev_raidz_close(vdev_t *vd)
21857877fdebSMatt Macy {
21867877fdebSMatt Macy for (int c = 0; c < vd->vdev_children; c++) {
21877877fdebSMatt Macy if (vd->vdev_child[c] != NULL)
21887877fdebSMatt Macy vdev_close(vd->vdev_child[c]);
21897877fdebSMatt Macy }
21907877fdebSMatt Macy }
21917877fdebSMatt Macy
21927877fdebSMatt Macy /*
21937877fdebSMatt Macy * Return the logical width to use, given the txg in which the allocation
21947877fdebSMatt Macy * happened. Note that BP_GET_BIRTH() is usually the txg in which the
21957877fdebSMatt Macy * BP was allocated. Remapped BP's (that were relocated due to device
21967877fdebSMatt Macy * removal, see remap_blkptr_cb()) will have a more recent physical birth
21977877fdebSMatt Macy * which reflects when the BP was relocated, but we can ignore these because
21987877fdebSMatt Macy * they can't be on RAIDZ (device removal doesn't support RAIDZ).
21997877fdebSMatt Macy */
22007877fdebSMatt Macy static uint64_t
22017877fdebSMatt Macy vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
22027877fdebSMatt Macy {
22037877fdebSMatt Macy reflow_node_t lookup = {
22047877fdebSMatt Macy .re_txg = txg,
22057877fdebSMatt Macy };
22067877fdebSMatt Macy avl_index_t where;
22077877fdebSMatt Macy
22087877fdebSMatt Macy uint64_t width;
22097877fdebSMatt Macy mutex_enter(&vdrz->vd_expand_lock);
22107877fdebSMatt Macy reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
22117877fdebSMatt Macy if (re != NULL) {
22127877fdebSMatt Macy width = re->re_logical_width;
22137877fdebSMatt Macy } else {
22147877fdebSMatt Macy re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
22157877fdebSMatt Macy if (re != NULL)
22167877fdebSMatt Macy width = re->re_logical_width;
22177877fdebSMatt Macy else
22187877fdebSMatt Macy width = vdrz->vd_original_width;
22197877fdebSMatt Macy }
22207877fdebSMatt Macy mutex_exit(&vdrz->vd_expand_lock);
22217877fdebSMatt Macy return (width);
22227877fdebSMatt Macy }
22237877fdebSMatt Macy
22247877fdebSMatt Macy /*
22257877fdebSMatt Macy * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
22267877fdebSMatt Macy * more space due to the lower data-to-parity ratio. In this case it's
22277877fdebSMatt Macy * important to pass in the correct txg. Note that vdev_gang_header_asize()
22287877fdebSMatt Macy * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
22297877fdebSMatt Macy * regardless of txg. This is assured because for a single data sector, we
22307877fdebSMatt Macy * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
22317877fdebSMatt Macy */
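/*
 * Worked example (hypothetical numbers): with ashift=9, nparity=2 and
 * cols=6 (a 6-wide raidz2), psize=8192 yields asize=16 data sectors.
 * Those 16 sectors span ceil(16/4)=4 rows, each charged 2 parity
 * sectors, giving 16+8=24; 24 is already a multiple of nparity+1=3, so
 * the result is 24 << 9 = 12288 bytes. For a single data sector
 * (psize=512), asize=1+2=3=P+1 sectors regardless of cols, which is the
 * invariant vdev_gang_header_asize() depends on.
 */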
22327877fdebSMatt Macy static uint64_t
22337877fdebSMatt Macy vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
22347877fdebSMatt Macy {
22357877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
22367877fdebSMatt Macy uint64_t asize;
22377877fdebSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift;
22387877fdebSMatt Macy uint64_t cols = vdrz->vd_original_width;
22397877fdebSMatt Macy uint64_t nparity = vdrz->vd_nparity;
22407877fdebSMatt Macy
22417877fdebSMatt Macy cols = vdev_raidz_get_logical_width(vdrz, txg);
22427877fdebSMatt Macy
22437877fdebSMatt Macy asize = ((psize - 1) >> ashift) + 1;
22447877fdebSMatt Macy asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
22457877fdebSMatt Macy asize = roundup(asize, nparity + 1) << ashift;
22467877fdebSMatt Macy
22477877fdebSMatt Macy #ifdef ZFS_DEBUG
22487877fdebSMatt Macy uint64_t asize_new = ((psize - 1) >> ashift) + 1;
22497877fdebSMatt Macy uint64_t ncols_new = vdrz->vd_physical_width;
22507877fdebSMatt Macy asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
22517877fdebSMatt Macy (ncols_new - nparity));
22527877fdebSMatt Macy asize_new = roundup(asize_new, nparity + 1) << ashift;
22537877fdebSMatt Macy VERIFY3U(asize_new, <=, asize);
22547877fdebSMatt Macy #endif
22557877fdebSMatt Macy
22567877fdebSMatt Macy return (asize);
22577877fdebSMatt Macy }
22587877fdebSMatt Macy
22597877fdebSMatt Macy /*
22607877fdebSMatt Macy * The allocatable space for a raidz vdev is N * sizeof(smallest child)
22617877fdebSMatt Macy * so each child must provide at least 1/Nth of its asize.
22627877fdebSMatt Macy */
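/*
 * For example, a 6-wide raidz whose vdev_min_asize is 60 units requires
 * each child to provide at least ceil(60 / 6) = 10 units.
 */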
22637877fdebSMatt Macy static uint64_t
22647877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd)
22657877fdebSMatt Macy {
22667877fdebSMatt Macy return ((vd->vdev_min_asize + vd->vdev_children - 1) /
22677877fdebSMatt Macy vd->vdev_children);
22687877fdebSMatt Macy }
22697877fdebSMatt Macy
22707877fdebSMatt Macy void
22717877fdebSMatt Macy vdev_raidz_child_done(zio_t *zio)
22727877fdebSMatt Macy {
22737877fdebSMatt Macy raidz_col_t *rc = zio->io_private;
22747877fdebSMatt Macy
22757877fdebSMatt Macy ASSERT3P(rc->rc_abd, !=, NULL);
22767877fdebSMatt Macy rc->rc_error = zio->io_error;
22777877fdebSMatt Macy rc->rc_tried = 1;
22787877fdebSMatt Macy rc->rc_skipped = 0;
22797877fdebSMatt Macy }
22807877fdebSMatt Macy
22817877fdebSMatt Macy static void
22827877fdebSMatt Macy vdev_raidz_shadow_child_done(zio_t *zio)
22837877fdebSMatt Macy {
22847877fdebSMatt Macy raidz_col_t *rc = zio->io_private;
22857877fdebSMatt Macy
22867877fdebSMatt Macy rc->rc_shadow_error = zio->io_error;
22877877fdebSMatt Macy }
22887877fdebSMatt Macy
22897877fdebSMatt Macy static void
22907877fdebSMatt Macy vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
22917877fdebSMatt Macy {
22927877fdebSMatt Macy (void) rm;
22937877fdebSMatt Macy #ifdef ZFS_DEBUG
22947877fdebSMatt Macy range_seg64_t logical_rs, physical_rs, remain_rs;
22957877fdebSMatt Macy logical_rs.rs_start = rr->rr_offset;
22967877fdebSMatt Macy logical_rs.rs_end = logical_rs.rs_start +
22977877fdebSMatt Macy vdev_raidz_asize(zio->io_vd, rr->rr_size,
22987877fdebSMatt Macy BP_GET_BIRTH(zio->io_bp));
22997877fdebSMatt Macy
23007877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[col];
23017877fdebSMatt Macy vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
23027877fdebSMatt Macy
23037877fdebSMatt Macy vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
23047877fdebSMatt Macy ASSERT(vdev_xlate_is_empty(&remain_rs));
23057877fdebSMatt Macy if (vdev_xlate_is_empty(&physical_rs)) {
23067877fdebSMatt Macy /*
23077877fdebSMatt Macy * If we are in the middle of expansion, the
23087877fdebSMatt Macy * physical->logical mapping is changing so vdev_xlate()
23097877fdebSMatt Macy * can't give us a reliable answer.
23107877fdebSMatt Macy */
23117877fdebSMatt Macy return;
23127877fdebSMatt Macy }
23137877fdebSMatt Macy ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
23147877fdebSMatt Macy ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
23157877fdebSMatt Macy /*
23167877fdebSMatt Macy * It would be nice to assert that rs_end is equal
23177877fdebSMatt Macy * to rc_offset + rc_size but there might be an
23187877fdebSMatt Macy * optional I/O at the end that is not accounted in
23197877fdebSMatt Macy * rc_size.
23207877fdebSMatt Macy */
23217877fdebSMatt Macy if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
23227877fdebSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
23237877fdebSMatt Macy rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
23247877fdebSMatt Macy } else {
23257877fdebSMatt Macy ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
23267877fdebSMatt Macy }
23277877fdebSMatt Macy #endif
23287877fdebSMatt Macy }
23297877fdebSMatt Macy
2330eda14cbcSMatt Macy static void
2331eda14cbcSMatt Macy vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
2332eda14cbcSMatt Macy {
2333eda14cbcSMatt Macy vdev_t *vd = zio->io_vd;
2334eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
2335eda14cbcSMatt Macy
23367877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
23377877fdebSMatt Macy
23387877fdebSMatt Macy for (int c = 0; c < rr->rr_scols; c++) {
23397877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
2340eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2341eda14cbcSMatt Macy
2342eda14cbcSMatt Macy /* Verify physical to logical translation */
23437877fdebSMatt Macy vdev_raidz_io_verify(zio, rm, rr, c);
23447877fdebSMatt Macy
2345eda14cbcSMatt Macy if (rc->rc_size == 0)
23467877fdebSMatt Macy continue;
23477877fdebSMatt Macy
23487877fdebSMatt Macy ASSERT3U(rc->rc_offset + rc->rc_size, <,
23497877fdebSMatt Macy cvd->vdev_psize - VDEV_LABEL_END_SIZE);
23507877fdebSMatt Macy
23517877fdebSMatt Macy ASSERT3P(rc->rc_abd, !=, NULL);
23527877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
23537877fdebSMatt Macy rc->rc_offset, rc->rc_abd,
23547877fdebSMatt Macy abd_get_size(rc->rc_abd), zio->io_type,
23557877fdebSMatt Macy zio->io_priority, 0, vdev_raidz_child_done, rc));
23567877fdebSMatt Macy
23577877fdebSMatt Macy if (rc->rc_shadow_devidx != INT_MAX) {
23587877fdebSMatt Macy vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
23597877fdebSMatt Macy
23607877fdebSMatt Macy ASSERT3U(
23617877fdebSMatt Macy rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
23627877fdebSMatt Macy cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
23637877fdebSMatt Macy
23647877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
23657877fdebSMatt Macy rc->rc_shadow_offset, rc->rc_abd,
23667877fdebSMatt Macy abd_get_size(rc->rc_abd),
23677877fdebSMatt Macy zio->io_type, zio->io_priority, 0,
23687877fdebSMatt Macy vdev_raidz_shadow_child_done, rc));
23697877fdebSMatt Macy }
23707877fdebSMatt Macy }
23717877fdebSMatt Macy }
23727877fdebSMatt Macy
23737877fdebSMatt Macy /*
23747877fdebSMatt Macy * Generate optional I/Os for skip sectors to improve aggregation contiguity.
23757877fdebSMatt Macy * This only works for vdev_raidz_map_alloc() (not _expanded()).
2376eda14cbcSMatt Macy */
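/*
 * Sketch of the intended effect (assuming the usual vdev queue handling
 * of optional I/Os): a skip sector leaves a one-sector hole between two
 * real child writes on the same disk. Issuing an optional, NODATA write
 * for the hole lets the vdev queue aggregate the neighboring writes into
 * one larger I/O; if it can't be aggregated, the optional child I/O is
 * dropped rather than issued.
 */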
2377eda14cbcSMatt Macy static void
23787877fdebSMatt Macy raidz_start_skip_writes(zio_t *zio)
23797877fdebSMatt Macy {
23807877fdebSMatt Macy vdev_t *vd = zio->io_vd;
2381eda14cbcSMatt Macy uint64_t ashift = vd->vdev_top->vdev_ashift;
23827877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
23837877fdebSMatt Macy ASSERT3U(rm->rm_nrows, ==, 1);
2384eda14cbcSMatt Macy raidz_row_t *rr = rm->rm_row[0];
2385eda14cbcSMatt Macy for (int c = 0; c < rr->rr_scols; c++) {
2386eda14cbcSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
2387eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2388eda14cbcSMatt Macy if (rc->rc_size != 0)
2389eda14cbcSMatt Macy continue;
2390eda14cbcSMatt Macy ASSERT3P(rc->rc_abd, ==, NULL);
2391eda14cbcSMatt Macy
23927877fdebSMatt Macy ASSERT3U(rc->rc_offset, <,
2393eda14cbcSMatt Macy cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2394eda14cbcSMatt Macy
2395eda14cbcSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2396eda14cbcSMatt Macy NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
23977877fdebSMatt Macy ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
23987877fdebSMatt Macy }
23997877fdebSMatt Macy }
24007877fdebSMatt Macy
2401eda14cbcSMatt Macy static void
2402eda14cbcSMatt Macy vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2403eda14cbcSMatt Macy {
2404eda14cbcSMatt Macy vdev_t *vd = zio->io_vd;
2405eda14cbcSMatt Macy
2406eda14cbcSMatt Macy /*
24077877fdebSMatt Macy * Iterate over the columns in reverse order so that we hit the parity
2408eda14cbcSMatt Macy * last -- any errors along the way will force us to read the parity.
24097877fdebSMatt Macy */
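/*
 * Because the data columns are examined before the parity columns here,
 * an unreadable or stale data column increments rr_missingdata before
 * any parity column is reached, so the rr_missingdata > 0 test below
 * then forces the parity reads.
 */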
2410eda14cbcSMatt Macy for (int c = rr->rr_cols - 1; c >= 0; c--) {
2411eda14cbcSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
24127877fdebSMatt Macy if (rc->rc_size == 0)
2413eda14cbcSMatt Macy continue;
2414eda14cbcSMatt Macy vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2415eda14cbcSMatt Macy if (!vdev_readable(cvd)) {
24167877fdebSMatt Macy if (c >= rr->rr_firstdatacol)
2417eda14cbcSMatt Macy rr->rr_missingdata++;
24187877fdebSMatt Macy else
24197877fdebSMatt Macy rr->rr_missingparity++;
24207877fdebSMatt Macy rc->rc_error = SET_ERROR(ENXIO);
24217877fdebSMatt Macy rc->rc_tried = 1; /* don't even try */
24227877fdebSMatt Macy rc->rc_skipped = 1;
2423eda14cbcSMatt Macy continue;
24247877fdebSMatt Macy }
24257877fdebSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
24267877fdebSMatt Macy if (c >= rr->rr_firstdatacol)
24277877fdebSMatt Macy rr->rr_missingdata++;
24287877fdebSMatt Macy else
24297877fdebSMatt Macy rr->rr_missingparity++;
24307877fdebSMatt Macy rc->rc_error = SET_ERROR(ESTALE);
24317877fdebSMatt Macy rc->rc_skipped = 1;
24327877fdebSMatt Macy continue;
24337877fdebSMatt Macy }
24347877fdebSMatt Macy if (forceparity ||
24357877fdebSMatt Macy c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
24367877fdebSMatt Macy (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
24377877fdebSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2438eda14cbcSMatt Macy rc->rc_offset, rc->rc_abd, rc->rc_size,
2439eda14cbcSMatt Macy zio->io_type, zio->io_priority, 0,
2440eda14cbcSMatt Macy vdev_raidz_child_done, rc));
2441eda14cbcSMatt Macy }
2442eda14cbcSMatt Macy }
2443eda14cbcSMatt Macy }
2444eda14cbcSMatt Macy
24457877fdebSMatt Macy static void
24467877fdebSMatt Macy vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
24477877fdebSMatt Macy {
2448eda14cbcSMatt Macy vdev_t *vd = zio->io_vd;
2449eda14cbcSMatt Macy
2450eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nphys_cols; i++) {
24517877fdebSMatt Macy raidz_col_t *prc = &rm->rm_phys_col[i];
24527877fdebSMatt Macy if (prc->rc_size == 0)
24537877fdebSMatt Macy continue;
24547877fdebSMatt Macy
2455eda14cbcSMatt Macy ASSERT3U(prc->rc_devidx, ==, i);
24567877fdebSMatt Macy vdev_t *cvd = vd->vdev_child[i];
24577877fdebSMatt Macy if (!vdev_readable(cvd)) {
24587877fdebSMatt Macy prc->rc_error = SET_ERROR(ENXIO);
24597877fdebSMatt Macy prc->rc_tried = 1; /* don't even try */
2460eda14cbcSMatt Macy prc->rc_skipped = 1;
24617877fdebSMatt Macy continue;
24627877fdebSMatt Macy }
2463eda14cbcSMatt Macy if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
24647877fdebSMatt Macy prc->rc_error = SET_ERROR(ESTALE);
24657877fdebSMatt Macy prc->rc_skipped = 1;
24667877fdebSMatt Macy continue;
24677877fdebSMatt Macy }
24682c48331dSMatt Macy zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
24692c48331dSMatt Macy prc->rc_offset, prc->rc_abd, prc->rc_size,
24702c48331dSMatt Macy zio->io_type, zio->io_priority, 0,
2471eda14cbcSMatt Macy vdev_raidz_child_done, prc));
2472eda14cbcSMatt Macy }
24732c48331dSMatt Macy }
2474eda14cbcSMatt Macy
24757877fdebSMatt Macy static void
24767877fdebSMatt Macy vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
24777877fdebSMatt Macy {
24782c48331dSMatt Macy /*
2479eda14cbcSMatt Macy * If there are multiple rows, we will be hitting
2480eda14cbcSMatt Macy * all disks, so go ahead and read the parity so
2481eda14cbcSMatt Macy * that we are reading in decent size chunks.
2482eda14cbcSMatt Macy */
2483eda14cbcSMatt Macy boolean_t forceparity = rm->rm_nrows > 1;
2484eda14cbcSMatt Macy
2485eda14cbcSMatt Macy if (rm->rm_phys_col) {
2486eda14cbcSMatt Macy vdev_raidz_io_start_read_phys_cols(zio, rm);
24877877fdebSMatt Macy } else {
24887877fdebSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
24897877fdebSMatt Macy raidz_row_t *rr = rm->rm_row[i];
24907877fdebSMatt Macy vdev_raidz_io_start_read_row(zio, rr, forceparity);
24917877fdebSMatt Macy }
24927877fdebSMatt Macy }
24937877fdebSMatt Macy }
24947877fdebSMatt Macy
24957877fdebSMatt Macy /*
24967877fdebSMatt Macy * Start an IO operation on a RAIDZ VDev
24977877fdebSMatt Macy *
24987877fdebSMatt Macy * Outline:
24997877fdebSMatt Macy * - For write operations:
25007877fdebSMatt Macy * 1. Generate the parity data
25017877fdebSMatt Macy * 2. Create child zio write operations to each column's vdev, for both
25027877fdebSMatt Macy * data and parity.
25037877fdebSMatt Macy * 3. If the column skips any sectors for padding, create optional dummy
25047877fdebSMatt Macy * write zio children for those areas to improve aggregation continuity.
25057877fdebSMatt Macy * - For read operations:
25067877fdebSMatt Macy * 1. Create child zio read operations to each data column's vdev to read
25077877fdebSMatt Macy * the range of data required for zio.
25087877fdebSMatt Macy * 2. If this is a scrub or resilver operation, or if any of the data
2509eda14cbcSMatt Macy * vdevs have had errors, then create zio read operations to the parity
25107877fdebSMatt Macy * columns' VDevs as well.
2511eda14cbcSMatt Macy */
25127877fdebSMatt Macy static void
25137877fdebSMatt Macy vdev_raidz_io_start(zio_t *zio)
25147877fdebSMatt Macy {
25157877fdebSMatt Macy vdev_t *vd = zio->io_vd;
2516eda14cbcSMatt Macy vdev_t *tvd = vd->vdev_top;
25177877fdebSMatt Macy vdev_raidz_t *vdrz = vd->vdev_tsd;
2518eda14cbcSMatt Macy raidz_map_t *rm;
25197877fdebSMatt Macy
25207877fdebSMatt Macy uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
25217877fdebSMatt Macy BP_GET_BIRTH(zio->io_bp));
25227877fdebSMatt Macy if (logical_width != vdrz->vd_physical_width) {
25237877fdebSMatt Macy zfs_locked_range_t *lr = NULL;
25247877fdebSMatt Macy uint64_t synced_offset = UINT64_MAX;
25257877fdebSMatt Macy uint64_t next_offset = UINT64_MAX;
25267877fdebSMatt Macy boolean_t use_scratch = B_FALSE;
25277877fdebSMatt Macy /*
25287877fdebSMatt Macy * Note: when the expansion is completing, we set
25297877fdebSMatt Macy * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
25307877fdebSMatt Macy * in a later txg than when we last update spa_ubsync's state
25317877fdebSMatt Macy * (see the end of spa_raidz_expand_thread()). Therefore we
25327877fdebSMatt Macy * may see vre_state!=SCANNING before
25337877fdebSMatt Macy * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
25347877fdebSMatt Macy * on disk, but the copying progress has been synced to disk
25357877fdebSMatt Macy * (and reflected in spa_ubsync). In this case it's fine to
25367877fdebSMatt Macy * treat the expansion as completed, since if we crash there's
25377877fdebSMatt Macy * no additional copying to do.
25387877fdebSMatt Macy */
25397877fdebSMatt Macy if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
25407877fdebSMatt Macy ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
25417877fdebSMatt Macy &vdrz->vn_vre);
25427877fdebSMatt Macy lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
25437877fdebSMatt Macy zio->io_offset, zio->io_size, RL_READER);
2544eda14cbcSMatt Macy use_scratch =
25457877fdebSMatt Macy (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
25467877fdebSMatt Macy RRSS_SCRATCH_VALID);
25477877fdebSMatt Macy synced_offset =
25487877fdebSMatt Macy RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
25497877fdebSMatt Macy next_offset = vdrz->vn_vre.vre_offset;
2550eda14cbcSMatt Macy /*
2551eda14cbcSMatt Macy * If we haven't resumed expanding since importing the
2552eda14cbcSMatt Macy * pool, vre_offset won't have been set yet. In
2553eda14cbcSMatt Macy * this case the next offset to be copied is the same
2554eda14cbcSMatt Macy * as what was synced.
2555eda14cbcSMatt Macy */
2556eda14cbcSMatt Macy if (next_offset == UINT64_MAX) {
25577877fdebSMatt Macy next_offset = synced_offset;
25587877fdebSMatt Macy }
2559eda14cbcSMatt Macy }
2560eda14cbcSMatt Macy if (use_scratch) {
2561eda14cbcSMatt Macy zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2562eda14cbcSMatt Macy "%lld next_offset=%lld use_scratch=%u",
2563eda14cbcSMatt Macy zio,
2564eda14cbcSMatt Macy zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2565eda14cbcSMatt Macy (long long)zio->io_offset,
2566eda14cbcSMatt Macy (long long)synced_offset,
2567eda14cbcSMatt Macy (long long)next_offset,
2568eda14cbcSMatt Macy use_scratch);
2569eda14cbcSMatt Macy }
2570eda14cbcSMatt Macy
2571eda14cbcSMatt Macy rm = vdev_raidz_map_alloc_expanded(zio,
2572eda14cbcSMatt Macy tvd->vdev_ashift, vdrz->vd_physical_width,
2573eda14cbcSMatt Macy logical_width, vdrz->vd_nparity,
25747877fdebSMatt Macy synced_offset, next_offset, use_scratch);
25757877fdebSMatt Macy rm->rm_lr = lr;
2576eda14cbcSMatt Macy } else {
25777877fdebSMatt Macy rm = vdev_raidz_map_alloc(zio,
2578eda14cbcSMatt Macy tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
25797877fdebSMatt Macy }
2580eda14cbcSMatt Macy rm->rm_original_width = vdrz->vd_original_width;
2581eda14cbcSMatt Macy
25827877fdebSMatt Macy zio->io_vsd = rm;
2583eda14cbcSMatt Macy zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2584eda14cbcSMatt Macy if (zio->io_type == ZIO_TYPE_WRITE) {
2585eda14cbcSMatt Macy for (int i = 0; i < rm->rm_nrows; i++) {
2586eda14cbcSMatt Macy vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2587eda14cbcSMatt Macy }
25887877fdebSMatt Macy
25897877fdebSMatt Macy if (logical_width == vdrz->vd_physical_width) {
25907877fdebSMatt Macy raidz_start_skip_writes(zio);
25917877fdebSMatt Macy }
25927877fdebSMatt Macy } else {
25937877fdebSMatt Macy ASSERT(zio->io_type == ZIO_TYPE_READ);
2594eda14cbcSMatt Macy vdev_raidz_io_start_read(zio, rm);
2595eda14cbcSMatt Macy }
2596eda14cbcSMatt Macy
2597eda14cbcSMatt Macy zio_execute(zio);
2598eda14cbcSMatt Macy }
2599eda14cbcSMatt Macy
2600eda14cbcSMatt Macy /*
2601eda14cbcSMatt Macy * Report a checksum error for a child of a RAID-Z device.
2602eda14cbcSMatt Macy */
2603eda14cbcSMatt Macy void
2604eda14cbcSMatt Macy vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2605eda14cbcSMatt Macy {
2606eda14cbcSMatt Macy vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2607eda14cbcSMatt Macy
2608eda14cbcSMatt Macy if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2609eda14cbcSMatt Macy zio->io_priority != ZIO_PRIORITY_REBUILD) {
2610eda14cbcSMatt Macy zio_bad_cksum_t zbc;
2611eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
2612eda14cbcSMatt Macy
2613eda14cbcSMatt Macy zbc.zbc_has_cksum = 0;
26147877fdebSMatt Macy zbc.zbc_injected = rm->rm_ecksuminjected;
26157877fdebSMatt Macy
2616eda14cbcSMatt Macy mutex_enter(&vd->vdev_stat_lock);
2617eda14cbcSMatt Macy vd->vdev_stat.vs_checksum_errors++;
2618eda14cbcSMatt Macy mutex_exit(&vd->vdev_stat_lock);
2619eda14cbcSMatt Macy (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2620eda14cbcSMatt Macy &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2621eda14cbcSMatt Macy rc->rc_abd, bad_data, &zbc);
2622eda14cbcSMatt Macy }
2623eda14cbcSMatt Macy }
2624eda14cbcSMatt Macy
26257877fdebSMatt Macy /*
26267877fdebSMatt Macy * We keep track of whether or not there were any injected errors, so that
26277877fdebSMatt Macy * any ereports we generate can note it.
26287877fdebSMatt Macy */
2629eda14cbcSMatt Macy static int
2630eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
2631eda14cbcSMatt Macy {
2632eda14cbcSMatt Macy zio_bad_cksum_t zbc = {0};
2633eda14cbcSMatt Macy raidz_map_t *rm = zio->io_vsd;
2634eda14cbcSMatt Macy
2635eda14cbcSMatt Macy int ret = zio_checksum_error(zio, &zbc);
2636eda14cbcSMatt Macy if (ret != 0 && zbc.zbc_injected != 0)
2637eda14cbcSMatt Macy rm->rm_ecksuminjected = 1;
26387877fdebSMatt Macy
26397877fdebSMatt Macy return (ret);
2640eda14cbcSMatt Macy }
26417877fdebSMatt Macy
26427877fdebSMatt Macy /*
26437877fdebSMatt Macy * Generate the parity from the data columns. If we tried and were able to
26447877fdebSMatt Macy * read the parity without error, verify that the generated parity matches the
26457877fdebSMatt Macy * data we read. If it doesn't, we fire off a checksum error. Return the
26467877fdebSMatt Macy * number of such failures.
26477877fdebSMatt Macy */
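/*
 * For example, with raidz3 where P and Q were read successfully but R
 * errored out: orig[] snapshots the P and Q buffers as read, fresh
 * buffers are substituted, all parity is regenerated from the data
 * columns, and only P and Q are compared against the snapshots. R is
 * left with its regenerated contents for the repair pass in
 * vdev_raidz_io_done_verified().
 */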
26487877fdebSMatt Macy static int
26497877fdebSMatt Macy raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
26507877fdebSMatt Macy {
26517877fdebSMatt Macy abd_t *orig[VDEV_RAIDZ_MAXPARITY];
26527877fdebSMatt Macy int c, ret = 0;
26537877fdebSMatt Macy raidz_map_t *rm = zio->io_vsd;
26547877fdebSMatt Macy raidz_col_t *rc;
26557877fdebSMatt Macy
26567877fdebSMatt Macy blkptr_t *bp = zio->io_bp;
26577877fdebSMatt Macy enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
26587877fdebSMatt Macy (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
26597877fdebSMatt Macy
26607877fdebSMatt Macy if (checksum == ZIO_CHECKSUM_NOPARITY)
26617877fdebSMatt Macy return (ret);
26627877fdebSMatt Macy
26637877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) {
26647877fdebSMatt Macy rc = &rr->rr_col[c];
26657877fdebSMatt Macy if (!rc->rc_tried || rc->rc_error != 0)
26667877fdebSMatt Macy continue;
26677877fdebSMatt Macy
26687877fdebSMatt Macy orig[c] = rc->rc_abd;
26697877fdebSMatt Macy ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
26707877fdebSMatt Macy rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
26717877fdebSMatt Macy }
26727877fdebSMatt Macy
26737877fdebSMatt Macy /*
26747877fdebSMatt Macy * Verify any empty sectors are zero filled to ensure the parity
26757877fdebSMatt Macy * is calculated correctly even if these non-data sectors are damaged.
26767877fdebSMatt Macy */
26777877fdebSMatt Macy if (rr->rr_nempty && rr->rr_abd_empty != NULL)
26787877fdebSMatt Macy ret += vdev_draid_map_verify_empty(zio, rr);
26797877fdebSMatt Macy
26807877fdebSMatt Macy /*
26817877fdebSMatt Macy * Regenerates parity even for !tried||rc_error!=0 columns. This
26827877fdebSMatt Macy * isn't harmful but it does have the side effect of silently fixing
26837877fdebSMatt Macy * damage we hadn't detected (i.e. even if we return 0).
26847877fdebSMatt Macy */
26857877fdebSMatt Macy vdev_raidz_generate_parity_row(rm, rr);
26867877fdebSMatt Macy
26877877fdebSMatt Macy for (c = 0; c < rr->rr_firstdatacol; c++) {
26887877fdebSMatt Macy rc = &rr->rr_col[c];
26897877fdebSMatt Macy
26907877fdebSMatt Macy if (!rc->rc_tried || rc->rc_error != 0)
26917877fdebSMatt Macy continue;
26927877fdebSMatt Macy
26937877fdebSMatt Macy if (abd_cmp(orig[c], rc->rc_abd) != 0) {
26947877fdebSMatt Macy zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
26957877fdebSMatt Macy c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
26967877fdebSMatt Macy vdev_raidz_checksum_error(zio, rc, orig[c]);
26977877fdebSMatt Macy rc->rc_error = SET_ERROR(ECKSUM);
26987877fdebSMatt Macy ret++;
26997877fdebSMatt Macy }
27007877fdebSMatt Macy abd_free(orig[c]);
27017877fdebSMatt Macy }
27027877fdebSMatt Macy
27037877fdebSMatt Macy return (ret);
27047877fdebSMatt Macy }
27057877fdebSMatt Macy
27067877fdebSMatt Macy static int
27077877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr)
27087877fdebSMatt Macy {
27097877fdebSMatt Macy int error = 0;
27107877fdebSMatt Macy
27117877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
27127877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_error);
27137877fdebSMatt Macy error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
27147877fdebSMatt Macy }
27157877fdebSMatt Macy
27167877fdebSMatt Macy return (error);
27177877fdebSMatt Macy }
27187877fdebSMatt Macy
27197877fdebSMatt Macy static void
27207877fdebSMatt Macy vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
27217877fdebSMatt Macy {
27227877fdebSMatt Macy int unexpected_errors = 0;
27237877fdebSMatt Macy int parity_errors = 0;
27247877fdebSMatt Macy int parity_untried = 0;
27257877fdebSMatt Macy int data_errors = 0;
27267877fdebSMatt Macy
27277877fdebSMatt Macy ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
27287877fdebSMatt Macy
27297877fdebSMatt Macy for (int c = 0; c < rr->rr_cols; c++) {
27307877fdebSMatt Macy raidz_col_t *rc = &rr->rr_col[c];
27317877fdebSMatt Macy
27327877fdebSMatt Macy if (rc->rc_error) {
27337877fdebSMatt Macy if (c < rr->rr_firstdatacol)
27347877fdebSMatt Macy parity_errors++;
27357877fdebSMatt Macy else
27367877fdebSMatt Macy data_errors++;
27377877fdebSMatt Macy
27387877fdebSMatt Macy if (!rc->rc_skipped)
27397877fdebSMatt Macy unexpected_errors++;
27407877fdebSMatt Macy } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2741eda14cbcSMatt Macy parity_untried++;
2742eda14cbcSMatt Macy }
2743eda14cbcSMatt Macy
27447877fdebSMatt Macy if (rc->rc_force_repair)
27457877fdebSMatt Macy unexpected_errors++;
2746eda14cbcSMatt Macy }
2747eda14cbcSMatt Macy
2748eda14cbcSMatt Macy /*
27497877fdebSMatt Macy * If we read more parity disks than were used for
27507877fdebSMatt Macy * reconstruction, confirm that the other parity disks produced
2751eda14cbcSMatt Macy * correct data.
2752eda14cbcSMatt Macy *
2753eda14cbcSMatt Macy * Note that we also regenerate parity when resilvering so we
2754eda14cbcSMatt Macy * can write it out to failed devices later.
2755eda14cbcSMatt Macy */
2756eda14cbcSMatt Macy if (parity_errors + parity_untried <
2757eda14cbcSMatt Macy rr->rr_firstdatacol - data_errors ||
2758eda14cbcSMatt Macy (zio->io_flags & ZIO_FLAG_RESILVER)) {
27597877fdebSMatt Macy int n = raidz_parity_verify(zio, rr);
27607877fdebSMatt Macy unexpected_errors += n;
27617877fdebSMatt Macy }
27627877fdebSMatt Macy
27637877fdebSMatt Macy if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2764eda14cbcSMatt Macy (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2765eda14cbcSMatt Macy /*
2766eda14cbcSMatt Macy * Use the good data we have in hand to repair damaged children.
2767 */
2768 for (int c = 0; c < rr->rr_cols; c++) {
2769 raidz_col_t *rc = &rr->rr_col[c];
2770 vdev_t *vd = zio->io_vd;
2771 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2772
2773 if (!rc->rc_allow_repair) {
2774 continue;
2775 } else if (!rc->rc_force_repair &&
2776 (rc->rc_error == 0 || rc->rc_size == 0)) {
2777 continue;
2778 }
2779
2780 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2781 "offset=%llx",
2782 zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2783
2784 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2785 rc->rc_offset, rc->rc_abd, rc->rc_size,
2786 ZIO_TYPE_WRITE,
2787 zio->io_priority == ZIO_PRIORITY_REBUILD ?
2788 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2789 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2790 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2791 }
2792 }
2793
2794 /*
2795 * Scrub or resilver i/o's: overwrite any shadow locations with the
2796 * good data. This ensures that if we've already copied this sector,
2797 * it will be corrected if it was damaged. This writes more than is
2798 * necessary, but since expansion is paused during scrub/resilver, at
2799 * most a single row will have a shadow location.
2800 */
2801 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2802 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2803 for (int c = 0; c < rr->rr_cols; c++) {
2804 raidz_col_t *rc = &rr->rr_col[c];
2805 vdev_t *vd = zio->io_vd;
2806
2807 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2808 continue;
2809 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2810
2811 /*
2812 * Note: We don't want to update the repair stats
2813 * because that would incorrectly indicate that there
2814 * was bad data to repair, which we aren't sure about.
2815 * By clearing the SCAN_THREAD flag, we prevent this
2816 * from happening, despite having the REPAIR flag set.
2817 * We need to set SELF_HEAL so that this i/o can't be
2818 * bypassed by zio_vdev_io_start().
2819 */
2820 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2821 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2822 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2823 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2824 NULL, NULL);
2825 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2826 zio_nowait(cio);
2827 }
2828 }
2829 }
2830
2831 static void
2832 raidz_restore_orig_data(raidz_map_t *rm)
2833 {
2834 for (int i = 0; i < rm->rm_nrows; i++) {
2835 raidz_row_t *rr = rm->rm_row[i];
2836 for (int c = 0; c < rr->rr_cols; c++) {
2837 raidz_col_t *rc = &rr->rr_col[c];
2838 if (rc->rc_need_orig_restore) {
2839 abd_copy(rc->rc_abd,
2840 rc->rc_orig_data, rc->rc_size);
2841 rc->rc_need_orig_restore = B_FALSE;
2842 }
2843 }
2844 }
2845 }
2846
2847 /*
2848 * During raidz_reconstruct() for an expanded VDEV, we need to give special
2849 * consideration to failure simulations. See the note in raidz_reconstruct()
2850 * on simulating failure of a pre-expansion device.
2851 *
2852 * Treating logical child i as failed, return TRUE if the given column should
2853 * be treated as failed. The idea of logical children allows us to imagine
2854 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2855 * succeed but return the wrong data). Since the expansion doesn't verify
2856 * checksums, the incorrect data will be moved to new locations spread among
2857 * the children (going diagonally across them).
2858 *
2859 * Higher "logical child failures" (values of `i`) indicate these
2860 * "pre-expansion failures". The first physical_width values imagine that a
2861 * current child failed; the next physical_width-1 values imagine that a
2862 * child failed before the most recent expansion; the next physical_width-2
2863 * values imagine a child failed in the expansion before that, etc.
2864 */
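/*
 * For example, with physical_width=6 and original_width=4 there are
 * 6+5+4=15 logical children: i=0..5 simulate failure of a child in the
 * current 6-wide layout, i=6..10 a child in the prior 5-wide layout, and
 * i=11..14 a child in the original 4-wide layout (see the table in
 * vdev_raidz_io_done()). A sector is treated as failed when its
 * round-robin position in that era's layout, sector_id % w, matches the
 * simulated child.
 */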
2865 static boolean_t
2866 raidz_simulate_failure(int physical_width, int original_width, int ashift,
2867 int i, raidz_col_t *rc)
2868 {
2869 uint64_t sector_id =
2870 physical_width * (rc->rc_offset >> ashift) +
2871 rc->rc_devidx;
2872
2873 for (int w = physical_width; w >= original_width; w--) {
2874 if (i < w) {
2875 return (sector_id % w == i);
2876 } else {
2877 i -= w;
2878 }
2879 }
2880 ASSERT(!"invalid logical child id");
2881 return (B_FALSE);
2882 }
2883
2884 /*
2885 * returns EINVAL if reconstruction of the block will not be possible
2886 * returns ECKSUM if this specific reconstruction failed
2887 * returns 0 on successful reconstruction
2888 */
2889 static int
2890 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2891 {
2892 raidz_map_t *rm = zio->io_vsd;
2893 int physical_width = zio->io_vd->vdev_children;
2894 int original_width = (rm->rm_original_width != 0) ?
2895 rm->rm_original_width : physical_width;
2896 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2897
2898 if (dbgmsg) {
2899 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2900 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2901 }
2902
2903 /* Reconstruct each row */
2904 for (int r = 0; r < rm->rm_nrows; r++) {
2905 raidz_row_t *rr = rm->rm_row[r];
2906 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2907 int t = 0;
2908 int dead = 0;
2909 int dead_data = 0;
2910
2911 if (dbgmsg)
2912 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2913
2914 for (int c = 0; c < rr->rr_cols; c++) {
2915 raidz_col_t *rc = &rr->rr_col[c];
2916 ASSERT0(rc->rc_need_orig_restore);
2917 if (rc->rc_error != 0) {
2918 dead++;
2919 if (c >= nparity)
2920 dead_data++;
2921 continue;
2922 }
2923 if (rc->rc_size == 0)
2924 continue;
2925 for (int lt = 0; lt < ntgts; lt++) {
2926 if (raidz_simulate_failure(physical_width,
2927 original_width,
2928 zio->io_vd->vdev_top->vdev_ashift,
2929 ltgts[lt], rc)) {
2930 if (rc->rc_orig_data == NULL) {
2931 rc->rc_orig_data =
2932 abd_alloc_linear(
2933 rc->rc_size, B_TRUE);
2934 abd_copy(rc->rc_orig_data,
2935 rc->rc_abd, rc->rc_size);
2936 }
2937 rc->rc_need_orig_restore = B_TRUE;
2938
2939 dead++;
2940 if (c >= nparity)
2941 dead_data++;
2942 /*
2943 * Note: simulating failure of a
2944 * pre-expansion device can hit more
2945 * than one column, in which case we
2946 * might try to simulate more failures
2947 * than can be reconstructed, which is
2948 * also more than the size of my_tgts.
2949 * This check prevents accessing past
2950 * the end of my_tgts. The "dead >
2951 * nparity" check below will fail this
2952 * reconstruction attempt.
2953 */
2954 if (t < VDEV_RAIDZ_MAXPARITY) {
2955 my_tgts[t++] = c;
2956 if (dbgmsg) {
2957 zfs_dbgmsg("simulating "
2958 "failure of col %u "
2959 "devidx %u", c,
2960 (int)rc->rc_devidx);
2961 }
2962 }
2963 break;
2964 }
2965 }
2966 }
2967 if (dead > nparity) {
2968 /* reconstruction not possible */
2969 if (dbgmsg) {
2970 zfs_dbgmsg("reconstruction not possible; "
2971 "too many failures");
2972 }
2973 raidz_restore_orig_data(rm);
2974 return (EINVAL);
2975 }
2976 if (dead_data > 0)
2977 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
2978 }
2979
2980 /* Check for success */
2981 if (raidz_checksum_verify(zio) == 0) {
2982
2983 /* Reconstruction succeeded - report errors */
2984 for (int i = 0; i < rm->rm_nrows; i++) {
2985 raidz_row_t *rr = rm->rm_row[i];
2986
2987 for (int c = 0; c < rr->rr_cols; c++) {
2988 raidz_col_t *rc = &rr->rr_col[c];
2989 if (rc->rc_need_orig_restore) {
2990 /*
2991 * Note: if this is a parity column,
2992 * we don't really know if it's wrong.
2993 * We need to let
2994 * vdev_raidz_io_done_verified() check
2995 * it, and if we set rc_error, it will
2996 * think that it is a "known" error
2997 * that doesn't need to be checked
2998 * or corrected.
2999 */
3000 if (rc->rc_error == 0 &&
3001 c >= rr->rr_firstdatacol) {
3002 vdev_raidz_checksum_error(zio,
3003 rc, rc->rc_orig_data);
3004 rc->rc_error =
3005 SET_ERROR(ECKSUM);
3006 }
3007 rc->rc_need_orig_restore = B_FALSE;
3008 }
3009 }
3010
3011 vdev_raidz_io_done_verified(zio, rr);
3012 }
3013
3014 zio_checksum_verified(zio);
3015
3016 if (dbgmsg) {
3017 zfs_dbgmsg("reconstruction successful "
3018 "(checksum verified)");
3019 }
3020 return (0);
3021 }
3022
3023 /* Reconstruction failed - restore original data */
3024 raidz_restore_orig_data(rm);
3025 if (dbgmsg) {
3026 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3027 "failed", zio);
3028 }
3029 return (ECKSUM);
3030 }
3031
3032 /*
3033 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3034 * Note that the algorithm below is non-optimal because it doesn't take into
3035 * account how reconstruction is actually performed. For example, with
3036 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
3037 * is targeted as invalid as if columns 1 and 4 are targeted since in both
3038 * cases we'd only use parity information in column 0.
3039 *
3040 * The order that we find the various possible combinations of failed
3041 * disks is dictated by these rules:
3042 * - Examine each "slot" (the "i" in tgts[i])
3043 * - Try to increment this slot (tgts[i] += 1)
3044 * - if we can't increment because it runs into the next slot,
3045 * reset our slot to the minimum, and examine the next slot
3046 *
3047 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3048 * 3 columns to reconstruct), we will generate the following sequence:
3049 *
3050 * STATE ACTION
3051 * 0 1 2 special case: skip since these are all parity
3052 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3053 * 0 2 3 first slot: increment to 1
3054 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3055 * 0 1 4 first: reset to 0; middle: increment to 2
3056 * 0 2 4 first: increment to 1
3057 * 1 2 4 first: reset to 0; middle: increment to 3
3058 * 0 3 4 first: increment to 1
3059 * 1 3 4 first: increment to 2
3060 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3061 * 0 1 5 first: reset to 0; middle: increment to 2
3062 * 0 2 5 first: increment to 1
3063 * 1 2 5 first: reset to 0; middle: increment to 3
3064 * 0 3 5 first: increment to 1
3065 * 1 3 5 first: increment to 2
3066 * 2 3 5 first: reset to 0; middle: increment to 4
3067 * 0 4 5 first: increment to 1
3068 * 1 4 5 first: increment to 2
3069 * 2 4 5 first: increment to 3
3070 * 3 4 5 done
3071 *
3072 * This strategy works for dRAID but is less efficient when there are a large
3073 * number of child vdevs and therefore permutations to check. Furthermore,
3074 * since the raidz_map_t rows likely do not overlap, reconstruction would be
3075 * possible as long as there are no more than nparity data errors per row.
3076 * These additional permutations are not currently checked but could be as
3077 * a future improvement.
3078 *
3079 * Returns 0 on success, ECKSUM on failure.
3080 */
3081 static int
3082 vdev_raidz_combrec(zio_t *zio)
3083 {
3084 int nparity = vdev_get_nparity(zio->io_vd);
3085 raidz_map_t *rm = zio->io_vsd;
3086 int physical_width = zio->io_vd->vdev_children;
3087 int original_width = (rm->rm_original_width != 0) ?
3088 rm->rm_original_width : physical_width;
3089
3090 for (int i = 0; i < rm->rm_nrows; i++) {
3091 raidz_row_t *rr = rm->rm_row[i];
3092 int total_errors = 0;
3093
3094 for (int c = 0; c < rr->rr_cols; c++) {
3095 if (rr->rr_col[c].rc_error)
3096 total_errors++;
3097 }
3098
3099 if (total_errors > nparity)
3100 return (vdev_raidz_worst_error(rr));
3101 }
3102
3103 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3104 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3105 int *ltgts = &tstore[1]; /* value is logical child ID */
3106
3107
3108 /*
3109 * Determine number of logical children, n. See comment
3110 * above raidz_simulate_failure().
3111 */
3112 int n = 0;
3113 for (int w = physical_width;
3114 w >= original_width; w--) {
3115 n += w;
3116 }
3117
3118 ASSERT3U(num_failures, <=, nparity);
3119 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3120
3121 /* Handle corner cases in combrec logic */
3122 ltgts[-1] = -1;
3123 for (int i = 0; i < num_failures; i++) {
3124 ltgts[i] = i;
3125 }
3126 ltgts[num_failures] = n;
3127
3128 for (;;) {
3129 int err = raidz_reconstruct(zio, ltgts, num_failures,
3130 nparity);
3131 if (err == EINVAL) {
3132 /*
3133 * Reconstruction not possible with this #
3134 * failures; try more failures.
3135 */
3136 break;
3137 } else if (err == 0)
3138 return (0);
3139
3140 /* Compute next targets to try */
3141 for (int t = 0; ; t++) {
3142 ASSERT3U(t, <, num_failures);
3143 ltgts[t]++;
3144 if (ltgts[t] == n) {
3145 /* try more failures */
3146 ASSERT3U(t, ==, num_failures - 1);
3147 if (zfs_flags &
3148 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3149 zfs_dbgmsg("reconstruction "
3150 "failed for num_failures="
3151 "%u; tried all "
3152 "combinations",
3153 num_failures);
3154 }
3155 break;
3156 }
3157
3158 ASSERT3U(ltgts[t], <, n);
3159 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
3160
3161 /*
3162 * If that spot is available, we're done here.
3163 * Try the next combination.
3164 */
3165 if (ltgts[t] != ltgts[t + 1])
3166 break; // found next combination
3167
3168 /*
3169 * Otherwise, reset this tgt to the minimum,
3170 * and move on to the next tgt.
3171 */
3172 ltgts[t] = ltgts[t - 1] + 1;
3173 ASSERT3U(ltgts[t], ==, t);
3174 }
3175
3176 /* Increase the number of failures and keep trying. */
3177 if (ltgts[num_failures - 1] == n)
3178 break;
3179 }
3180 }
3181 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3182 zfs_dbgmsg("reconstruction failed for all num_failures");
3183 return (ECKSUM);
3184 }
3185
3186 void
3187 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3188 {
3189 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3190 raidz_row_t *rr = rm->rm_row[row];
3191 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3192 }
3193 }
3194
3195 /*
3196 * Complete a write IO operation on a RAIDZ VDev
3197 *
3198 * Outline:
3199 * 1. Check for errors on the child IOs.
3200 * 2. Return, setting an error code if too few child VDevs were written
3201 * to reconstruct the data later. Note that partial writes are
3202 * considered successful if they can be reconstructed at all.
3203 */
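/*
 * For example, with raidz2 (rr_firstdatacol == 2) a row tolerates up to
 * two failed normal child writes and, independently, up to two failed
 * shadow writes; a third failure in either set propagates the worst
 * child error to the parent zio.
 */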
3204 static void
3205 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3206 {
3207 int normal_errors = 0;
3208 int shadow_errors = 0;
3209
3210 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3211 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3212 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3213
3214 for (int c = 0; c < rr->rr_cols; c++) {
3215 raidz_col_t *rc = &rr->rr_col[c];
3216
3217 if (rc->rc_error != 0) {
3218 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
3219 normal_errors++;
3220 }
3221 if (rc->rc_shadow_error != 0) {
3222 ASSERT(rc->rc_shadow_error != ECKSUM);
3223 shadow_errors++;
3224 }
3225 }
3226
3227 /*
3228 * Treat partial writes as a success. If we couldn't write enough
3229 * columns to reconstruct the data, the I/O failed. Otherwise, good
3230 * enough. Note that in the case of a shadow write (during raidz
3231 * expansion), depending on if we crash, either the normal (old) or
3232 * shadow (new) location may become the "real" version of the block,
3233 * so both locations must have sufficient redundancy.
3234 *
3235 * Now that we support write reallocation, it would be better
3236 * to treat partial failure as real failure unless there are
3237 * no non-degraded top-level vdevs left, and not update DTLs
3238 * if we intend to reallocate.
3239 */
3240 if (normal_errors > rr->rr_firstdatacol ||
3241 shadow_errors > rr->rr_firstdatacol) {
3242 zio->io_error = zio_worst_error(zio->io_error,
3243 vdev_raidz_worst_error(rr));
3244 }
3245 }
3246
3247 static void
3248 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3249 raidz_row_t *rr)
3250 {
3251 int parity_errors = 0;
3252 int parity_untried = 0;
3253 int data_errors = 0;
3254 int total_errors = 0;
3255
3256 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3257 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3258
3259 for (int c = 0; c < rr->rr_cols; c++) {
3260 raidz_col_t *rc = &rr->rr_col[c];
3261
3262 /*
3263 * If scrubbing and a replacing/sparing child vdev determined
3264 * that not all of its children have an identical copy of the
3265 * data, then clear the error so the column is treated like
3266 * any other read and force a repair to correct the damage.
3267 */
3268 if (rc->rc_error == ECKSUM) {
3269 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3270 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3271 rc->rc_force_repair = 1;
3272 rc->rc_error = 0;
3273 }
3274
3275 if (rc->rc_error) {
3276 if (c < rr->rr_firstdatacol)
3277 parity_errors++;
3278 else
3279 data_errors++;
3280
3281 total_errors++;
3282 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
3283 parity_untried++;
3284 }
3285 }
3286
3287 /*
3288 * If there were data errors and the number of errors we saw was
3289 * correctable -- less than or equal to the number of parity disks read
3290 * -- reconstruct based on the missing data.
3291 */
3292 if (data_errors != 0 &&
3293 total_errors <= rr->rr_firstdatacol - parity_untried) {
3294 /*
3295 * We either attempt to read all the parity columns or
3296 * none of them. If we didn't try to read parity, we
3297 * wouldn't be here in the correctable case. There must
3298 * also have been fewer parity errors than parity
3299 * columns or, again, we wouldn't be in this code path.
3300 */
3301 ASSERT(parity_untried == 0);
3302 ASSERT(parity_errors < rr->rr_firstdatacol);
3303
3304 /*
3305 * Identify the data columns that reported an error.
3306 */
3307 int n = 0;
3308 int tgts[VDEV_RAIDZ_MAXPARITY];
3309 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3310 raidz_col_t *rc = &rr->rr_col[c];
3311 if (rc->rc_error != 0) {
3312 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3313 tgts[n++] = c;
3314 }
3315 }
3316
3317 ASSERT(rr->rr_firstdatacol >= n);
3318
3319 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
3320 }
3321 }
3322
3323 /*
3324 * Return the number of reads issued.
3325 */
3326 static int
3327 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3328 {
3329 vdev_t *vd = zio->io_vd;
3330 int nread = 0;
3331
3332 rr->rr_missingdata = 0;
3333 rr->rr_missingparity = 0;
3334
3335 /*
3336 * If this row contains empty sectors which are not required
3337 * for a normal read then allocate an ABD for them now so they
3338 * may be read, verified, and any needed repairs performed.
3339 */
3340 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
3341 vdev_draid_map_alloc_empty(zio, rr);
3342
3343 for (int c = 0; c < rr->rr_cols; c++) {
3344 raidz_col_t *rc = &rr->rr_col[c];
3345 if (rc->rc_tried || rc->rc_size == 0)
3346 continue;
3347
3348 zio_nowait(zio_vdev_child_io(zio, NULL,
3349 vd->vdev_child[rc->rc_devidx],
3350 rc->rc_offset, rc->rc_abd, rc->rc_size,
3351 zio->io_type, zio->io_priority, 0,
3352 vdev_raidz_child_done, rc));
3353 nread++;
3354 }
3355 return (nread);
3356 }
3357
3358 /*
3359 * We're here because either there were too many errors to even attempt
3360 * reconstruction (more errors than parity columns), or vdev_*_combrec()
3361 * failed. In either case, there is enough bad data to prevent reconstruction.
3362 * Start checksum ereports for all children which haven't failed.
3363 */
3364 static void
3365 vdev_raidz_io_done_unrecoverable(zio_t *zio)
3366 {
3367 raidz_map_t *rm = zio->io_vsd;
3368
3369 for (int i = 0; i < rm->rm_nrows; i++) {
3370 raidz_row_t *rr = rm->rm_row[i];
3371
3372 for (int c = 0; c < rr->rr_cols; c++) {
3373 raidz_col_t *rc = &rr->rr_col[c];
3374 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3375
3376 if (rc->rc_error != 0)
3377 continue;
3378
3379 zio_bad_cksum_t zbc;
3380 zbc.zbc_has_cksum = 0;
3381 zbc.zbc_injected = rm->rm_ecksuminjected;
3382
3383 mutex_enter(&cvd->vdev_stat_lock);
3384 cvd->vdev_stat.vs_checksum_errors++;
3385 mutex_exit(&cvd->vdev_stat_lock);
3386 (void) zfs_ereport_start_checksum(zio->io_spa,
3387 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3388 rc->rc_size, &zbc);
3389 }
3390 }
3391 }
3392
3393 void
3394 vdev_raidz_io_done(zio_t *zio)
3395 {
3396 raidz_map_t *rm = zio->io_vsd;
3397
3398 ASSERT(zio->io_bp != NULL);
3399 if (zio->io_type == ZIO_TYPE_WRITE) {
3400 for (int i = 0; i < rm->rm_nrows; i++) {
3401 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3402 }
3403 } else {
3404 if (rm->rm_phys_col) {
3405 /*
3406 * This is an aggregated read. Copy the data and status
3407 * from the aggregate abd's to the individual rows.
3408 */
3409 for (int i = 0; i < rm->rm_nrows; i++) {
3410 raidz_row_t *rr = rm->rm_row[i];
3411
3412 for (int c = 0; c < rr->rr_cols; c++) {
3413 raidz_col_t *rc = &rr->rr_col[c];
3414 if (rc->rc_tried || rc->rc_size == 0)
3415 continue;
3416
3417 raidz_col_t *prc =
3418 &rm->rm_phys_col[rc->rc_devidx];
3419 rc->rc_error = prc->rc_error;
3420 rc->rc_tried = prc->rc_tried;
3421 rc->rc_skipped = prc->rc_skipped;
3422 if (c >= rr->rr_firstdatacol) {
3423 /*
3424 * Note: this is slightly faster
3425 * than using abd_copy_off().
3426 */
3427 char *physbuf = abd_to_buf(
3428 prc->rc_abd);
3429 void *physloc = physbuf +
3430 rc->rc_offset -
3431 prc->rc_offset;
3432
3433 abd_copy_from_buf(rc->rc_abd,
3434 physloc, rc->rc_size);
3435 }
3436 }
3437 }
3438 }
3439
3440 for (int i = 0; i < rm->rm_nrows; i++) {
3441 raidz_row_t *rr = rm->rm_row[i];
3442 vdev_raidz_io_done_reconstruct_known_missing(zio,
3443 rm, rr);
3444 }
3445
3446 if (raidz_checksum_verify(zio) == 0) {
3447 for (int i = 0; i < rm->rm_nrows; i++) {
3448 raidz_row_t *rr = rm->rm_row[i];
3449 vdev_raidz_io_done_verified(zio, rr);
3450 }
3451 zio_checksum_verified(zio);
3452 } else {
3453 /*
3454 * A sequential resilver has no checksum, which makes
3455 * combinatorial reconstruction impossible. This code
3456 * path is unreachable since raidz_checksum_verify()
3457 * has no checksum to verify and must succeed.
3458 */
3459 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
3460
3461 /*
3462 * This isn't a typical situation -- either we got a
3463 * read error or a child silently returned bad data.
3464 * Read every block so we can try again with as much
3465 * data and parity as we can track down. If we've
3466 * already been through once before, all children will
3467 * be marked as tried so we'll proceed to combinatorial
3468 * reconstruction.
3469 */
3470 int nread = 0;
3471 for (int i = 0; i < rm->rm_nrows; i++) {
3472 nread += vdev_raidz_read_all(zio,
3473 rm->rm_row[i]);
3474 }
3475 if (nread != 0) {
3476 /*
3477 * Normally our stage is VDEV_IO_DONE, but if
3478 * we've already called redone(), it will have
3479 * changed to VDEV_IO_START, in which case we
3480 * don't want to call redone() again.
3481 */
3482 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3483 zio_vdev_io_redone(zio);
3484 return;
3485 }
3486 /*
3487 * It would be too expensive to try every possible
3488 * combination of failed sectors in every row, so
3489 * instead we try every combination of failed current or
3490 * past physical disk. This means that if the incorrect
3491 * sectors were all on at most Nparity disks at any point
3492 * in the past, we will find the correct data. The only
3493 * known case where this is less durable than a non-expanded
3494 * RAIDZ is if we have a silent failure during expansion.
3495 * In that case, one block could be partially in the old
3496 * format and partially in the new format, so we'd lose
3497 * some sectors from the old format and some from the
3498 * new format.
3499 *
3500 * e.g. logical_width=4 physical_width=6
3501 * the 15 (6+5+4) possible failed disks are:
3502 * width=6 child=0
3503 * width=6 child=1
3504 * width=6 child=2
3505 * width=6 child=3
3506 * width=6 child=4
3507 * width=6 child=5
3508 * width=5 child=0
3509 * width=5 child=1
3510 * width=5 child=2
3511 * width=5 child=3
3512 * width=5 child=4
3513 * width=4 child=0
3514 * width=4 child=1
3515 * width=4 child=2
3516 * width=4 child=3
3517 * And we will try every combination of Nparity of these
3518 * failing.
3519 *
3520 * As a first pass, we can generate every combo,
3521 * and try reconstructing, ignoring any known
3522 * failures. If any row has too many known + simulated
3523 * failures, then we bail on reconstructing with this
3524 * number of simulated failures. As an improvement,
3525 * we could detect the number of whole known failures
3526 * (i.e. we have known failures on these disks for
3527 * every row; the disks never succeeded), and
3528 * subtract that from the max # failures to simulate.
3529 * We could go even further like the current
3530 * combrec code, but that doesn't seem like it
3531 * gains us very much. If we simulate a failure
3532 * that is also a known failure, that's fine.
3533 */
3534 zio->io_error = vdev_raidz_combrec(zio);
3535 if (zio->io_error == ECKSUM &&
3536 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3537 vdev_raidz_io_done_unrecoverable(zio);
3538 }
3539 }
3540 }
3541 if (rm->rm_lr != NULL) {
3542 zfs_rangelock_exit(rm->rm_lr);
3543 rm->rm_lr = NULL;
3544 }
3545 }
3546
3547 static void
3548 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3549 {
3550 vdev_raidz_t *vdrz = vd->vdev_tsd;
3551 if (faulted > vdrz->vd_nparity)
3552 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3553 VDEV_AUX_NO_REPLICAS);
3554 else if (degraded + faulted != 0)
3555 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3556 else
3557 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3558 }
3559
3560 /*
3561 * Determine if any portion of the provided block resides on a child vdev
3562 * with a dirty DTL and therefore needs to be resilvered. The function
3563 * assumes that at least one DTL is dirty which implies that full stripe
3564 * width blocks must be resilvered.
3565 */
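/*
 * Worked example (hypothetical numbers): dcols=6, nparity=1, ashift=9.
 * A 1536-byte block (s=3) whose DVA offset begins at parent sector b=10
 * has first column f = 10 % 6 = 4, so its s+nparity = 4 sectors land on
 * children (4+c) % 6 for c=0..3, i.e. children 4, 5, 0 and 1; only those
 * children's DTLs need to be consulted.
 */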
3566 static boolean_t
3567 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3568 uint64_t phys_birth)
3569 {
3570 vdev_raidz_t *vdrz = vd->vdev_tsd;
3571
3572 /*
3573 * If we're in the middle of a RAIDZ expansion, this block may be in
3574 * the old and/or new location. For simplicity, always resilver it.
3575 */
3576 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3577 return (B_TRUE);
3578
3579 uint64_t dcols = vd->vdev_children;
3580 uint64_t nparity = vdrz->vd_nparity;
3581 uint64_t ashift = vd->vdev_top->vdev_ashift;
3582 /* The starting RAIDZ (parent) vdev sector of the block. */
3583 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3584 /* The zio's size in units of the vdev's minimum sector size. */
3585 uint64_t s = ((psize - 1) >> ashift) + 1;
3586 /* The first column for this stripe. */
3587 uint64_t f = b % dcols;
3588
3589 /* Unreachable by sequential resilver. */
3590 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3591
3592 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3593 return (B_FALSE);
3594
3595 if (s + nparity >= dcols)
3596 return (B_TRUE);
3597
3598 for (uint64_t c = 0; c < s + nparity; c++) {
3599 uint64_t devidx = (f + c) % dcols;
3600 vdev_t *cvd = vd->vdev_child[devidx];
3601
3602 /*
3603 * dsl_scan_need_resilver() already checked vd with
3604 * vdev_dtl_contains(). So here just check cvd with
3605 * vdev_dtl_empty(), cheaper and a good approximation.
3606 */
3607 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3608 return (B_TRUE);
3609 }
3610
3611 return (B_FALSE);
3612 }
3613
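/*
 * Translate a logical (raidz-wide) range into the physical range on a
 * single child. Worked example (hypothetical numbers): width=4,
 * tgt_col=1, logical sectors [0, 8). b_start=0 <= tgt_col gives
 * start_row=0; b_end=8 > tgt_col gives end_row=((8-1-1)/4)+1=2, so the
 * child covers physical sectors [0, 2) -- matching the two logical
 * sectors (b=1 and b=5) that round-robin onto column 1 in that range.
 */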
3614 static void
3615 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3616 range_seg64_t *physical_rs, range_seg64_t *remain_rs)
3617 {
3618 (void) remain_rs;
3619
3620 vdev_t *raidvd = cvd->vdev_parent;
3621 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3622
3623 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3624
3625 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3626 /*
3627 * We're in the middle of expansion, in which case the
3628 * translation is in flux. Any answer we give may be wrong
3629 * by the time we return, so it isn't safe for the caller to
3630 * act on it. Therefore we say that this range isn't present
3631 * on any children. The only consumers of this are "zpool
3632 * initialize" and trimming, both of which are "best effort"
3633 * anyway.
3634 */
3635 physical_rs->rs_start = physical_rs->rs_end = 0;
3636 remain_rs->rs_start = remain_rs->rs_end = 0;
3637 return;
3638 }
3639
3640 uint64_t width = vdrz->vd_physical_width;
3641 uint64_t tgt_col = cvd->vdev_id;
3642 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3643
3644 /* make sure the offsets are block-aligned */
3645 ASSERT0(logical_rs->rs_start % (1 << ashift));
3646 ASSERT0(logical_rs->rs_end % (1 << ashift));
3647 uint64_t b_start = logical_rs->rs_start >> ashift;
3648 uint64_t b_end = logical_rs->rs_end >> ashift;
3649
3650 uint64_t start_row = 0;
3651 if (b_start > tgt_col) /* avoid underflow */
3652 start_row = ((b_start - tgt_col - 1) / width) + 1;
3653
3654 uint64_t end_row = 0;
3655 if (b_end > tgt_col)
3656 end_row = ((b_end - tgt_col - 1) / width) + 1;
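	/*
	 * Worked example (hypothetical numbers): with width = 4 and
	 * tgt_col = 1, this child holds logical blocks 1, 5, 9, 13, ...
	 * For b_start = 6 and b_end = 13, start_row = ((6 - 1 - 1) / 4)
	 * + 1 = 2 and end_row = ((13 - 1 - 1) / 4) + 1 = 3, i.e. only
	 * logical block 9 lands on this child, at physical row 2.
	 */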

	physical_rs->rs_start = start_row << ashift;
	physical_rs->rs_end = end_row << ashift;

	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_rs->rs_end - logical_rs->rs_start);
}

static void
raidz_reflow_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	/*
	 * Ensure there are no i/os to the range that is being committed.
	 */
	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);

	mutex_enter(&vre->vre_lock);
	uint64_t new_offset =
	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
	/*
	 * We should not have committed anything that failed.
	 */
	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
	mutex_exit(&vre->vre_lock);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    old_offset, new_offset - old_offset,
	    RL_WRITER);

	/*
	 * Update the uberblock that will be written when this txg completes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
	vre->vre_offset_pertxg[txgoff] = 0;
	zfs_rangelock_exit(lr);

	mutex_enter(&vre->vre_lock);
	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
	vre->vre_bytes_copied_pertxg[txgoff] = 0;
	mutex_exit(&vre->vre_lock);

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
}

static void
raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;

	for (int i = 0; i < TXG_SIZE; i++)
		VERIFY0(vre->vre_offset_pertxg[i]);

	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
	re->re_logical_width = vdrz->vd_physical_width;
	mutex_enter(&vdrz->vd_expand_lock);
	avl_add(&vdrz->vd_expand_txgs, re);
	mutex_exit(&vdrz->vd_expand_lock);

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/*
	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
	 * will get written (based on vd_expand_txgs).
	 */
	vdev_config_dirty(vd);

	/*
	 * Before we change vre_state, the on-disk state must reflect that we
	 * have completed all copying, so that vdev_raidz_io_start() can use
	 * vre_state to determine if the reflow is in progress. See also the
	 * end of spa_raidz_expand_thread().
	 */
	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);

	vre->vre_end_time = gethrestime_sec();
	vre->vre_state = DSS_FINISHED;

	uint64_t state = vre->vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t end_time = vre->vre_end_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
	    sizeof (end_time), 1, &end_time, tx));

	spa->spa_uberblock.ub_raidz_reflow_info = 0;

	spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)vd->vdev_children);

	spa->spa_raidz_expand = NULL;
	raidvd->vdev_rz_expanding = B_FALSE;

	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	spa_notify_waiters(spa);

	/*
	 * While we're in syncing context take the opportunity to
	 * setup a scrub. All the data has been successfully copied
	 * but we have not validated any checksums.
	 */
	pool_scan_func_t func = POOL_SCAN_SCRUB;
	if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
		dsl_scan_setup_sync(&func, tx);
}

/*
 * Struct for one copy zio.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;
	zfs_locked_range_t *rra_lr;
	uint64_t rra_txg;
} raidz_reflow_arg_t;

/*
 * The write of the new location is done.
 */
static void
raidz_reflow_write_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	abd_free(zio->io_abd);

	mutex_enter(&vre->vre_lock);
	if (zio->io_error != 0) {
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
	}
	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
	vre->vre_outstanding_bytes -= zio->io_size;
	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
	    vre->vre_failed_offset) {
		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
		    zio->io_size;
	}
	cv_signal(&vre->vre_cv);
	mutex_exit(&vre->vre_lock);

	zfs_rangelock_exit(rra->rra_lr);

	kmem_free(rra, sizeof (*rra));
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
}

/*
 * The read of the old location is done. The parent zio is the write to
 * the new location. Allow it to start.
 */
static void
raidz_reflow_read_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/*
	 * If the read failed, or if it was done on a vdev that is not fully
	 * healthy (e.g. a child that has a resilver in progress), we may not
	 * have the correct data. Note that it's OK if the write proceeds.
	 * It may write garbage but the location is otherwise unused and we
	 * will retry later due to vre_failed_offset.
	 */
	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_lr->lr_length,
		    (long long)rra->rra_txg,
		    zio->io_error,
		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
		mutex_enter(&vre->vre_lock);
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		mutex_exit(&vre->vre_lock);
	}

	zio_nowait(zio_unique_parent(zio));
}

static void
raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
    dmu_tx_t *tx)
{
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (offset == 0)
		return;

	mutex_enter(&vre->vre_lock);
	ASSERT3U(vre->vre_offset, <=, offset);
	vre->vre_offset = offset;
	mutex_exit(&vre->vre_lock);

	if (vre->vre_offset_pertxg[txgoff] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
		    spa, tx);
	}
	vre->vre_offset_pertxg[txgoff] = offset;
}

static boolean_t
vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
{
	for (int i = 0; i < raidz_vd->vdev_children; i++) {
		/* Quick check if a child is being replaced */
		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
			return (B_TRUE);
	}
	return (B_FALSE);
}

static boolean_t
raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
    dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;
	int ashift = vd->vdev_top->vdev_ashift;
	uint64_t offset, size;

	if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
	    &offset, &size)) {
		return (B_FALSE);
	}
	ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
	ASSERT3U(size, >=, 1 << ashift);
	uint64_t length = 1 << ashift;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	uint64_t blkid = offset >> ashift;

	int old_children = vd->vdev_children - 1;

	/*
	 * We can only progress to the point that writes will not overlap
	 * with blocks whose progress has not yet been recorded on disk.
	 * Since partially-copied rows are still read from the old location,
	 * we need to stop one row before the sector-wise overlap, to prevent
	 * row-wise overlap.
	 *
	 * Note that even if we are skipping over a large unallocated region,
	 * we can't move the on-disk progress to `offset`, because concurrent
	 * writes/allocations could still use the currently-unallocated
	 * region.
	 */
	uint64_t ubsync_blkid =
	    RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
	uint64_t next_overwrite_blkid = ubsync_blkid +
	    ubsync_blkid / old_children - old_children;
	VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
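	/*
	 * Illustrative example (hypothetical numbers): with
	 * old_children = 4 (5 children after expansion) and
	 * ubsync_blkid = 1000, next_overwrite_blkid = 1000 + 250 - 4 =
	 * 1246. The last copyable block, 1245, is written to
	 * new-layout row 1245 / 5 = 249, a full row before old-layout
	 * row 1000 / 4 = 250, from which uncopied data may still be
	 * read.
	 */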

	if (blkid >= next_overwrite_blkid) {
		raidz_reflow_record_progress(vre,
		    next_overwrite_blkid << ashift, tx);
		return (B_TRUE);
	}

	range_tree_remove(rt, offset, length);

	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
	rra->rra_vre = vre;
	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    offset, length, RL_WRITER);
	rra->rra_txg = dmu_tx_get_txg(tx);

	raidz_reflow_record_progress(vre, offset + length, tx);

	mutex_enter(&vre->vre_lock);
	vre->vre_outstanding_bytes += length;
	mutex_exit(&vre->vre_lock);

	/*
	 * SCL_STATE will be released when the read and write are done,
	 * by raidz_reflow_write_done().
	 */
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/* check if a replacing vdev was added, if so treat it as an error */
	if (vdev_raidz_expand_child_replacing(vd)) {
		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
		    "offset=%llu txg=%llu",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_txg);

		mutex_enter(&vre->vre_lock);
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		cv_signal(&vre->vre_cv);
		mutex_exit(&vre->vre_lock);

		/* drop everything we acquired */
		zfs_rangelock_exit(rra->rra_lr);
		kmem_free(rra, sizeof (*rra));
		spa_config_exit(spa, SCL_STATE, spa);
		return (B_TRUE);
	}

	zio_t *pio = spa->spa_txg_zio[txgoff];
	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
	    vd->vdev_child[blkid % vd->vdev_children],
	    (blkid / vd->vdev_children) << ashift,
	    abd, length,
	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    raidz_reflow_write_done, rra);

	zio_nowait(zio_vdev_child_io(write_zio, NULL,
	    vd->vdev_child[blkid % old_children],
	    (blkid / old_children) << ashift,
	    abd, length,
	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    raidz_reflow_read_done, rra));

	return (B_FALSE);
}

/*
 * For testing (ztest specific)
 */
static void
raidz_expand_pause(uint_t pause_point)
{
	while (raidz_expand_pause_point != 0 &&
	    raidz_expand_pause_point <= pause_point)
		delay(hz);
}

static void
raidz_scratch_child_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);
}

/*
 * Reflow the beginning portion of the vdev into an intermediate scratch area
 * in memory and on disk. This operation must be persisted on disk before we
 * proceed to overwrite the beginning portion with the reflowed data.
 *
 * This multi-step task can fail to complete if disk errors are encountered,
 * and we can return here after a pause (waiting for the disks to become
 * healthy).
 */
static void
raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
{
	vdev_raidz_expand_t *vre = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	zio_t *pio;
	int error;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	int ashift = raidvd->vdev_ashift;
	uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
	uint64_t logical_size = write_size * raidvd->vdev_children;
	uint64_t read_size =
	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
	    1 << ashift);
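	/*
	 * Example with hypothetical numbers, assuming a 3.5 MiB
	 * VDEV_BOOT_SIZE: with 5 children and ashift = 12, write_size
	 * is 3.5 MiB (already 4 KiB aligned), logical_size is
	 * 17.5 MiB, and read_size is 17.5 MiB / 4 = 4.375 MiB from
	 * each of the 4 original children.
	 */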

	/*
	 * The scratch space must be large enough to get us to the point
	 * that one row does not overlap itself when moved. This is checked
	 * by vdev_raidz_attach_check().
	 */
	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
	VERIFY3U(write_size, <=, read_size);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    0, logical_size, RL_WRITER);

	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(read_size, B_FALSE);
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);

	/*
	 * If we have already written the scratch area then we must read from
	 * there, since new writes were redirected there while we were paused
	 * or the original location may have been partially overwritten with
	 * reflowed data.
	 */
	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
		/*
		 * Read from scratch space.
		 */
		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		for (int i = 0; i < raidvd->vdev_children; i++) {
			/*
			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
			 * to the offset to calculate the physical offset to
			 * write to. Passing in a negative offset makes us
			 * access the scratch area.
			 */
			zio_nowait(zio_vdev_child_io(pio, NULL,
			    raidvd->vdev_child[i],
			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
		}
		error = zio_wait(pio);
		if (error != 0) {
			zfs_dbgmsg("reflow: error %d reading scratch location",
			    error);
			goto io_error_exit;
		}
		goto overwrite;
	}

	/*
	 * Read from original location.
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], read_size, ZIO_TYPE_READ,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d reading original location", error);
io_error_exit:
		for (int i = 0; i < raidvd->vdev_children; i++)
			abd_free(abds[i]);
		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
		zfs_rangelock_exit(lr);
		spa_config_exit(spa, SCL_STATE, FTAG);
		return;
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);

	/*
	 * Reflow in memory.
	 */
	uint64_t logical_sectors = logical_size >> ashift;
	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
		int oldchild = i % (raidvd->vdev_children - 1);
		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;

		int newchild = i % raidvd->vdev_children;
		uint64_t newoff = (i / raidvd->vdev_children) << ashift;

		/* a single sector should not be copying over itself */
		ASSERT(!(newchild == oldchild && newoff == oldoff));

		abd_copy_off(abds[newchild], abds[oldchild],
		    newoff, oldoff, 1 << ashift);
	}
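	/*
	 * To illustrate the loop above (hypothetical numbers): with 4
	 * children (3 before expansion), logical sector 3 moves from
	 * old location (child 0, row 1) to new location (child 3,
	 * row 0), and sector 4 moves from (child 1, row 1) to
	 * (child 0, row 1). Iterating forward is safe because the
	 * old-layout sector occupying any destination cell has a
	 * smaller logical index and has already been copied out.
	 */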

	/*
	 * Verify that we filled in everything we intended to (write_size on
	 * each child).
	 */
	VERIFY0(logical_sectors % raidvd->vdev_children);
	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
	    write_size);

	/*
	 * Write to scratch location (boot area).
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d writing scratch location", error);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
	    (long long)logical_size);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);

	/*
	 * Update uberblock to indicate that scratch space is valid. This is
	 * needed because after this point, the real location may be
	 * overwritten. If we crash, we need to get the data from the
	 * scratch space, rather than the real location.
	 *
	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
	 * will prefer this uberblock.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);

	/*
	 * Overwrite with reflow'ed data.
	 */
overwrite:
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		/*
		 * When we exit early here and drop the range lock, new
		 * writes will go into the scratch area so we'll need to
		 * read from there when we return after pausing.
		 */
		zfs_dbgmsg("reflow: error %d writing real location", error);
		/*
		 * Update the uberblock that is written when this txg completes.
		 */
		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
		    logical_size);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
	    (long long)logical_size);
	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);

	/*
	 * Update uberblock to indicate that the initial part has been
	 * reflow'ed. This is needed because after this point (when we exit
	 * the rangelock), we allow regular writes to this region, which will
	 * be written to the new location only (because reflow_offset_next ==
	 * reflow_offset_synced). If we crashed and re-copied from the
	 * scratch space, we would lose the regular writes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
	    logical_size);
	spa->spa_ubsync.ub_timestamp++;
	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);

	/*
	 * Update progress.
	 */
	vre->vre_offset = logical_size;
	zfs_rangelock_exit(lr);
	spa_config_exit(spa, SCL_STATE, FTAG);

	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note - raidz_reflow_sync() will update the uberblock state to
	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW.
	 */
	raidz_reflow_sync(spa, tx);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
}

/*
 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
 */
void
vdev_raidz_reflow_copy_scratch(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	ASSERT0(logical_size % raidvd->vdev_children);
	uint64_t write_size = logical_size / raidvd->vdev_children;

	zio_t *pio;

	/*
	 * Read from scratch space.
	 */
	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(write_size, B_FALSE);
	}

	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_READ,
		    ZIO_PRIORITY_ASYNC_READ, 0,
		    raidz_scratch_child_done, pio));
	}
	zio_wait(pio);

	/*
	 * Overwrite real location with reflow'ed data.
	 */
	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_ASYNC_WRITE, 0,
		    raidz_scratch_child_done, pio));
	}
	zio_wait(pio);
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
	    "to real location", (long long)logical_size);

	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	/*
	 * Update uberblock.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow recovery: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
	    spa_first_txg(spa));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset = logical_size;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note that raidz_reflow_sync() will update the uberblock once more.
	 */
	raidz_reflow_sync(spa, tx);

	dmu_tx_commit(tx);

	spa_config_exit(spa, SCL_STATE, FTAG);
}

static boolean_t
spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	return (spa->spa_raidz_expand != NULL &&
	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
}

/*
 * RAIDZ expansion background thread.
 *
 * Can be called multiple times if the reflow is paused.
 */
static void
spa_raidz_expand_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
		vre->vre_offset = 0;
	else
		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);

	/* Reflow the beginning portion using the scratch area */
	if (vre->vre_offset == 0) {
		VERIFY0(dsl_sync_task(spa_name(spa),
		    NULL, raidz_reflow_scratch_sync,
		    vre, 0, ZFS_SPACE_CHECK_NONE));

		/* if we encountered errors then pause */
		if (vre->vre_offset == 0) {
			mutex_enter(&vre->vre_lock);
			vre->vre_waiting_for_resilver = B_TRUE;
			mutex_exit(&vre->vre_lock);
			return;
		}
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

	uint64_t guid = raidvd->vdev_guid;

	/* Iterate over all the remaining metaslabs */
	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
	    i < raidvd->vdev_ms_count &&
	    !zthr_iscancelled(zthr) &&
	    vre->vre_failed_offset == UINT64_MAX; i++) {
		metaslab_t *msp = raidvd->vdev_ms[i];

		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);

		/*
		 * The metaslab may be newly created (for the expanded
		 * space), in which case its trees won't exist yet,
		 * so we need to bail out early.
		 */
		if (msp->ms_new) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			continue;
		}

		VERIFY0(metaslab_load(msp));

		/*
		 * We want to copy everything except the free (allocatable)
		 * space. Note that there may be a little bit more free
		 * space (e.g. in ms_defer), and it's fine to copy that too.
		 */
		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
		    NULL, 0, 0);
		range_tree_add(rt, msp->ms_start, msp->ms_size);
		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
		mutex_exit(&msp->ms_lock);

		/*
		 * Force the last sector of each metaslab to be copied. This
		 * ensures that we advance the on-disk progress to the end of
		 * this metaslab while the metaslab is disabled. Otherwise, we
		 * could move past this metaslab without advancing the on-disk
		 * progress, and then an allocation to this metaslab would not
		 * be copied.
		 */
		int sectorsz = 1 << raidvd->vdev_ashift;
		uint64_t ms_last_offset = msp->ms_start +
		    msp->ms_size - sectorsz;
		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
			range_tree_add(rt, ms_last_offset, sectorsz);
		}

		/*
		 * When we are resuming from a paused expansion (i.e.
		 * when importing a pool with an expansion in progress),
		 * discard any state that we have already processed.
		 */
		range_tree_clear(rt, 0, vre->vre_offset);

		while (!zthr_iscancelled(zthr) &&
		    !range_tree_is_empty(rt) &&
		    vre->vre_failed_offset == UINT64_MAX) {

			/*
			 * We need to periodically drop the config lock so that
			 * writers can get in. Additionally, we can't wait
			 * for a txg to sync while holding a config lock
			 * (since a waiting writer could cause a 3-way deadlock
			 * with the sync thread, which also gets a config
			 * lock for reader). So we can't hold the config lock
			 * while calling dmu_tx_assign().
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * If requested, pause the reflow when the amount
			 * specified by raidz_expand_max_reflow_bytes is
			 * reached.
			 *
			 * This pause is only used during testing or debugging.
			 */
			while (raidz_expand_max_reflow_bytes != 0 &&
			    raidz_expand_max_reflow_bytes <=
			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
				delay(hz);
			}

			mutex_enter(&vre->vre_lock);
			while (vre->vre_outstanding_bytes >
			    raidz_expand_max_copy_bytes) {
				cv_wait(&vre->vre_cv, &vre->vre_lock);
			}
			mutex_exit(&vre->vre_lock);

			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reacquire the vdev_config lock. Theoretically, the
			 * vdev_t that we're expanding may have changed.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

			boolean_t needsync =
			    raidz_reflow_impl(raidvd, vre, rt, tx);

			dmu_tx_commit(tx);

			if (needsync) {
				spa_config_exit(spa, SCL_CONFIG, FTAG);
				txg_wait_synced(spa->spa_dsl_pool, txg);
				spa_config_enter(spa, SCL_CONFIG, FTAG,
				    RW_READER);
			}
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		metaslab_enable(msp, B_FALSE, B_FALSE);
		range_tree_vacate(rt, NULL, NULL);
		range_tree_destroy(rt);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * The txg_wait_synced() here ensures that all reflow zio's have
	 * completed, and vre_failed_offset has been set if necessary. It
	 * also ensures that the progress of the last raidz_reflow_sync() is
	 * written to disk before raidz_reflow_complete_sync() changes the
	 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
	 * determine if a reflow is in progress, in which case we may need to
	 * write to both old and new locations. Therefore we can only change
	 * vre_state once this is not necessary, which is once the on-disk
	 * progress (in spa_ubsync) has been set past any possible writes (to
	 * the end of the last metaslab).
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	if (!zthr_iscancelled(zthr) &&
	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
		/*
		 * We are not being canceled or paused, so the reflow must be
		 * complete. In that case also mark it as completed on disk.
		 */
		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    raidz_reflow_complete_sync, spa,
		    0, ZFS_SPACE_CHECK_NONE));
		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
	} else {
		/*
		 * Wait for all copy zio's to complete and for all the
		 * raidz_reflow_sync() synctasks to be run.
		 */
		spa_history_log_internal(spa, "reflow pause",
		    NULL, "offset=%llu failed_offset=%lld",
		    (long long)vre->vre_offset,
		    (long long)vre->vre_failed_offset);
		mutex_enter(&vre->vre_lock);
		if (vre->vre_failed_offset != UINT64_MAX) {
			/*
			 * Reset progress so that we will retry everything
			 * after the point that something failed.
			 */
			vre->vre_offset = vre->vre_failed_offset;
			vre->vre_failed_offset = UINT64_MAX;
			vre->vre_waiting_for_resilver = B_TRUE;
		}
		mutex_exit(&vre->vre_lock);
	}
}

void
spa_start_raidz_expansion_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
	    spa, defclsyspri);
}

void
raidz_dtl_reassessed(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	if (spa->spa_raidz_expand != NULL) {
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		/*
		 * We get called often from vdev_dtl_reassess(), so make
		 * sure it's our vdev and any replacing is complete.
		 */
		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
			mutex_enter(&vre->vre_lock);
			if (vre->vre_waiting_for_resilver) {
				vdev_dbgmsg(vd, "DTL reassessed, "
				    "continuing raidz expansion");
				vre->vre_waiting_for_resilver = B_FALSE;
				zthr_wakeup(spa->spa_raidz_expand_zthr);
			}
			mutex_exit(&vre->vre_lock);
		}
	}
}

int
vdev_raidz_attach_check(vdev_t *new_child)
{
	vdev_t *raidvd = new_child->vdev_parent;
	uint64_t new_children = raidvd->vdev_children;

	/*
	 * We use the "boot" space as scratch space to handle overwriting the
	 * initial part of the vdev. If it is too small, then this expansion
	 * is not allowed. This would be very unusual (e.g. ashift > 13 and
	 * >200 children).
	 */
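	/*
	 * For example, assuming a 3.5 MiB VDEV_BOOT_SIZE (illustrative
	 * numbers): at ashift = 14 (16 KiB sectors), the check below
	 * limits an expanded vdev to 3.5 MiB >> 14 = 224 children.
	 */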
	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
		return (EINVAL);
	}
	return (0);
}

void
vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *new_child = arg;
	spa_t *spa = new_child->vdev_spa;
	vdev_t *raidvd = new_child->vdev_parent;
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
	ASSERT3P(raidvd->vdev_top, ==, raidvd);
	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
	    new_child);

	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);

	vdrz->vd_physical_width++;

	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
	vdrz->vn_vre.vre_offset = 0;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	spa->spa_raidz_expand = &vdrz->vn_vre;
	zthr_wakeup(spa->spa_raidz_expand_zthr);

	/*
	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
	 * written to the config.
	 */
	vdev_config_dirty(raidvd);

	vdrz->vn_vre.vre_start_time = gethrestime_sec();
	vdrz->vn_vre.vre_end_time = 0;
	vdrz->vn_vre.vre_state = DSS_SCANNING;
	vdrz->vn_vre.vre_bytes_copied = 0;

	uint64_t state = vdrz->vn_vre.vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t start_time = vdrz->vn_vre.vre_start_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
	    sizeof (start_time), 1, &start_time, tx));

	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);

	spa_history_log_internal(spa, "raidz vdev expansion started", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)raidvd->vdev_id,
	    (unsigned long long)raidvd->vdev_children);
}

int
vdev_raidz_load(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	int err;

	uint64_t state = DSS_NONE;
	uint64_t start_time = 0;
	uint64_t end_time = 0;
	uint64_t bytes_copied = 0;

	if (vd->vdev_top_zap != 0) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
		    sizeof (state), 1, &state);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
		    sizeof (start_time), 1, &start_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
		    sizeof (end_time), 1, &end_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
		    sizeof (bytes_copied), 1, &bytes_copied);
		if (err != 0 && err != ENOENT)
			return (err);
	}

	/*
	 * If we are in the middle of expansion, vre_state should have
	 * already been set by vdev_raidz_init().
	 */
	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
	vdrz->vn_vre.vre_start_time = start_time;
	vdrz->vn_vre.vre_end_time = end_time;
	vdrz->vn_vre.vre_bytes_copied = bytes_copied;

	return (0);
}

int
spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (vre == NULL) {
		/* no removal in progress; find most recent completed */
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
			if (vd->vdev_ops == &vdev_raidz_ops) {
				vdev_raidz_t *vdrz = vd->vdev_tsd;

				if (vdrz->vn_vre.vre_end_time != 0 &&
				    (vre == NULL ||
				    vdrz->vn_vre.vre_end_time >
				    vre->vre_end_time)) {
					vre = &vdrz->vn_vre;
				}
			}
		}
	}

	if (vre == NULL) {
		return (SET_ERROR(ENOENT));
	}

	pres->pres_state = vre->vre_state;
	pres->pres_expanding_vdev = vre->vre_vdev_id;

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;

	mutex_enter(&vre->vre_lock);
	pres->pres_reflowed = vre->vre_bytes_copied;
	for (int i = 0; i < TXG_SIZE; i++)
		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
	mutex_exit(&vre->vre_lock);

	pres->pres_start_time = vre->vre_start_time;
	pres->pres_end_time = vre->vre_end_time;
	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;

	return (0);
}

/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	uint64_t nparity;
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vn_vre.vre_vdev_id = -1;
	vdrz->vn_vre.vre_offset = UINT64_MAX;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

	vdrz->vd_physical_width = children;
	vdrz->vd_nparity = nparity;

	/* note, the ID does not exist when creating a pool */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
	    &vdrz->vn_vre.vre_vdev_id);

	boolean_t reflow_in_progress =
	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	if (reflow_in_progress) {
		spa->spa_raidz_expand = &vdrz->vn_vre;
		vdrz->vn_vre.vre_state = DSS_SCANNING;
	}

	vdrz->vd_original_width = children;
	uint64_t *txgs;
	unsigned int txgs_size = 0;
	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
	    &txgs, &txgs_size);
	if (error == 0) {
		for (int i = 0; i < txgs_size; i++) {
			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
			re->re_txg = txgs[txgs_size - i - 1];
			re->re_logical_width = vdrz->vd_physical_width - i;

			if (reflow_in_progress)
				re->re_logical_width--;

			avl_add(&vdrz->vd_expand_txgs, re);
		}

		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
	}
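	/*
	 * Illustrative example (hypothetical numbers): with
	 * vd_physical_width = 6 and txgs = { 100, 200 }, the tree above
	 * gets (txg 200, width 6) and (txg 100, width 5), and
	 * vd_original_width becomes 4. If a reflow is still in
	 * progress, each width (and, below, the original width) is one
	 * less.
	 */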
	if (reflow_in_progress) {
		vdrz->vd_original_width--;
		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
		    children, txgs_size);
	}

	*tsd = vdrz;

	return (0);
}

static void
vdev_raidz_fini(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
		vd->vdev_spa->spa_raidz_expand = NULL;
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree = &vdrz->vd_expand_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(&vdrz->vd_expand_txgs);
	mutex_destroy(&vdrz->vd_expand_lock);
	mutex_destroy(&vdrz->vn_vre.vre_lock);
	cv_destroy(&vdrz->vn_vre.vre_cv);
	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
	kmem_free(vdrz, sizeof (*vdrz));
}

/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * it.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);

	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	}

	mutex_enter(&vdrz->vd_expand_lock);
	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
		    KM_SLEEP);
		uint64_t i = 0;

		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
			txgs[i++] = re->re_txg;
		}

		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
		    txgs, count);

		kmem_free(txgs, sizeof (uint64_t) * count);
	}
	mutex_exit(&vdrz->vd_expand_lock);
}

static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}

vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"completes");
/* END CSTYLED */