1 /*
2  * Copyright (C) 2013 Andrea Mazzoleni
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 2 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14 
15 #include "internal.h"
16 #include "gf.h"
17 
18 /*
19  * This is a RAID implementation working in the Galois Field GF(2^8) with
20  * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
21  * supporting up to six parity levels.
22  *
23  * For RAID5 and RAID6 it works as as described in the H. Peter Anvin's
24  * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
25  * complete explanation.
26  *
27  * To support triple parity, it was first evaluated and then dropped, an
28  * extension of the same approach, with additional parity coefficients set
29  * as powers of 2^-1, with equations:
30  *
31  * P = sum(Di)
32  * Q = sum(2^i * Di)
33  * R = sum(2^-i * Di) with 0<=i<N
34  *
35  * This approach works well for triple parity and it's very efficient,
36  * because we can implement very fast parallel multiplications and
37  * divisions by 2 in GF(2^8).
38  *
39  * It's also similar at the approach used by ZFS RAIDZ3, with the
40  * difference that ZFS uses powers of 4 instead of 2^-1.
41  *
42  * Unfortunately it doesn't work beyond triple parity, because whatever
43  * value we choose to generate the power coefficients to compute other
44  * parities, the resulting equations are not solvable for some
45  * combinations of missing disks.
46  *
47  * This is expected, because the Vandermonde matrix used to compute the
48  * parity has no guarantee to have all submatrices not singular
49  * [2, Chap 11, Problem 7] and this is a requirement to have
50  * a MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
51  *
52  * To overcome this limitation, we use a Cauchy matrix [3][4] to compute
53  * the parity. A Cauchy matrix has the property to have all the square
54  * submatrices not singular, resulting in always solvable equations,
55  * for any combination of missing disks.
56  *
57  * The problem of this approach is that it requires the use of
58  * generic multiplications, and not only by 2 or 2^-1, potentially
59  * affecting badly the performance.
60  *
61  * Hopefully there is a method to implement parallel multiplications
62  * using SSSE3 or AVX2 instructions [1][5]. Method competitive with the
63  * computation of triple parity using power coefficients.
64  *
65  * Another important property of the Cauchy matrix is that we can setup
66  * the first two rows with coefficients equal at the RAID5 and RAID6 approach
67  * described, resulting in a compatible extension, and requiring SSSE3
68  * or AVX2 instructions only if triple parity or beyond is used.
69  *
70  * The matrix is also adjusted, multiplying each row by a constant factor
71  * to make the first column of all 1, to optimize the computation for
72  * the first disk.
73  *
74  * This results in the matrix A[row,col] defined as:
75  *
76  * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
77  * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
78  * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
79  * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
80  * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
81  * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
82  *
83  * This matrix supports 6 level of parity, one for each row, for up to 251
84  * data disks, one for each column, with all the 377,342,351,231 square
85  * submatrices not singular, verified also with brute-force.
86  *
87  * This matrix can be extended to support any number of parities, just
88  * adding additional rows, and removing one column for each new row.
89  * (see mktables.c for more details in how the matrix is generated)
90  *
91  * In details, parity is computed as:
92  *
93  * P = sum(Di)
94  * Q = sum(2^i *  Di)
95  * R = sum(A[2,i] * Di)
96  * S = sum(A[3,i] * Di)
97  * T = sum(A[4,i] * Di)
98  * U = sum(A[5,i] * Di) with 0<=i<N
99  *
100  * To recover from a failure of six disks at indexes x,y,z,h,v,w,
101  * with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
102  * disks as:
103  *
104  * Pa = sum(Di)
105  * Qa = sum(2^i * Di)
106  * Ra = sum(A[2,i] * Di)
107  * Sa = sum(A[3,i] * Di)
108  * Ta = sum(A[4,i] * Di)
109  * Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
110  *
111  * And if we define:
112  *
113  * Pd = Pa + P
114  * Qd = Qa + Q
115  * Rd = Ra + R
116  * Sd = Sa + S
117  * Td = Ta + T
118  * Ud = Ua + U
119  *
120  * we can sum these two sets of equations, obtaining:
121  *
122  * Pd =          Dx +          Dy +          Dz +          Dh +          Dv +          Dw
123  * Qd =    2^x * Dx +    2^y * Dy +    2^z * Dz +    2^h * Dh +    2^v * Dv +    2^w * Dw
124  * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
125  * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
126  * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
127  * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
128  *
129  * A linear system always solvable because the coefficients matrix is
130  * always not singular due the properties of the matrix A[].
131  *
132  * Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
133  * for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
134  *
135  *             int8   int32   int64    sse2   ssse3    avx2
136  *   gen1             13339   25438   45438           50588
137  *   gen2              4115    6514   21840           32201
138  *   gen3       814                           10154   18613
139  *   gen4       620                            7569   14229
140  *   gen5       496                            5149   10051
141  *   gen6       413                            4239    8190
142  *
143  * Values are in MiB/s of data processed by a single thread, not counting
144  * generated parity.
145  *
146  * You can replicate these results in your machine using the
147  * "raid/test/speedtest.c" program.
148  *
149  * For comparison, the triple parity computation using the power
150  * coefficients "1,2,2^-1" is only a little faster than the one based on
151  * the Cauchy matrix if SSSE3 or AVX2 is present.
152  *
153  *             int8   int32   int64    sse2   ssse3    avx2
154  *   genz              2337    2874   10920           18944
155  *
156  * In conclusion, the use of power coefficients, and specifically powers
157  * of 1,2,2^-1, is the best option to implement triple parity in CPUs
158  * without SSSE3 and AVX2.
159  * But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
160  * matrix is the best option because it provides a fast and general
161  * approach working for any number of parities.
162  *
163  * References:
164  * [1] Anvin, "The mathematics of RAID-6", 2004
165  * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
166  * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
167  * [4] Roth, "Introduction to Coding Theory", 2006
168  * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
169  */
170 
171 /**
172  * Generator matrix currently used.
173  */
174 const uint8_t (*raid_gfgen)[256];
175 
raid_mode(int mode)176 void raid_mode(int mode)
177 {
178 	if (mode == RAID_MODE_VANDERMONDE) {
179 		raid_gen_ptr[2] = raid_genz_ptr;
180 		raid_gfgen = gfvandermonde;
181 	} else {
182 		raid_gen_ptr[2] = raid_gen3_ptr;
183 		raid_gfgen = gfcauchy;
184 	}
185 }
186 
187 /**
188  * Buffer filled with 0 used in recovering.
189  */
190 static void *raid_zero_block;
191 
raid_zero(void * zero)192 void raid_zero(void *zero)
193 {
194 	raid_zero_block = zero;
195 }
196 
197 /*
198  * Forwarders for parity computation.
199  *
200  * These functions compute the parity blocks from the provided data.
201  *
202  * The number of parities to compute is implicit in the position in the
203  * forwarder vector. Position at index #i, computes (#i+1) parities.
204  *
205  * All these functions give the guarantee that parities are written
206  * in order. First parity P, then parity Q, and so on.
207  * This allows to specify the same memory buffer for multiple parities
208  * knowing that you'll get the latest written one.
209  * This characteristic is used by the raid_delta_gen() function to
210  * avoid to damage unused parities in recovering.
211  *
212  * @nd Number of data blocks
213  * @size Size of the blocks pointed by @v. It must be a multiplier of 64.
214  * @v Vector of pointers to the blocks of data and parity.
215  *   It has (@nd + #parities) elements. The starting elements are the blocks
216  *   for data, following with the parity blocks.
217  *   Each block has @size bytes.
218  */
219 void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
220 void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
221 void (*raid_genz_ptr)(int nd, size_t size, void **vv);
222 
raid_gen(int nd,int np,size_t size,void ** v)223 void raid_gen(int nd, int np, size_t size, void **v)
224 {
225 	/* enforce limit on size */
226 	BUG_ON(size % 64 != 0);
227 
228 	/* enforce limit on number of failures */
229 	BUG_ON(np < 1);
230 	BUG_ON(np > RAID_PARITY_MAX);
231 
232 	raid_gen_ptr[np - 1](nd, size, v);
233 }
234 
235 /**
236  * Inverts the square matrix M of size nxn into V.
237  *
238  * This is not a general matrix inversion because we assume the matrix M
239  * to have all the square submatrix not singular.
240  * We use Gauss elimination to invert.
241  *
242  * @M Matrix to invert with @n rows and @n columns.
243  * @V Destination matrix where the result is put.
244  * @n Number of rows and columns of the matrix.
245  */
raid_invert(uint8_t * M,uint8_t * V,int n)246 void raid_invert(uint8_t *M, uint8_t *V, int n)
247 {
248 	int i, j, k;
249 
250 	/* set the identity matrix in V */
251 	for (i = 0; i < n; ++i)
252 		for (j = 0; j < n; ++j)
253 			V[i * n + j] = i == j;
254 
255 	/* for each element in the diagonal */
256 	for (k = 0; k < n; ++k) {
257 		uint8_t f;
258 
259 		/* the diagonal element cannot be 0 because */
260 		/* we are inverting matrices with all the square */
261 		/* submatrices not singular */
262 		BUG_ON(M[k * n + k] == 0);
263 
264 		/* make the diagonal element to be 1 */
265 		f = inv(M[k * n + k]);
266 		for (j = 0; j < n; ++j) {
267 			M[k * n + j] = mul(f, M[k * n + j]);
268 			V[k * n + j] = mul(f, V[k * n + j]);
269 		}
270 
271 		/* make all the elements over and under the diagonal */
272 		/* to be zero */
273 		for (i = 0; i < n; ++i) {
274 			if (i == k)
275 				continue;
276 			f = M[i * n + k];
277 			for (j = 0; j < n; ++j) {
278 				M[i * n + j] ^= mul(f, M[k * n + j]);
279 				V[i * n + j] ^= mul(f, V[k * n + j]);
280 			}
281 		}
282 	}
283 }
284 
285 /**
286  * Computes the parity without the missing data blocks
287  * and store it in the buffers of such data blocks.
288  *
289  * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
290  */
raid_delta_gen(int nr,int * id,int * ip,int nd,size_t size,void ** v)291 void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
292 {
293 	void *p[RAID_PARITY_MAX];
294 	void *pa[RAID_PARITY_MAX];
295 	int i, j;
296 	int np;
297 	void *latest;
298 
299 	/* total number of parities we are going to process */
300 	/* they are both the used and the unused ones */
301 	np = ip[nr - 1] + 1;
302 
303 	/* latest missing data block */
304 	latest = v[id[nr - 1]];
305 
306 	/* setup pointers for delta computation */
307 	for (i = 0, j = 0; i < np; ++i) {
308 		/* keep a copy of the original parity vector */
309 		p[i] = v[nd + i];
310 
311 		if (ip[j] == i) {
312 			/*
313 			 * Set used parities to point to the missing
314 			 * data blocks.
315 			 *
316 			 * The related data blocks are instead set
317 			 * to point to the "zero" buffer.
318 			 */
319 
320 			/* the latest parity to use ends the for loop and */
321 			/* then it cannot happen to process more of them */
322 			BUG_ON(j >= nr);
323 
324 			/* buffer for missing data blocks */
325 			pa[j] = v[id[j]];
326 
327 			/* set at zero the missing data blocks */
328 			v[id[j]] = raid_zero_block;
329 
330 			/* compute the parity over the missing data blocks */
331 			v[nd + i] = pa[j];
332 
333 			/* check for the next used entry */
334 			++j;
335 		} else {
336 			/*
337 			 * Unused parities are going to be rewritten with
338 			 * not significative data, because we don't have
339 			 * functions able to compute only a subset of
340 			 * parities.
341 			 *
342 			 * To avoid this, we reuse parity buffers,
343 			 * assuming that all the parity functions write
344 			 * parities in order.
345 			 *
346 			 * We assign the unused parity block to the same
347 			 * block of the latest used parity that we know it
348 			 * will be written.
349 			 *
350 			 * This means that this block will be written
351 			 * multiple times and only the latest write will
352 			 * contain the correct data.
353 			 */
354 			v[nd + i] = latest;
355 		}
356 	}
357 
358 	/* all the parities have to be processed */
359 	BUG_ON(j != nr);
360 
361 	/* recompute the parity, note that np may be smaller than the */
362 	/* total number of parities available */
363 	raid_gen(nd, np, size, v);
364 
365 	/* restore data buffers as before */
366 	for (j = 0; j < nr; ++j)
367 		v[id[j]] = pa[j];
368 
369 	/* restore parity buffers as before */
370 	for (i = 0; i < np; ++i)
371 		v[nd + i] = p[i];
372 }
373 
374 /**
375  * Recover failure of one data block for PAR1.
376  *
377  * Starting from the equation:
378  *
379  * Pd = Dx
380  *
381  * and solving we get:
382  *
383  * Dx = Pd
384  */
raid_rec1of1(int * id,int nd,size_t size,void ** v)385 void raid_rec1of1(int *id, int nd, size_t size, void **v)
386 {
387 	void *p;
388 	void *pa;
389 
390 	/* for PAR1 we can directly compute the missing block */
391 	/* and we don't need to use the zero buffer */
392 	p = v[nd];
393 	pa = v[id[0]];
394 
395 	/* use the parity as missing data block */
396 	v[id[0]] = p;
397 
398 	/* compute the parity over the missing data block */
399 	v[nd] = pa;
400 
401 	/* compute */
402 	raid_gen(nd, 1, size, v);
403 
404 	/* restore as before */
405 	v[id[0]] = pa;
406 	v[nd] = p;
407 }
408 
409 /**
410  * Recover failure of two data blocks for PAR2.
411  *
412  * Starting from the equations:
413  *
414  * Pd = Dx + Dy
415  * Qd = 2^id[0] * Dx + 2^id[1] * Dy
416  *
417  * and solving we get:
418  *
419  *               1                     2^(-id[0])
420  * Dy = ------------------- * Pd + ------------------- * Qd
421  *      2^(id[1]-id[0]) + 1        2^(id[1]-id[0]) + 1
422  *
423  * Dx = Dy + Pd
424  *
425  * with conditions:
426  *
427  * 2^id[0] != 0
428  * 2^(id[1]-id[0]) + 1 != 0
429  *
430  * That are always satisfied for any 0<=id[0]<id[1]<255.
431  */
raid_rec2of2_int8(int * id,int * ip,int nd,size_t size,void ** vv)432 void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
433 {
434 	uint8_t **v = (uint8_t **)vv;
435 	size_t i;
436 	uint8_t *p;
437 	uint8_t *pa;
438 	uint8_t *q;
439 	uint8_t *qa;
440 	const uint8_t *T[2];
441 
442 	/* get multiplication tables */
443 	T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
444 	T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));
445 
446 	/* compute delta parity */
447 	raid_delta_gen(2, id, ip, nd, size, vv);
448 
449 	p = v[nd];
450 	q = v[nd + 1];
451 	pa = v[id[0]];
452 	qa = v[id[1]];
453 
454 	for (i = 0; i < size; ++i) {
455 		/* delta */
456 		uint8_t Pd = p[i] ^ pa[i];
457 		uint8_t Qd = q[i] ^ qa[i];
458 
459 		/* reconstruct */
460 		uint8_t Dy = T[0][Pd] ^ T[1][Qd];
461 		uint8_t Dx = Pd ^ Dy;
462 
463 		/* set */
464 		pa[i] = Dx;
465 		qa[i] = Dy;
466 	}
467 }
468 
469 /*
470  * Forwarders for data recovery.
471  *
472  * These functions recover data blocks using the specified parity
473  * to recompute the missing data.
474  *
475  * Note that the format of vectors @id/@ip is different than raid_rec().
476  * For example, in the vector @ip the first parity is represented with the
477  * value 0 and not @nd.
478  *
479  * @nr Number of failed data blocks to recover.
480  * @id[] Vector of @nr indexes of the data blocks to recover.
481  *   The indexes start from 0. They must be in order.
482  * @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
483  *   The indexes start from 0. They must be in order.
484  * @nd Number of data blocks.
485  * @np Number of parity blocks.
486  * @size Size of the blocks pointed by @v. It must be a multiplier of 64.
487  * @v Vector of pointers to the blocks of data and parity.
488  *   It has (@nd + @np) elements. The starting elements are the blocks
489  *   for data, following with the parity blocks.
490  *   Each block has @size bytes.
491  */
492 void (*raid_rec_ptr[RAID_PARITY_MAX])(
493 	int nr, int *id, int *ip, int nd, size_t size, void **vv);
494 
raid_rec(int nr,int * ir,int nd,int np,size_t size,void ** v)495 void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
496 {
497 	int nrd; /* number of data blocks to recover */
498 	int nrp; /* number of parity blocks to recover */
499 
500 	/* enforce limit on size */
501 	BUG_ON(size % 64 != 0);
502 
503 	/* enforce limit on number of failures */
504 	BUG_ON(nr > np);
505 	BUG_ON(np > RAID_PARITY_MAX);
506 
507 	/* enforce order in index vector */
508 	BUG_ON(nr >= 2 && ir[0] >= ir[1]);
509 	BUG_ON(nr >= 3 && ir[1] >= ir[2]);
510 	BUG_ON(nr >= 4 && ir[2] >= ir[3]);
511 	BUG_ON(nr >= 5 && ir[3] >= ir[4]);
512 	BUG_ON(nr >= 6 && ir[4] >= ir[5]);
513 
514 	/* enforce limit on index vector */
515 	BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
516 
517 	/* count the number of data blocks to recover */
518 	nrd = 0;
519 	while (nrd < nr && ir[nrd] < nd)
520 		++nrd;
521 
522 	/* all the remaining are parity */
523 	nrp = nr - nrd;
524 
525 	/* enforce limit on number of failures */
526 	BUG_ON(nrd > nd);
527 	BUG_ON(nrp > np);
528 
529 	/* if failed data is present */
530 	if (nrd != 0) {
531 		int ip[RAID_PARITY_MAX];
532 		int i, j, k;
533 
534 		/* setup the vector of parities to use */
535 		for (i = 0, j = 0, k = 0; i < np; ++i) {
536 			if (j < nrp && ir[nrd + j] == nd + i) {
537 				/* this parity has to be recovered */
538 				++j;
539 			} else {
540 				/* this parity is used for recovering */
541 				ip[k] = i;
542 				++k;
543 			}
544 		}
545 
546 		/* recover the nrd data blocks specified in ir[], */
547 		/* using the first nrd parity in ip[] for recovering */
548 		raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
549 	}
550 
551 	/* recompute all the parities up to the last bad one */
552 	if (nrp != 0)
553 		raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
554 }
555 
raid_data(int nr,int * id,int * ip,int nd,size_t size,void ** v)556 void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
557 {
558 	/* enforce limit on size */
559 	BUG_ON(size % 64 != 0);
560 
561 	/* enforce limit on number of failures */
562 	BUG_ON(nr > nd);
563 	BUG_ON(nr > RAID_PARITY_MAX);
564 
565 	/* enforce order in index vector for data */
566 	BUG_ON(nr >= 2 && id[0] >= id[1]);
567 	BUG_ON(nr >= 3 && id[1] >= id[2]);
568 	BUG_ON(nr >= 4 && id[2] >= id[3]);
569 	BUG_ON(nr >= 5 && id[3] >= id[4]);
570 	BUG_ON(nr >= 6 && id[4] >= id[5]);
571 
572 	/* enforce limit on index vector for data */
573 	BUG_ON(nr > 0 && id[nr-1] >= nd);
574 
575 	/* enforce order in index vector for parity */
576 	BUG_ON(nr >= 2 && ip[0] >= ip[1]);
577 	BUG_ON(nr >= 3 && ip[1] >= ip[2]);
578 	BUG_ON(nr >= 4 && ip[2] >= ip[3]);
579 	BUG_ON(nr >= 5 && ip[3] >= ip[4]);
580 	BUG_ON(nr >= 6 && ip[4] >= ip[5]);
581 
582 	/* if failed data is present */
583 	if (nr != 0)
584 		raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
585 }
586 
587