xref: /netbsd/sys/dev/raidframe/rf_pq.c (revision 9272c734)
1 /*	$NetBSD: rf_pq.c,v 1.17 2019/10/10 03:43:59 christos Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Daniel Stodolsky
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * Code for RAID level 6 (P + Q) disk array architecture.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_pq.c,v 1.17 2019/10/10 03:43:59 christos Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0)
39 
40 #include <dev/raidframe/raidframevar.h>
41 
42 #include "rf_raid.h"
43 #include "rf_dag.h"
44 #include "rf_dagffrd.h"
45 #include "rf_dagffwr.h"
46 #include "rf_dagdegrd.h"
47 #include "rf_dagdegwr.h"
48 #include "rf_dagutils.h"
49 #include "rf_dagfuncs.h"
50 #include "rf_etimer.h"
51 #include "rf_pqdeg.h"
52 #include "rf_general.h"
53 #include "rf_map.h"
54 #include "rf_pq.h"
55 
56 RF_RedFuncs_t rf_pFuncs = {rf_RegularONPFunc, "Regular Old-New P", rf_SimpleONPFunc, "Simple Old-New P"};
57 RF_RedFuncs_t rf_pRecoveryFuncs = {rf_RecoveryPFunc, "Recovery P Func", rf_RecoveryPFunc, "Recovery P Func"};
58 
59 void
rf_RegularONPFunc(RF_DagNode_t * node)60 rf_RegularONPFunc(RF_DagNode_t *node)
61 {
62 	rf_RegularXorFunc(node);
63 }
64 /*
65    same as simpleONQ func, but the coefficient is always 1
66 */
67 
68 void
rf_SimpleONPFunc(RF_DagNode_t * node)69 rf_SimpleONPFunc(RF_DagNode_t *node)
70 {
71 	rf_SimpleXorFunc(node);
72 }
73 
74 void
rf_RecoveryPFunc(RF_DagNode_t * node)75 rf_RecoveryPFunc(RF_DagNode_t *node)
76 {
77 	rf_RecoveryXorFunc(node);
78 }
79 
80 void
rf_RegularPFunc(RF_DagNode_t * node)81 rf_RegularPFunc(RF_DagNode_t *node)
82 {
83 	rf_RegularXorFunc(node);
84 }
85 #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) || (RF_INCLUDE_EVENODD > 0) */
86 #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
87 
88 static void
89 QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
90     unsigned char coeff);
91 static void
92 rf_InvertQ(unsigned long *qbuf, unsigned long *abuf,
93     unsigned length, unsigned coeff);
94 
95 RF_RedFuncs_t rf_qFuncs = {rf_RegularONQFunc, "Regular Old-New Q", rf_SimpleONQFunc, "Simple Old-New Q"};
96 RF_RedFuncs_t rf_qRecoveryFuncs = {rf_RecoveryQFunc, "Recovery Q Func", rf_RecoveryQFunc, "Recovery Q Func"};
97 RF_RedFuncs_t rf_pqRecoveryFuncs = {rf_RecoveryPQFunc, "Recovery PQ Func", rf_RecoveryPQFunc, "Recovery PQ Func"};
98 
99 void
rf_PQDagSelect(RF_Raid_t * raidPtr,RF_IoType_t type,RF_AccessStripeMap_t * asmap,RF_VoidFuncPtr * createFunc)100 rf_PQDagSelect(
101     RF_Raid_t * raidPtr,
102     RF_IoType_t type,
103     RF_AccessStripeMap_t * asmap,
104     RF_VoidFuncPtr * createFunc)
105 {
106 	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
107 	unsigned ndfail = asmap->numDataFailed;
108 	unsigned npfail = asmap->numParityFailed;
109 	unsigned ntfail = npfail + ndfail;
110 
111 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
112 	if (ntfail > 2) {
113 		RF_ERRORMSG("more than two disks failed in a single group!  Aborting I/O operation.\n");
114 		*createFunc = NULL;
115 		return;
116 	}
117 	/* ok, we can do this I/O */
118 	if (type == RF_IO_TYPE_READ) {
119 		switch (ndfail) {
120 		case 0:
121 			/* fault free read */
122 			*createFunc = (RF_VoidFuncPtr) rf_CreateFaultFreeReadDAG;	/* same as raid 5 */
123 			break;
124 		case 1:
125 			/* lost a single data unit */
126 			/* two cases: (1) parity is not lost. do a normal raid
127 			 * 5 reconstruct read. (2) parity is lost. do a
128 			 * reconstruct read using "q". */
129 			if (ntfail == 2) {	/* also lost redundancy */
130 				if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY)
131 					*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateReadDAG;
132 				else
133 					*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateReadDAG;
134 			} else {
135 				/* P and Q are ok. But is there a failure in
136 				 * some unaccessed data unit? */
137 				if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
138 					*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
139 				else
140 					*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateReadDAG;
141 			}
142 			break;
143 		case 2:
144 			/* lost two data units */
145 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
146 			break;
147 		}
148 		return;
149 	}
150 	/* a write */
151 	switch (ntfail) {
152 	case 0:		/* fault free */
153 		if (rf_suppressLocksAndLargeWrites ||
154 		    (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) && (layoutPtr->numDataCol != 1)) ||
155 			(asmap->parityInfo->next != NULL) || (asmap->qInfo->next != NULL) || rf_CheckStripeForFailures(raidPtr, asmap))) {
156 
157 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
158 		} else {
159 			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
160 		}
161 		break;
162 
163 	case 1:		/* single disk fault */
164 		if (npfail == 1) {
165 			RF_ASSERT((asmap->failedPDAs[0]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
166 			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {	/* q died, treat like
167 										 * normal mode raid5
168 										 * write. */
169 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
170 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
171 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateSmallWriteDAG;
172 				else
173 					*createFunc = (RF_VoidFuncPtr) rf_PQ_001_CreateLargeWriteDAG;
174 			} else {/* parity died, small write only updating Q */
175 				if (((asmap->numStripeUnitsAccessed <= (layoutPtr->numDataCol / 2)) || (asmap->numStripeUnitsAccessed == 1))
176 				    || rf_NumFailedDataUnitsInStripe(raidPtr, asmap))
177 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateSmallWriteDAG;
178 				else
179 					*createFunc = (RF_VoidFuncPtr) rf_PQ_010_CreateLargeWriteDAG;
180 			}
181 		} else {	/* data missing. Do a P reconstruct write if
182 				 * only a single data unit is lost in the
183 				 * stripe, otherwise a PQ reconstruct write. */
184 			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
185 				*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
186 			else
187 				*createFunc = (RF_VoidFuncPtr) rf_PQ_100_CreateWriteDAG;
188 		}
189 		break;
190 
191 	case 2:		/* two disk faults */
192 		switch (npfail) {
193 		case 2:	/* both p and q dead */
194 			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
195 			break;
196 		case 1:	/* either p or q and dead data */
197 			RF_ASSERT(asmap->failedPDAs[0]->type == RF_PDA_TYPE_DATA);
198 			RF_ASSERT((asmap->failedPDAs[1]->type == RF_PDA_TYPE_PARITY) || (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q));
199 			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
200 				*createFunc = (RF_VoidFuncPtr) rf_PQ_101_CreateWriteDAG;
201 			else
202 				*createFunc = (RF_VoidFuncPtr) rf_PQ_110_CreateWriteDAG;
203 			break;
204 		case 0:	/* double data loss */
205 			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
206 			break;
207 		}
208 		break;
209 
210 	default:		/* more than 2 disk faults */
211 		*createFunc = NULL;
212 		RF_PANIC();
213 	}
214 	return;
215 }
216 /*
217    Used as a stop gap info function
218 */
219 #if 0
220 static void
221 PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
222 {
223 	*nSucc = *nAnte = 1;
224 }
225 
226 static void
227 PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte, RF_AccessStripeMap_t *asmap)
228 {
229 	*nSucc = 1;
230 	*nAnte = 2;
231 }
232 #endif
233 
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)234 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
235 {
236 	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, 2,
237 	    rf_RegularPQFunc, RF_FALSE);
238 }
239 
240 int
rf_RegularONQFunc(RF_DagNode_t * node)241 rf_RegularONQFunc(RF_DagNode_t *node)
242 {
243 	int     np = node->numParams;
244 	int     d;
245 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
246 	int     i;
247 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
248 	RF_Etimer_t timer;
249 	char   *qbuf, *qpbuf;
250 	char   *obuf, *nbuf;
251 	RF_PhysDiskAddr_t *old, *new;
252 	unsigned long coeff;
253 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
254 
255 	RF_ETIMER_START(timer);
256 
257 	d = (np - 3) / 4;
258 	RF_ASSERT(4 * d + 3 == np);
259 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
260 	for (i = 0; i < d; i++) {
261 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
262 		obuf = (char *) node->params[2 * i + 1].p;
263 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
264 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
265 		RF_ASSERT(new->numSector == old->numSector);
266 		RF_ASSERT(new->raidAddress == old->raidAddress);
267 		/* the stripe unit within the stripe tells us the coefficient
268 		 * to use for the multiply. */
269 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
270 		/* compute the data unit offset within the column, then add
271 		 * one */
272 		coeff = (coeff % raidPtr->Layout.numDataCol);
273 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
274 		QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
275 	}
276 
277 	RF_ETIMER_STOP(timer);
278 	RF_ETIMER_EVAL(timer);
279 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
280 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
281 					 * I/O in this node */
282 	return (0);
283 }
/*
   See the SimpleXORFunc for the difference between a simple and regular func.
   These Q functions should be used for

         new q = Q(data, old data, old q)

   style updates and not for

         q = ( new data, new data, .... )

   computations.

   The simple q takes 2(2d+1)+1 params, where d is the number
   of data units written. The order of params is
   [0]    old data pda_0, old data buffer_0, ... old data pda_(d-1), old data buffer_(d-1)
   [2d]   old q pda, old q buffer
   [2d+2] new data pda_0, new data buffer_0, ... new data pda_(d-1), new data buffer_(d-1)
   [4d+2] raidPtr
*/
303 
304 int
rf_SimpleONQFunc(RF_DagNode_t * node)305 rf_SimpleONQFunc(RF_DagNode_t *node)
306 {
307 	int     np = node->numParams;
308 	int     d;
309 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
310 	int     i;
311 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
312 	RF_Etimer_t timer;
313 	char   *qbuf;
314 	char   *obuf, *nbuf;
315 	RF_PhysDiskAddr_t *old, *new;
316 	unsigned long coeff;
317 
318 	RF_ETIMER_START(timer);
319 
320 	d = (np - 3) / 4;
321 	RF_ASSERT(4 * d + 3 == np);
322 	qbuf = (char *) node->params[2 * d + 1].p;	/* q buffer */
323 	for (i = 0; i < d; i++) {
324 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
325 		obuf = (char *) node->params[2 * i + 1].p;
326 		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
327 		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
328 		RF_ASSERT(new->numSector == old->numSector);
329 		RF_ASSERT(new->raidAddress == old->raidAddress);
330 		/* the stripe unit within the stripe tells us the coefficient
331 		 * to use for the multiply. */
332 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), new->raidAddress);
333 		/* compute the data unit offset within the column, then add
334 		 * one */
335 		coeff = (coeff % raidPtr->Layout.numDataCol);
336 		QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
337 	}
338 
339 	RF_ETIMER_STOP(timer);
340 	RF_ETIMER_EVAL(timer);
341 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
342 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
343 					 * I/O in this node */
344 	return (0);
345 }
RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)346 RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
347 {
348 	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags, allocList, &rf_pFuncs, &rf_qFuncs);
349 }
350 
351 static void RegularQSubr(RF_DagNode_t *node, char   *qbuf);
352 
353 static void
RegularQSubr(RF_DagNode_t * node,char * qbuf)354 RegularQSubr(RF_DagNode_t *node, char *qbuf)
355 {
356 	int     np = node->numParams;
357 	int     d;
358 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
359 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
360 	int     i;
361 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
362 	RF_Etimer_t timer;
363 	char   *obuf, *qpbuf;
364 	RF_PhysDiskAddr_t *old;
365 	unsigned long coeff;
366 
367 	RF_ETIMER_START(timer);
368 
369 	d = (np - 1) / 2;
370 	RF_ASSERT(2 * d + 1 == np);
371 	for (i = 0; i < d; i++) {
372 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
373 		obuf = (char *) node->params[2 * i + 1].p;
374 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
375 		/* compute the data unit offset within the column, then add
376 		 * one */
377 		coeff = (coeff % raidPtr->Layout.numDataCol);
378 		/* the input buffers may not all be aligned with the start of
379 		 * the stripe. so shift by their sector offset within the
380 		 * stripe unit */
381 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, old->startSector % secPerSU);
382 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
383 	}
384 
385 	RF_ETIMER_STOP(timer);
386 	RF_ETIMER_EVAL(timer);
387 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
388 }
389 /*
390    used in degraded writes.
391 */
392 
393 static void DegrQSubr(RF_DagNode_t *node);
394 
395 static void
DegrQSubr(RF_DagNode_t * node)396 DegrQSubr(RF_DagNode_t *node)
397 {
398 	int     np = node->numParams;
399 	int     d;
400 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
401 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
402 	int     i;
403 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
404 	RF_Etimer_t timer;
405 	char   *qbuf = node->results[1];
406 	char   *obuf, *qpbuf;
407 	RF_PhysDiskAddr_t *old;
408 	unsigned long coeff;
409 	unsigned fail_start;
410 	int     j;
411 
412 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
413 	fail_start = old->startSector % secPerSU;
414 
415 	RF_ETIMER_START(timer);
416 
417 	d = (np - 2) / 2;
418 	RF_ASSERT(2 * d + 2 == np);
419 	for (i = 0; i < d; i++) {
420 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
421 		obuf = (char *) node->params[2 * i + 1].p;
422 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
423 		/* compute the data unit offset within the column, then add
424 		 * one */
425 		coeff = (coeff % raidPtr->Layout.numDataCol);
426 		/* the input buffers may not all be aligned with the start of
427 		 * the stripe. so shift by their sector offset within the
428 		 * stripe unit */
429 		j = old->startSector % secPerSU;
430 		RF_ASSERT(j >= fail_start);
431 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
432 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
433 	}
434 
435 	RF_ETIMER_STOP(timer);
436 	RF_ETIMER_EVAL(timer);
437 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
438 }
/*
   Called by large write code to compute the new parity and the new q.

   structure of the params:

   pda_0, buffer_0, pda_1, buffer_1, ... , pda_(d-1), buffer_(d-1)
   (d = numDataCol), raidPtr

   for a total of 2d+1 arguments.
   The result buffers results[0], results[1] are the buffers for the p and q,
   respectively.

   We compute Q first, then compute P. The P calculation may try to reuse
   one of the input buffers for its output, so if we computed P first, we would
   corrupt the input for the q calculation.
*/
455 
456 int
rf_RegularPQFunc(RF_DagNode_t * node)457 rf_RegularPQFunc(RF_DagNode_t *node)
458 {
459 	RegularQSubr(node, node->results[1]);
460 	return (rf_RegularXorFunc(node));	/* does the wakeup */
461 }
462 
463 int
rf_RegularQFunc(RF_DagNode_t * node)464 rf_RegularQFunc(RF_DagNode_t *node)
465 {
466 	/* Almost ... adjust Qsubr args */
467 	RegularQSubr(node, node->results[0]);
468 	rf_GenericWakeupFunc(node, 0);	/* call wake func explicitly since no
469 					 * I/O in this node */
470 	return (0);
471 }
472 /*
473    Called by singly degraded write code to compute the new parity and the new q.
474 
475    structure of the params:
476 
477    pda_0, buffer_0, pda_1 , buffer_1, ... , pda_d, buffer_d
478    failedPDA raidPtr
479 
480    for a total of 2d+2 arguments.
481    The result buffers results[0], results[1] are the buffers for the parity and q,
482    respectively.
483 
484    We compute Q first, then compute parity. The parity calculation may try to reuse
485    one of the input buffers for its output, so if we computed parity first, we would
486    corrupt the input for the q calculation.
487 
488    We treat this identically to the regularPQ case, ignoring the failedPDA extra argument.
489 */
490 
491 void
rf_Degraded_100_PQFunc(RF_DagNode_t * node)492 rf_Degraded_100_PQFunc(RF_DagNode_t *node)
493 {
494 	int     np = node->numParams;
495 
496 	RF_ASSERT(np >= 2);
497 	DegrQSubr(node);
498 	rf_RecoveryXorFunc(node);
499 }
500 
501 
502 /*
503    The two below are used when reading a stripe with a single lost data unit.
504    The parameters are
505 
506    pda_0, buffer_0, .... pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
507 
508    and results[0] contains the data buffer. Which is originally zero-filled.
509 
510 */
511 
512 /* this Q func is used by the degraded-mode dag functions to recover lost data.
513  * the second-to-last parameter is the PDA for the failed portion of the access.
514  * the code here looks at this PDA and assumes that the xor target buffer is
515  * equal in size to the number of sectors in the failed PDA.  It then uses
516  * the other PDAs in the parameter list to determine where within the target
517  * buffer the corresponding data should be xored.
518  *
519  * Recall the basic equation is
520  *
521  *     Q = ( data_1 + 2 * data_2 ... + k * data_k  ) mod 256
522  *
523  * so to recover data_j we need
524  *
525  *    J data_j = (Q - data_1 - 2 data_2 ....- k* data_k) mod 256
526  *
527  * So the coefficient for each buffer is (255 - data_col), and j should be initialized by
528  * copying Q into it. Then we need to do a table lookup to convert to solve
529  *   data_j /= J
530  *
531  *
532  */
533 int
rf_RecoveryQFunc(RF_DagNode_t * node)534 rf_RecoveryQFunc(RF_DagNode_t *node)
535 {
536 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
537 	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
538 	RF_PhysDiskAddr_t *failedPDA = (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
539 	int     i;
540 	RF_PhysDiskAddr_t *pda;
541 	RF_RaidAddr_t suoffset, failedSUOffset = rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
542 	char   *srcbuf, *destbuf;
543 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
544 	RF_Etimer_t timer;
545 	unsigned long coeff;
546 
547 	RF_ETIMER_START(timer);
548 	/* start by copying Q into the buffer */
549 	memcpy(node->results[0], node->params[node->numParams - 3].p,
550 	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
551 	for (i = 0; i < node->numParams - 4; i += 2) {
552 		RF_ASSERT(node->params[i + 1].p != node->results[0]);
553 		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
554 		srcbuf = (char *) node->params[i + 1].p;
555 		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
556 		destbuf = ((char *) node->results[0]) + rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
557 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress);
558 		/* compute the data unit offset within the column */
559 		coeff = (coeff % raidPtr->Layout.numDataCol);
560 		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
561 	}
562 	/* Do the nasty inversion now */
563 	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), failedPDA->startSector) % raidPtr->Layout.numDataCol);
564 	rf_InvertQ(node->results[0], node->results[0], rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
565 	RF_ETIMER_STOP(timer);
566 	RF_ETIMER_EVAL(timer);
567 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
568 	rf_GenericWakeupFunc(node, 0);
569 	return (0);
570 }
571 
572 int
rf_RecoveryPQFunc(RF_DagNode_t * node)573 rf_RecoveryPQFunc(RF_DagNode_t *node)
574 {
575 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
576 	printf("raid%d: Recovery from PQ not implemented.\n",raidPtr->raidid);
577 	return (1);
578 }
579 /*
580    Degraded write Q subroutine.
581    Used when P is dead.
582    Large-write style Q computation.
583    Parameters
584 
585    (pda,buf),(pda,buf),.....,(failedPDA,bufPtr),failedPDA,raidPtr.
586 
587    We ignore failedPDA.
588 
589    This is a "simple style" recovery func.
590 */
591 
592 void
rf_PQ_DegradedWriteQFunc(RF_DagNode_t * node)593 rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
594 {
595 	int     np = node->numParams;
596 	int     d;
597 	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
598 	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
599 	int     i;
600 	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
601 	RF_Etimer_t timer;
602 	char   *qbuf = node->results[0];
603 	char   *obuf, *qpbuf;
604 	RF_PhysDiskAddr_t *old;
605 	unsigned long coeff;
606 	int     fail_start, j;
607 
608 	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
609 	fail_start = old->startSector % secPerSU;
610 
611 	RF_ETIMER_START(timer);
612 
613 	d = (np - 2) / 2;
614 	RF_ASSERT(2 * d + 2 == np);
615 
616 	for (i = 0; i < d; i++) {
617 		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
618 		obuf = (char *) node->params[2 * i + 1].p;
619 		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), old->raidAddress);
620 		/* compute the data unit offset within the column, then add
621 		 * one */
622 		coeff = (coeff % raidPtr->Layout.numDataCol);
623 		j = old->startSector % secPerSU;
624 		RF_ASSERT(j >= fail_start);
625 		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
626 		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf, rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
627 	}
628 
629 	RF_ETIMER_STOP(timer);
630 	RF_ETIMER_EVAL(timer);
631 	tracerec->q_us += RF_ETIMER_VAL_US(timer);
632 	rf_GenericWakeupFunc(node, 0);
633 }
634 
635 
636 
637 
638 /* Q computations */
639 
/*
   coeff - column;

   compute  dest ^= qfor[28-coeff][rn[coeff+1] a]

   on 5-bit basis;
   length in bytes;
*/
648 
649 void
rf_IncQ(unsigned long * dest,unsigned long * buf,unsigned length,unsigned coeff)650 rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length, unsigned coeff)
651 {
652 	unsigned long a, d, new;
653 	unsigned long a1, a2;
654 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
655 	unsigned r = rf_rn[coeff + 1];
656 
657 #define EXTRACT(a,i) ((a >> (5L*i)) & 0x1f)
658 #define INSERT(a,i) (a << (5L*i))
659 
660 	length /= 8;
661 	/* 13 5 bit quants in a 64 bit word */
662 	while (length) {
663 		a = *buf++;
664 		d = *dest;
665 		a1 = EXTRACT(a, 0) ^ r;
666 		a2 = EXTRACT(a, 1) ^ r;
667 		new = INSERT(a2, 1) | a1;
668 		a1 = EXTRACT(a, 2) ^ r;
669 		a2 = EXTRACT(a, 3) ^ r;
670 		a1 = q[a1];
671 		a2 = q[a2];
672 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
673 		a1 = EXTRACT(a, 4) ^ r;
674 		a2 = EXTRACT(a, 5) ^ r;
675 		a1 = q[a1];
676 		a2 = q[a2];
677 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
678 		a1 = EXTRACT(a, 5) ^ r;
679 		a2 = EXTRACT(a, 6) ^ r;
680 		a1 = q[a1];
681 		a2 = q[a2];
682 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
683 #if RF_LONGSHIFT > 2
684 		a1 = EXTRACT(a, 7) ^ r;
685 		a2 = EXTRACT(a, 8) ^ r;
686 		a1 = q[a1];
687 		a2 = q[a2];
688 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
689 		a1 = EXTRACT(a, 9) ^ r;
690 		a2 = EXTRACT(a, 10) ^ r;
691 		a1 = q[a1];
692 		a2 = q[a2];
693 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
694 		a1 = EXTRACT(a, 11) ^ r;
695 		a2 = EXTRACT(a, 12) ^ r;
696 		a1 = q[a1];
697 		a2 = q[a2];
698 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
699 #endif				/* RF_LONGSHIFT > 2 */
700 		d ^= new;
701 		*dest++ = d;
702 		length--;
703 	}
704 }
705 /*
706    compute
707 
708    dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new) ]
709 
710    on a five bit basis.
711    optimization: compute old ^ new on 64 bit basis.
712 
713    length in bytes.
714 */
715 
716 static void
QDelta(char * dest,char * obuf,char * nbuf,unsigned length,unsigned char coeff)717 QDelta(
718     char *dest,
719     char *obuf,
720     char *nbuf,
721     unsigned length,
722     unsigned char coeff)
723 {
724 	unsigned long a, d, new;
725 	unsigned long a1, a2;
726 	unsigned int *q = &(rf_qfor[28 - coeff][0]);
727 	unsigned int r = rf_rn[coeff + 1];
728 
729 	r = a1 = a2 = new = d = a = 0; /* XXX for now... */
730 	q = NULL; /* XXX for now */
731 
732 #ifdef _KERNEL
733 	/* PQ in kernel currently not supported because the encoding/decoding
734 	 * table is not present */
735 	memset(dest, 0, length);
736 #else				/* KERNEL */
737 	/* this code probably doesn't work and should be rewritten  -wvcii */
738 	/* 13 5 bit quants in a 64 bit word */
739 	length /= 8;
740 	while (length) {
741 		a = *obuf++;	/* XXX need to reorg to avoid cache conflicts */
742 		a ^= *nbuf++;
743 		d = *dest;
744 		a1 = EXTRACT(a, 0) ^ r;
745 		a2 = EXTRACT(a, 1) ^ r;
746 		a1 = q[a1];
747 		a2 = q[a2];
748 		new = INSERT(a2, 1) | a1;
749 		a1 = EXTRACT(a, 2) ^ r;
750 		a2 = EXTRACT(a, 3) ^ r;
751 		a1 = q[a1];
752 		a2 = q[a2];
753 		new = new | INSERT(a1, 2) | INSERT(a2, 3);
754 		a1 = EXTRACT(a, 4) ^ r;
755 		a2 = EXTRACT(a, 5) ^ r;
756 		a1 = q[a1];
757 		a2 = q[a2];
758 		new = new | INSERT(a1, 4) | INSERT(a2, 5);
759 		a1 = EXTRACT(a, 5) ^ r;
760 		a2 = EXTRACT(a, 6) ^ r;
761 		a1 = q[a1];
762 		a2 = q[a2];
763 		new = new | INSERT(a1, 5) | INSERT(a2, 6);
764 #if RF_LONGSHIFT > 2
765 		a1 = EXTRACT(a, 7) ^ r;
766 		a2 = EXTRACT(a, 8) ^ r;
767 		a1 = q[a1];
768 		a2 = q[a2];
769 		new = new | INSERT(a1, 7) | INSERT(a2, 8);
770 		a1 = EXTRACT(a, 9) ^ r;
771 		a2 = EXTRACT(a, 10) ^ r;
772 		a1 = q[a1];
773 		a2 = q[a2];
774 		new = new | INSERT(a1, 9) | INSERT(a2, 10);
775 		a1 = EXTRACT(a, 11) ^ r;
776 		a2 = EXTRACT(a, 12) ^ r;
777 		a1 = q[a1];
778 		a2 = q[a2];
779 		new = new | INSERT(a1, 11) | INSERT(a2, 12);
780 #endif				/* RF_LONGSHIFT > 2 */
781 		d ^= new;
782 		*dest++ = d;
783 		length--;
784 	}
785 #endif				/* _KERNEL */
786 }
787 /*
788    recover columns a and b from the given p and q into
789    bufs abuf and bbuf. All bufs are word aligned.
790    Length is in bytes.
791 */
792 
793 
794 /*
795  * XXX
796  *
797  * Everything about this seems wrong.
798  */
799 void
rf_PQ_recover(unsigned long * pbuf,unsigned long * qbuf,unsigned long * abuf,unsigned long * bbuf,unsigned length,unsigned coeff_a,unsigned coeff_b)800 rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf, unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
801 {
802 	unsigned long p, q, a, a0, a1;
803 	int     col = (29 * coeff_a) + coeff_b;
804 	unsigned char *q0 = &(rf_qinv[col][0]);
805 
806 	length /= 8;
807 	while (length) {
808 		p = *pbuf++;
809 		q = *qbuf++;
810 		a0 = EXTRACT(p, 0);
811 		a1 = EXTRACT(q, 0);
812 		a = q0[a0 << 5 | a1];
813 #define MF(i) \
814       a0 = EXTRACT(p,i); \
815       a1 = EXTRACT(q,i); \
816       a  = a | INSERT(q0[a0<<5 | a1],i)
817 
818 		MF(1);
819 		MF(2);
820 		MF(3);
821 		MF(4);
822 		MF(5);
823 		MF(6);
824 #if 0
825 		MF(7);
826 		MF(8);
827 		MF(9);
828 		MF(10);
829 		MF(11);
830 		MF(12);
831 #endif				/* 0 */
832 		*abuf++ = a;
833 		*bbuf++ = a ^ p;
834 		length--;
835 	}
836 }
/*
   Lost parity and a data column. Recover that data column.
   Assume col coeff is lost. Let q be the contents of Q after
   all surviving data columns have been q-xored out of it.
   Then we have the equation

   q[28-coeff][a_i ^ r_i+1] = q

   but q is cyclic with period 31.
   So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
      q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .

   so a_i = r_{coeff+1} ^ q[3+coeff][q]

   The routine is passed the q buffer and the buffer
   the data is to be recovered into. They can be the same.
*/
854 
855 
856 
857 static void
rf_InvertQ(unsigned long * qbuf,unsigned long * abuf,unsigned length,unsigned coeff)858 rf_InvertQ(
859     unsigned long *qbuf,
860     unsigned long *abuf,
861     unsigned length,
862     unsigned coeff)
863 {
864 	unsigned long a, new;
865 	unsigned long a1, a2;
866 	unsigned int *q = &(rf_qfor[3 + coeff][0]);
867 	unsigned r = rf_rn[coeff + 1];
868 
869 	/* 13 5 bit quants in a 64 bit word */
870 	length /= 8;
871 	while (length) {
872 		a = *qbuf++;
873 		a1 = EXTRACT(a, 0);
874 		a2 = EXTRACT(a, 1);
875 		a1 = r ^ q[a1];
876 		a2 = r ^ q[a2];
877 		new = INSERT(a2, 1) | a1;
878 #define M(i,j) \
879       a1 = EXTRACT(a,i); \
880       a2 = EXTRACT(a,j); \
881       a1 = r ^ q[a1]; \
882       a2 = r ^ q[a2]; \
883       new = new | INSERT(a1,i) | INSERT(a2,j)
884 
885 		M(2, 3);
886 		M(4, 5);
887 		M(5, 6);
888 #if RF_LONGSHIFT > 2
889 		M(7, 8);
890 		M(9, 10);
891 		M(11, 12);
892 #endif				/* RF_LONGSHIFT > 2 */
893 		*abuf++ = new;
894 		length--;
895 	}
896 }
897 #endif				/* (RF_INCLUDE_DECL_PQ > 0) ||
898 				 * (RF_INCLUDE_RAID6 > 0) */
899