xref: /netbsd/sys/dev/raidframe/rf_dagffrd.c (revision 6550d01e)
1 /*	$NetBSD: rf_dagffrd.c,v 1.18 2006/11/16 01:33:23 christos Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /*
30  * rf_dagffrd.c
31  *
32  * code for creating fault-free read DAGs
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __KERNEL_RCSID(0, "$NetBSD: rf_dagffrd.c,v 1.18 2006/11/16 01:33:23 christos Exp $");
38 
39 #include <dev/raidframe/raidframevar.h>
40 
41 #include "rf_raid.h"
42 #include "rf_dag.h"
43 #include "rf_dagutils.h"
44 #include "rf_dagfuncs.h"
45 #include "rf_debugMem.h"
46 #include "rf_general.h"
47 #include "rf_dagffrd.h"
48 
49 /******************************************************************************
50  *
51  * General comments on DAG creation:
52  *
53  * All DAGs in this file use roll-away error recovery.  Each DAG has a single
54  * commit node, usually called "Cmt."  If an error occurs before the Cmt node
55  * is reached, the execution engine will halt forward execution and work
56  * backward through the graph, executing the undo functions.  Assuming that
57  * each node in the graph prior to the Cmt node are undoable and atomic - or -
58  * does not make changes to permanent state, the graph will fail atomically.
59  * If an error occurs after the Cmt node executes, the engine will roll-forward
60  * through the graph, blindly executing nodes until it reaches the end.
61  * If a graph reaches the end, it is assumed to have completed successfully.
62  *
63  * A graph has only 1 Cmt node.
64  *
65  */
66 
67 
68 /******************************************************************************
69  *
70  * The following wrappers map the standard DAG creation interface to the
71  * DAG creation routines.  Additionally, these wrappers enable experimentation
72  * with new DAG structures by providing an extra level of indirection, allowing
73  * the DAG creation routines to be replaced at this single point.
74  */
75 
76 void
77 rf_CreateFaultFreeReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
78 			  RF_DagHeader_t *dag_h, void *bp,
79 			  RF_RaidAccessFlags_t flags,
80 			  RF_AllocListElem_t *allocList)
81 {
82 	rf_CreateNonredundantDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
83 	    RF_IO_TYPE_READ);
84 }
85 
86 
87 /******************************************************************************
88  *
89  * DAG creation code begins here
90  */
91 
92 /******************************************************************************
93  *
94  * creates a DAG to perform a nonredundant read or write of data within one
95  * stripe.
96  * For reads, this DAG is as follows:
97  *
98  *                   /---- read ----\
99  *    Header -- Block ---- read ---- Commit -- Terminate
100  *                   \---- read ----/
101  *
102  * For writes, this DAG is as follows:
103  *
104  *                    /---- write ----\
105  *    Header -- Commit ---- write ---- Block -- Terminate
106  *                    \---- write ----/
107  *
108  * There is one disk node per stripe unit accessed, and all disk nodes are in
109  * parallel.
110  *
111  * Tricky point here:  The first disk node (read or write) is created
112  * normally.  Subsequent disk nodes are created by copying the first one,
113  * and modifying a few params.  The "succedents" and "antecedents" fields are
114  * _not_ re-created in each node, but rather left pointing to the same array
115  * that was malloc'd when the first node was created.  Thus, it's essential
116  * that when this DAG is freed, the succedents and antecedents fields be freed
117  * in ONLY ONE of the read nodes.  This does not apply to the "params" field
118  * because it is recreated for each READ node.
119  *
120  * Note that normal-priority accesses do not need to be tagged with their
121  * parity stripe ID, because they will never be promoted.  Hence, I've
122  * commented-out the code to do this, and marked it with UNNEEDED.
123  *
124  *****************************************************************************/
125 
126 void
127 rf_CreateNonredundantDAG(RF_Raid_t *raidPtr,
128     RF_AccessStripeMap_t *asmap, RF_DagHeader_t *dag_h, void *bp,
129     RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
130     RF_IoType_t type)
131 {
132 	RF_DagNode_t *diskNodes, *blockNode, *commitNode, *termNode;
133 	RF_DagNode_t *tmpNode, *tmpdiskNode;
134 	RF_PhysDiskAddr_t *pda = asmap->physInfo;
135 	int     (*doFunc) (RF_DagNode_t *), (*undoFunc) (RF_DagNode_t *);
136 	int     i, n, totalNumNodes;
137 	const char   *name;
138 
139 	n = asmap->numStripeUnitsAccessed;
140 	dag_h->creator = "NonredundantDAG";
141 
142 	RF_ASSERT(RF_IO_IS_R_OR_W(type));
143 	switch (type) {
144 	case RF_IO_TYPE_READ:
145 		doFunc = rf_DiskReadFunc;
146 		undoFunc = rf_DiskReadUndoFunc;
147 		name = "R  ";
148 #if RF_DEBUG_DAG
149 		if (rf_dagDebug)
150 			printf("[Creating non-redundant read DAG]\n");
151 #endif
152 		break;
153 	case RF_IO_TYPE_WRITE:
154 		doFunc = rf_DiskWriteFunc;
155 		undoFunc = rf_DiskWriteUndoFunc;
156 		name = "W  ";
157 #if RF_DEBUG_DAG
158 		if (rf_dagDebug)
159 			printf("[Creating non-redundant write DAG]\n");
160 #endif
161 		break;
162 	default:
163 		RF_PANIC();
164 	}
165 
166 	/*
167          * For reads, the dag can not commit until the block node is reached.
168          * for writes, the dag commits immediately.
169          */
170 	dag_h->numCommitNodes = 1;
171 	dag_h->numCommits = 0;
172 	dag_h->numSuccedents = 1;
173 
174 	/*
175          * Node count:
176          * 1 block node
177          * n data reads (or writes)
178          * 1 commit node
179          * 1 terminator node
180          */
181 	RF_ASSERT(n > 0);
182 	totalNumNodes = n + 3;
183 
184 	for (i = 0; i < n; i++) {
185 		tmpNode = rf_AllocDAGNode();
186 		tmpNode->list_next = dag_h->nodes;
187 		dag_h->nodes = tmpNode;
188 	}
189 	diskNodes = dag_h->nodes;
190 
191 	blockNode = rf_AllocDAGNode();
192 	blockNode->list_next = dag_h->nodes;
193 	dag_h->nodes = blockNode;
194 
195 	commitNode = rf_AllocDAGNode();
196 	commitNode->list_next = dag_h->nodes;
197 	dag_h->nodes = commitNode;
198 
199 	termNode = rf_AllocDAGNode();
200 	termNode->list_next = dag_h->nodes;
201 	dag_h->nodes = termNode;
202 
203 	/* initialize nodes */
204 	switch (type) {
205 	case RF_IO_TYPE_READ:
206 		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
207 		    NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
208 		rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
209 		    NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
210 		rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
211 		    NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
212 		break;
213 	case RF_IO_TYPE_WRITE:
214 		rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
215 		    NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
216 		rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc, rf_NullNodeUndoFunc,
217 		    NULL, n, 1, 0, 0, dag_h, "Cmt", allocList);
218 		rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc, rf_TerminateUndoFunc,
219 		    NULL, 0, n, 0, 0, dag_h, "Trm", allocList);
220 		break;
221 	default:
222 		RF_PANIC();
223 	}
224 
225 	tmpdiskNode = diskNodes;
226 	for (i = 0; i < n; i++) {
227 		RF_ASSERT(pda != NULL);
228 		rf_InitNode(tmpdiskNode, rf_wait, RF_FALSE, doFunc, undoFunc, rf_GenericWakeupFunc,
229 		    1, 1, 4, 0, dag_h, name, allocList);
230 		tmpdiskNode->params[0].p = pda;
231 		tmpdiskNode->params[1].p = pda->bufPtr;
232 		/* parity stripe id is not necessary */
233 		tmpdiskNode->params[2].v = 0;
234 		tmpdiskNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0);
235 		pda = pda->next;
236 		tmpdiskNode = tmpdiskNode->list_next;
237 	}
238 
239 	/*
240          * Connect nodes.
241          */
242 
243 	/* connect hdr to block node */
244 	RF_ASSERT(blockNode->numAntecedents == 0);
245 	dag_h->succedents[0] = blockNode;
246 
247 	if (type == RF_IO_TYPE_READ) {
248 		/* connecting a nonredundant read DAG */
249 		RF_ASSERT(blockNode->numSuccedents == n);
250 		RF_ASSERT(commitNode->numAntecedents == n);
251 		tmpdiskNode = diskNodes;
252 		for (i = 0; i < n; i++) {
253 			/* connect block node to each read node */
254 			RF_ASSERT(tmpdiskNode->numAntecedents == 1);
255 			blockNode->succedents[i] = tmpdiskNode;
256 			tmpdiskNode->antecedents[0] = blockNode;
257 			tmpdiskNode->antType[0] = rf_control;
258 
259 			/* connect each read node to the commit node */
260 			RF_ASSERT(tmpdiskNode->numSuccedents == 1);
261 			tmpdiskNode->succedents[0] = commitNode;
262 			commitNode->antecedents[i] = tmpdiskNode;
263 			commitNode->antType[i] = rf_control;
264 			tmpdiskNode = tmpdiskNode->list_next;
265 		}
266 		/* connect the commit node to the term node */
267 		RF_ASSERT(commitNode->numSuccedents == 1);
268 		RF_ASSERT(termNode->numAntecedents == 1);
269 		RF_ASSERT(termNode->numSuccedents == 0);
270 		commitNode->succedents[0] = termNode;
271 		termNode->antecedents[0] = commitNode;
272 		termNode->antType[0] = rf_control;
273 	} else {
274 		/* connecting a nonredundant write DAG */
275 		/* connect the block node to the commit node */
276 		RF_ASSERT(blockNode->numSuccedents == 1);
277 		RF_ASSERT(commitNode->numAntecedents == 1);
278 		blockNode->succedents[0] = commitNode;
279 		commitNode->antecedents[0] = blockNode;
280 		commitNode->antType[0] = rf_control;
281 
282 		RF_ASSERT(commitNode->numSuccedents == n);
283 		RF_ASSERT(termNode->numAntecedents == n);
284 		RF_ASSERT(termNode->numSuccedents == 0);
285 		tmpdiskNode = diskNodes;
286 		for (i = 0; i < n; i++) {
287 			/* connect the commit node to each write node */
288 			RF_ASSERT(tmpdiskNode->numAntecedents == 1);
289 			commitNode->succedents[i] = tmpdiskNode;
290 			tmpdiskNode->antecedents[0] = commitNode;
291 			tmpdiskNode->antType[0] = rf_control;
292 
293 			/* connect each write node to the term node */
294 			RF_ASSERT(tmpdiskNode->numSuccedents == 1);
295 			tmpdiskNode->succedents[0] = termNode;
296 			termNode->antecedents[i] = tmpdiskNode;
297 			termNode->antType[i] = rf_control;
298 			tmpdiskNode = tmpdiskNode->list_next;
299 		}
300 	}
301 }
302 /******************************************************************************
303  * Create a fault-free read DAG for RAID level 1
304  *
305  * Hdr -> Nil -> Rmir -> Cmt -> Trm
306  *
307  * The "Rmir" node schedules a read from the disk in the mirror pair with the
308  * shortest disk queue.  the proper queue is selected at Rmir execution.  this
309  * deferred mapping is unlike other archs in RAIDframe which generally fix
310  * mapping at DAG creation time.
311  *
312  * Parameters:  raidPtr   - description of the physical array
313  *              asmap     - logical & physical addresses for this access
314  *              bp        - buffer ptr (for holding read data)
315  *              flags     - general flags (e.g. disk locking)
316  *              allocList - list of memory allocated in DAG creation
317  *****************************************************************************/
318 
319 static void
320 CreateMirrorReadDAG(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
321     RF_DagHeader_t *dag_h, void *bp,
322     RF_RaidAccessFlags_t flags, RF_AllocListElem_t *allocList,
323     int (*readfunc) (RF_DagNode_t * node))
324 {
325 	RF_DagNode_t *readNodes, *blockNode, *commitNode, *termNode;
326 	RF_DagNode_t *tmpNode, *tmpreadNode;
327 	RF_PhysDiskAddr_t *data_pda = asmap->physInfo;
328 	RF_PhysDiskAddr_t *parity_pda = asmap->parityInfo;
329 	int     i, n, totalNumNodes;
330 
331 	n = asmap->numStripeUnitsAccessed;
332 	dag_h->creator = "RaidOneReadDAG";
333 #if RF_DEBUG_DAG
334 	if (rf_dagDebug) {
335 		printf("[Creating RAID level 1 read DAG]\n");
336 	}
337 #endif
338 	/*
339          * This dag can not commit until the commit node is reached
340          * errors prior to the commit point imply the dag has failed.
341          */
342 	dag_h->numCommitNodes = 1;
343 	dag_h->numCommits = 0;
344 	dag_h->numSuccedents = 1;
345 
346 	/*
347          * Node count:
348          * n data reads
349          * 1 block node
350          * 1 commit node
351          * 1 terminator node
352          */
353 	RF_ASSERT(n > 0);
354 	totalNumNodes = n + 3;
355 
356 	for (i = 0; i < n; i++) {
357 		tmpNode = rf_AllocDAGNode();
358 		tmpNode->list_next = dag_h->nodes;
359 		dag_h->nodes = tmpNode;
360 	}
361 	readNodes = dag_h->nodes;
362 
363 	blockNode = rf_AllocDAGNode();
364 	blockNode->list_next = dag_h->nodes;
365 	dag_h->nodes = blockNode;
366 
367 	commitNode = rf_AllocDAGNode();
368 	commitNode->list_next = dag_h->nodes;
369 	dag_h->nodes = commitNode;
370 
371 	termNode = rf_AllocDAGNode();
372 	termNode->list_next = dag_h->nodes;
373 	dag_h->nodes = termNode;
374 
375 	/* initialize nodes */
376 	rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
377 	    rf_NullNodeUndoFunc, NULL, n, 0, 0, 0, dag_h, "Nil", allocList);
378 	rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
379 	    rf_NullNodeUndoFunc, NULL, 1, n, 0, 0, dag_h, "Cmt", allocList);
380 	rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
381 	    rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
382 
383 	tmpreadNode = readNodes;
384 	for (i = 0; i < n; i++) {
385 		RF_ASSERT(data_pda != NULL);
386 		RF_ASSERT(parity_pda != NULL);
387 		rf_InitNode(tmpreadNode, rf_wait, RF_FALSE, readfunc,
388 		    rf_DiskReadMirrorUndoFunc, rf_GenericWakeupFunc, 1, 1, 5, 0, dag_h,
389 		    "Rmir", allocList);
390 		tmpreadNode->params[0].p = data_pda;
391 		tmpreadNode->params[1].p = data_pda->bufPtr;
392 		/* parity stripe id is not necessary */
393 		tmpreadNode->params[2].p = 0;
394 		tmpreadNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0);
395 		tmpreadNode->params[4].p = parity_pda;
396 		data_pda = data_pda->next;
397 		parity_pda = parity_pda->next;
398 		tmpreadNode = tmpreadNode->list_next;
399 	}
400 
401 	/*
402          * Connect nodes
403          */
404 
405 	/* connect hdr to block node */
406 	RF_ASSERT(blockNode->numAntecedents == 0);
407 	dag_h->succedents[0] = blockNode;
408 
409 	/* connect block node to read nodes */
410 	RF_ASSERT(blockNode->numSuccedents == n);
411 	tmpreadNode = readNodes;
412 	for (i = 0; i < n; i++) {
413 		RF_ASSERT(tmpreadNode->numAntecedents == 1);
414 		blockNode->succedents[i] = tmpreadNode;
415 		tmpreadNode->antecedents[0] = blockNode;
416 		tmpreadNode->antType[0] = rf_control;
417 		tmpreadNode = tmpreadNode->list_next;
418 	}
419 
420 	/* connect read nodes to commit node */
421 	RF_ASSERT(commitNode->numAntecedents == n);
422 	tmpreadNode = readNodes;
423 	for (i = 0; i < n; i++) {
424 		RF_ASSERT(tmpreadNode->numSuccedents == 1);
425 		tmpreadNode->succedents[0] = commitNode;
426 		commitNode->antecedents[i] = tmpreadNode;
427 		commitNode->antType[i] = rf_control;
428 		tmpreadNode = tmpreadNode->list_next;
429 	}
430 
431 	/* connect commit node to term node */
432 	RF_ASSERT(commitNode->numSuccedents == 1);
433 	RF_ASSERT(termNode->numAntecedents == 1);
434 	RF_ASSERT(termNode->numSuccedents == 0);
435 	commitNode->succedents[0] = termNode;
436 	termNode->antecedents[0] = commitNode;
437 	termNode->antType[0] = rf_control;
438 }
439 
440 void
441 rf_CreateMirrorIdleReadDAG(
442     RF_Raid_t * raidPtr,
443     RF_AccessStripeMap_t * asmap,
444     RF_DagHeader_t * dag_h,
445     void *bp,
446     RF_RaidAccessFlags_t flags,
447     RF_AllocListElem_t * allocList)
448 {
449 	CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
450 	    rf_DiskReadMirrorIdleFunc);
451 }
452 
453 #if (RF_INCLUDE_CHAINDECLUSTER > 0) || (RF_INCLUDE_INTERDECLUSTER > 0)
454 
455 void
456 rf_CreateMirrorPartitionReadDAG(RF_Raid_t *raidPtr,
457 				RF_AccessStripeMap_t *asmap,
458 				RF_DagHeader_t *dag_h, void *bp,
459 				RF_RaidAccessFlags_t flags,
460 				RF_AllocListElem_t *allocList)
461 {
462 	CreateMirrorReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
463 	    rf_DiskReadMirrorPartitionFunc);
464 }
465 #endif
466