xref: /netbsd/sys/dev/raidframe/rf_paritylog.c (revision 6550d01e)
1 /*	$NetBSD: rf_paritylog.c,v 1.13 2007/03/04 06:02:38 christos Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /* Code for manipulating in-core parity logs
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_paritylog.c,v 1.13 2007/03/04 06:02:38 christos Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if RF_INCLUDE_PARITYLOGGING > 0
39 
40 /*
41  * Append-only log for recording parity "update" and "overwrite" records
42  */
43 
44 #include <dev/raidframe/raidframevar.h>
45 
46 #include "rf_threadstuff.h"
47 #include "rf_mcpair.h"
48 #include "rf_raid.h"
49 #include "rf_dag.h"
50 #include "rf_dagfuncs.h"
51 #include "rf_desc.h"
52 #include "rf_layout.h"
53 #include "rf_diskqueue.h"
54 #include "rf_etimer.h"
55 #include "rf_paritylog.h"
56 #include "rf_general.h"
57 #include "rf_map.h"
58 #include "rf_paritylogging.h"
59 #include "rf_paritylogDiskMgr.h"
60 
61 static RF_CommonLogData_t *
62 AllocParityLogCommonData(RF_Raid_t * raidPtr)
63 {
64 	RF_CommonLogData_t *common = NULL;
65 
66 	/* Return a struct for holding common parity log information from the
67 	 * free list (rf_parityLogDiskQueue.freeCommonList).  If the free list
68 	 * is empty, call RF_Malloc to create a new structure. NON-BLOCKING */
69 
70 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
71 	if (raidPtr->parityLogDiskQueue.freeCommonList) {
72 		common = raidPtr->parityLogDiskQueue.freeCommonList;
73 		raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
74 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
75 	} else {
76 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
77 		RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
78 		rf_mutex_init(&common->mutex);
79 	}
80 	common->next = NULL;
81 	return (common);
82 }
83 
84 static void
85 FreeParityLogCommonData(RF_CommonLogData_t * common)
86 {
87 	RF_Raid_t *raidPtr;
88 
89 	/* Insert a single struct for holding parity log information (data)
90 	 * into the free list (rf_parityLogDiskQueue.freeCommonList).
91 	 * NON-BLOCKING */
92 
93 	raidPtr = common->raidPtr;
94 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
95 	common->next = raidPtr->parityLogDiskQueue.freeCommonList;
96 	raidPtr->parityLogDiskQueue.freeCommonList = common;
97 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
98 }
99 
100 static RF_ParityLogData_t *
101 AllocParityLogData(RF_Raid_t * raidPtr)
102 {
103 	RF_ParityLogData_t *data = NULL;
104 
105 	/* Return a struct for holding parity log information from the free
106 	 * list (rf_parityLogDiskQueue.freeList).  If the free list is empty,
107 	 * call RF_Malloc to create a new structure. NON-BLOCKING */
108 
109 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
110 	if (raidPtr->parityLogDiskQueue.freeDataList) {
111 		data = raidPtr->parityLogDiskQueue.freeDataList;
112 		raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
113 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
114 	} else {
115 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
116 		RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
117 	}
118 	data->next = NULL;
119 	data->prev = NULL;
120 	return (data);
121 }
122 
123 
124 static void
125 FreeParityLogData(RF_ParityLogData_t * data)
126 {
127 	RF_ParityLogData_t *nextItem;
128 	RF_Raid_t *raidPtr;
129 
130 	/* Insert a linked list of structs for holding parity log information
131 	 * (data) into the free list (parityLogDiskQueue.freeList).
132 	 * NON-BLOCKING */
133 
134 	raidPtr = data->common->raidPtr;
135 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
136 	while (data) {
137 		nextItem = data->next;
138 		data->next = raidPtr->parityLogDiskQueue.freeDataList;
139 		raidPtr->parityLogDiskQueue.freeDataList = data;
140 		data = nextItem;
141 	}
142 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
143 }
144 
145 
146 static void
147 EnqueueParityLogData(
148     RF_ParityLogData_t * data,
149     RF_ParityLogData_t ** head,
150     RF_ParityLogData_t ** tail)
151 {
152 	RF_Raid_t *raidPtr;
153 
154 	/* Insert an in-core parity log (*data) into the head of a disk queue
155 	 * (*head, *tail). NON-BLOCKING */
156 
157 	raidPtr = data->common->raidPtr;
158 	if (rf_parityLogDebug)
159 		printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
160 	RF_ASSERT(data->prev == NULL);
161 	RF_ASSERT(data->next == NULL);
162 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
163 	if (*head) {
164 		/* insert into head of queue */
165 		RF_ASSERT((*head)->prev == NULL);
166 		RF_ASSERT((*tail)->next == NULL);
167 		data->next = *head;
168 		(*head)->prev = data;
169 		*head = data;
170 	} else {
171 		/* insert into empty list */
172 		RF_ASSERT(*head == NULL);
173 		RF_ASSERT(*tail == NULL);
174 		*head = data;
175 		*tail = data;
176 	}
177 	RF_ASSERT((*head)->prev == NULL);
178 	RF_ASSERT((*tail)->next == NULL);
179 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
180 }
181 
182 static RF_ParityLogData_t *
183 DequeueParityLogData(
184     RF_Raid_t * raidPtr,
185     RF_ParityLogData_t ** head,
186     RF_ParityLogData_t ** tail,
187     int ignoreLocks)
188 {
189 	RF_ParityLogData_t *data;
190 
191 	/* Remove and return an in-core parity log from the tail of a disk
192 	 * queue (*head, *tail). NON-BLOCKING */
193 
194 	/* remove from tail, preserving FIFO order */
195 	if (!ignoreLocks)
196 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
197 	data = *tail;
198 	if (data) {
199 		if (*head == *tail) {
200 			/* removing last item from queue */
201 			*head = NULL;
202 			*tail = NULL;
203 		} else {
204 			*tail = (*tail)->prev;
205 			(*tail)->next = NULL;
206 			RF_ASSERT((*head)->prev == NULL);
207 			RF_ASSERT((*tail)->next == NULL);
208 		}
209 		data->next = NULL;
210 		data->prev = NULL;
211 		if (rf_parityLogDebug)
212 			printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
213 	}
214 	if (*head) {
215 		RF_ASSERT((*head)->prev == NULL);
216 		RF_ASSERT((*tail)->next == NULL);
217 	}
218 	if (!ignoreLocks)
219 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
220 	return (data);
221 }
222 
223 
224 static void
225 RequeueParityLogData(
226     RF_ParityLogData_t * data,
227     RF_ParityLogData_t ** head,
228     RF_ParityLogData_t ** tail)
229 {
230 	RF_Raid_t *raidPtr;
231 
232 	/* Insert an in-core parity log (*data) into the tail of a disk queue
233 	 * (*head, *tail). NON-BLOCKING */
234 
235 	raidPtr = data->common->raidPtr;
236 	RF_ASSERT(data);
237 	if (rf_parityLogDebug)
238 		printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
239 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
240 	if (*tail) {
241 		/* append to tail of list */
242 		data->prev = *tail;
243 		data->next = NULL;
244 		(*tail)->next = data;
245 		*tail = data;
246 	} else {
247 		/* inserting into an empty list */
248 		*head = data;
249 		*tail = data;
250 		(*head)->prev = NULL;
251 		(*tail)->next = NULL;
252 	}
253 	RF_ASSERT((*head)->prev == NULL);
254 	RF_ASSERT((*tail)->next == NULL);
255 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
256 }
257 
258 RF_ParityLogData_t *
259 rf_CreateParityLogData(
260     RF_ParityRecordType_t operation,
261     RF_PhysDiskAddr_t * pda,
262     void *bufPtr,
263     RF_Raid_t * raidPtr,
264     int (*wakeFunc) (RF_DagNode_t * node, int status),
265     void *wakeArg,
266     RF_AccTraceEntry_t * tracerec,
267     RF_Etimer_t startTime)
268 {
269 	RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
270 	RF_CommonLogData_t *common;
271 	RF_PhysDiskAddr_t *diskAddress;
272 	int     boundary, offset = 0;
273 
274 	/* Return an initialized struct of info to be logged. Build one item
275 	 * per physical disk address, one item per region.
276 	 *
277 	 * NON-BLOCKING */
278 
279 	diskAddress = pda;
280 	common = AllocParityLogCommonData(raidPtr);
281 	RF_ASSERT(common);
282 
283 	common->operation = operation;
284 	common->bufPtr = bufPtr;
285 	common->raidPtr = raidPtr;
286 	common->wakeFunc = wakeFunc;
287 	common->wakeArg = wakeArg;
288 	common->tracerec = tracerec;
289 	common->startTime = startTime;
290 	common->cnt = 0;
291 
292 	if (rf_parityLogDebug)
293 		printf("[entering CreateParityLogData]\n");
294 	while (diskAddress) {
295 		common->cnt++;
296 		data = AllocParityLogData(raidPtr);
297 		RF_ASSERT(data);
298 		data->common = common;
299 		data->next = NULL;
300 		data->prev = NULL;
301 		data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
302 		if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
303 			/* disk address does not cross a region boundary */
304 			data->diskAddress = *diskAddress;
305 			data->bufOffset = offset;
306 			offset = offset + diskAddress->numSector;
307 			EnqueueParityLogData(data, &resultHead, &resultTail);
308 			/* adjust disk address */
309 			diskAddress = diskAddress->next;
310 		} else {
311 			/* disk address crosses a region boundary */
312 			/* find address where region is crossed */
313 			boundary = 0;
314 			while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
315 				boundary++;
316 
317 			/* enter data before the boundary */
318 			data->diskAddress = *diskAddress;
319 			data->diskAddress.numSector = boundary;
320 			data->bufOffset = offset;
321 			offset += boundary;
322 			EnqueueParityLogData(data, &resultHead, &resultTail);
323 			/* adjust disk address */
324 			diskAddress->startSector += boundary;
325 			diskAddress->numSector -= boundary;
326 		}
327 	}
328 	if (rf_parityLogDebug)
329 		printf("[leaving CreateParityLogData]\n");
330 	return (resultHead);
331 }
332 
333 
334 RF_ParityLogData_t *
335 rf_SearchAndDequeueParityLogData(
336     RF_Raid_t * raidPtr,
337     int regionID,
338     RF_ParityLogData_t ** head,
339     RF_ParityLogData_t ** tail,
340     int ignoreLocks)
341 {
342 	RF_ParityLogData_t *w;
343 
344 	/* Remove and return an in-core parity log from a specified region
345 	 * (regionID). If a matching log is not found, return NULL.
346 	 *
347 	 * NON-BLOCKING. */
348 
349 	/* walk backward through a list, looking for an entry with a matching
350 	 * region ID */
351 	if (!ignoreLocks)
352 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
353 	w = (*tail);
354 	while (w) {
355 		if (w->regionID == regionID) {
356 			/* remove an element from the list */
357 			if (w == *tail) {
358 				if (*head == *tail) {
359 					/* removing only element in the list */
360 					*head = NULL;
361 					*tail = NULL;
362 				} else {
363 					/* removing last item in the list */
364 					*tail = (*tail)->prev;
365 					(*tail)->next = NULL;
366 					RF_ASSERT((*head)->prev == NULL);
367 					RF_ASSERT((*tail)->next == NULL);
368 				}
369 			} else {
370 				if (w == *head) {
371 					/* removing first item in the list */
372 					*head = (*head)->next;
373 					(*head)->prev = NULL;
374 					RF_ASSERT((*head)->prev == NULL);
375 					RF_ASSERT((*tail)->next == NULL);
376 				} else {
377 					/* removing an item from the middle of
378 					 * the list */
379 					w->prev->next = w->next;
380 					w->next->prev = w->prev;
381 					RF_ASSERT((*head)->prev == NULL);
382 					RF_ASSERT((*tail)->next == NULL);
383 				}
384 			}
385 			w->prev = NULL;
386 			w->next = NULL;
387 			if (rf_parityLogDebug)
388 				printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
389 			return (w);
390 		} else
391 			w = w->prev;
392 	}
393 	if (!ignoreLocks)
394 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
395 	return (NULL);
396 }
397 
398 static RF_ParityLogData_t *
399 DequeueMatchingLogData(
400     RF_Raid_t * raidPtr,
401     RF_ParityLogData_t ** head,
402     RF_ParityLogData_t ** tail)
403 {
404 	RF_ParityLogData_t *logDataList, *logData;
405 	int     regionID;
406 
407 	/* Remove and return an in-core parity log from the tail of a disk
408 	 * queue (*head, *tail).  Then remove all matching (identical
409 	 * regionIDs) logData and return as a linked list.
410 	 *
411 	 * NON-BLOCKING */
412 
413 	logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
414 	if (logDataList) {
415 		regionID = logDataList->regionID;
416 		logData = logDataList;
417 		logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
418 		while (logData->next) {
419 			logData = logData->next;
420 			logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
421 		}
422 	}
423 	return (logDataList);
424 }
425 
426 
427 static RF_ParityLog_t *
428 AcquireParityLog(
429     RF_ParityLogData_t * logData,
430     int finish)
431 {
432 	RF_ParityLog_t *log = NULL;
433 	RF_Raid_t *raidPtr;
434 
435 	/* Grab a log buffer from the pool and return it. If no buffers are
436 	 * available, return NULL. NON-BLOCKING */
437 	raidPtr = logData->common->raidPtr;
438 	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
439 	if (raidPtr->parityLogPool.parityLogs) {
440 		log = raidPtr->parityLogPool.parityLogs;
441 		raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
442 		log->regionID = logData->regionID;
443 		log->numRecords = 0;
444 		log->next = NULL;
445 		raidPtr->logsInUse++;
446 		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
447 	} else {
448 		/* no logs available, so place ourselves on the queue of work
449 		 * waiting on log buffers this is done while
450 		 * parityLogPool.mutex is held, to ensure synchronization with
451 		 * ReleaseParityLogs. */
452 		if (rf_parityLogDebug)
453 			printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
454 		if (finish)
455 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
456 		else
457 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
458 	}
459 	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
460 	return (log);
461 }
462 
463 void
464 rf_ReleaseParityLogs(
465     RF_Raid_t * raidPtr,
466     RF_ParityLog_t * firstLog)
467 {
468 	RF_ParityLogData_t *logDataList;
469 	RF_ParityLog_t *log, *lastLog;
470 	int     cnt;
471 
472 	/* Insert a linked list of parity logs (firstLog) to the free list
473 	 * (parityLogPool.parityLogPool)
474 	 *
475 	 * NON-BLOCKING. */
476 
477 	RF_ASSERT(firstLog);
478 
479 	/* Before returning logs to global free list, service all requests
480 	 * which are blocked on logs.  Holding mutexes for parityLogPool and
481 	 * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
482 	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
483 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
484 	logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
485 	log = firstLog;
486 	if (firstLog)
487 		firstLog = firstLog->next;
488 	log->numRecords = 0;
489 	log->next = NULL;
490 	while (logDataList && log) {
491 		RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
492 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
493 		rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
494 		if (rf_parityLogDebug)
495 			printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
496 		if (log == NULL) {
497 			log = firstLog;
498 			if (firstLog) {
499 				firstLog = firstLog->next;
500 				log->numRecords = 0;
501 				log->next = NULL;
502 			}
503 		}
504 		RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
505 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
506 		if (log)
507 			logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
508 	}
509 	/* return remaining logs to pool */
510 	if (log) {
511 		log->next = firstLog;
512 		firstLog = log;
513 	}
514 	if (firstLog) {
515 		lastLog = firstLog;
516 		raidPtr->logsInUse--;
517 		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
518 		while (lastLog->next) {
519 			lastLog = lastLog->next;
520 			raidPtr->logsInUse--;
521 			RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
522 		}
523 		lastLog->next = raidPtr->parityLogPool.parityLogs;
524 		raidPtr->parityLogPool.parityLogs = firstLog;
525 		cnt = 0;
526 		log = raidPtr->parityLogPool.parityLogs;
527 		while (log) {
528 			cnt++;
529 			log = log->next;
530 		}
531 		RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
532 	}
533 	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
534 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
535 }
536 
537 static void
538 ReintLog(
539     RF_Raid_t * raidPtr,
540     int regionID,
541     RF_ParityLog_t * log)
542 {
543 	RF_ASSERT(log);
544 
545 	/* Insert an in-core parity log (log) into the disk queue of
546 	 * reintegration work.  Set the flag (reintInProgress) for the
547 	 * specified region (regionID) to indicate that reintegration is in
548 	 * progress for this region. NON-BLOCKING */
549 
550 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
551 	raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;	/* cleared when reint
552 									 * complete */
553 
554 	if (rf_parityLogDebug)
555 		printf("[requesting reintegration of region %d]\n", log->regionID);
556 	/* move record to reintegration queue */
557 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
558 	log->next = raidPtr->parityLogDiskQueue.reintQueue;
559 	raidPtr->parityLogDiskQueue.reintQueue = log;
560 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
561 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
562 	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
563 }
564 
565 static void
566 FlushLog(
567     RF_Raid_t * raidPtr,
568     RF_ParityLog_t * log)
569 {
570 	/* insert a core log (log) into a list of logs
571 	 * (parityLogDiskQueue.flushQueue) waiting to be written to disk.
572 	 * NON-BLOCKING */
573 
574 	RF_ASSERT(log);
575 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
576 	RF_ASSERT(log->next == NULL);
577 	/* move log to flush queue */
578 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
579 	log->next = raidPtr->parityLogDiskQueue.flushQueue;
580 	raidPtr->parityLogDiskQueue.flushQueue = log;
581 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
582 	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
583 }
584 
585 static int
586 DumpParityLogToDisk(
587     int finish,
588     RF_ParityLogData_t * logData)
589 {
590 	int     i, diskCount, regionID = logData->regionID;
591 	RF_ParityLog_t *log;
592 	RF_Raid_t *raidPtr;
593 
594 	raidPtr = logData->common->raidPtr;
595 
596 	/* Move a core log to disk.  If the log disk is full, initiate
597 	 * reintegration.
598 	 *
599 	 * Return (0) if we can enqueue the dump immediately, otherwise return
600 	 * (1) to indicate we are blocked on reintegration and control of the
601 	 * thread should be relinquished.
602 	 *
603 	 * Caller must hold regionInfo[regionID].mutex
604 	 *
605 	 * NON-BLOCKING */
606 
607 	if (rf_parityLogDebug)
608 		printf("[dumping parity log to disk, region %d]\n", regionID);
609 	log = raidPtr->regionInfo[regionID].coreLog;
610 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
611 	RF_ASSERT(log->next == NULL);
612 
613 	/* if reintegration is in progress, must queue work */
614 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
615 	if (raidPtr->regionInfo[regionID].reintInProgress) {
616 		/* Can not proceed since this region is currently being
617 		 * reintegrated. We can not block, so queue remaining work and
618 		 * return */
619 		if (rf_parityLogDebug)
620 			printf("[region %d waiting on reintegration]\n", regionID);
621 		/* XXX not sure about the use of finish - shouldn't this
622 		 * always be "Enqueue"? */
623 		if (finish)
624 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
625 		else
626 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
627 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
628 		return (1);	/* relenquish control of this thread */
629 	}
630 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
631 	raidPtr->regionInfo[regionID].coreLog = NULL;
632 	if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
633 		/* IMPORTANT!! this loop bound assumes region disk holds an
634 		 * integral number of core logs */
635 	{
636 		/* update disk map for this region */
637 		diskCount = raidPtr->regionInfo[regionID].diskCount;
638 		for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
639 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
640 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
641 		}
642 		log->diskOffset = diskCount;
643 		raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
644 		FlushLog(raidPtr, log);
645 	} else {
646 		/* no room for log on disk, send it to disk manager and
647 		 * request reintegration */
648 		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
649 		ReintLog(raidPtr, regionID, log);
650 	}
651 	if (rf_parityLogDebug)
652 		printf("[finished dumping parity log to disk, region %d]\n", regionID);
653 	return (0);
654 }
655 
656 int
657 rf_ParityLogAppend(
658     RF_ParityLogData_t * logData,
659     int finish,
660     RF_ParityLog_t ** incomingLog,
661     int clearReintFlag)
662 {
663 	int     regionID, logItem, itemDone;
664 	RF_ParityLogData_t *item;
665 	int     punt, done = RF_FALSE;
666 	RF_ParityLog_t *log;
667 	RF_Raid_t *raidPtr;
668 	RF_Etimer_t timer;
669 	int     (*wakeFunc) (RF_DagNode_t * node, int status);
670 	void   *wakeArg;
671 
672 	/* Add parity to the appropriate log, one sector at a time. This
673 	 * routine is called is called by dag functions ParityLogUpdateFunc
674 	 * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
675 	 *
676 	 * Parity to be logged is contained in a linked-list (logData).  When
677 	 * this routine returns, every sector in the list will be in one of
678 	 * three places: 1) entered into the parity log 2) queued, waiting on
679 	 * reintegration 3) queued, waiting on a core log
680 	 *
681 	 * Blocked work is passed to the ParityLoggingDiskManager for completion.
682 	 * Later, as conditions which required the block are removed, the work
683 	 * reenters this routine with the "finish" parameter set to "RF_TRUE."
684 	 *
685 	 * NON-BLOCKING */
686 
687 	raidPtr = logData->common->raidPtr;
688 	/* lock the region for the first item in logData */
689 	RF_ASSERT(logData != NULL);
690 	regionID = logData->regionID;
691 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
692 	RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
693 
694 	if (clearReintFlag) {
695 		/* Enable flushing for this region.  Holding both locks
696 		 * provides a synchronization barrier with DumpParityLogToDisk */
697 		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
698 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
699 		RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
700 		raidPtr->regionInfo[regionID].diskCount = 0;
701 		raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
702 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
703 										 * enabled */
704 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
705 	}
706 	/* process each item in logData */
707 	while (logData) {
708 		/* remove an item from logData */
709 		item = logData;
710 		logData = logData->next;
711 		item->next = NULL;
712 		item->prev = NULL;
713 
714 		if (rf_parityLogDebug)
715 			printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);
716 
717 		/* see if we moved to a new region */
718 		if (regionID != item->regionID) {
719 			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
720 			regionID = item->regionID;
721 			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
722 			RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
723 		}
724 		punt = RF_FALSE;/* Set to RF_TRUE if work is blocked.  This
725 				 * can happen in one of two ways: 1) no core
726 				 * log (AcquireParityLog) 2) waiting on
727 				 * reintegration (DumpParityLogToDisk) If punt
728 				 * is RF_TRUE, the dataItem was queued, so
729 				 * skip to next item. */
730 
731 		/* process item, one sector at a time, until all sectors
732 		 * processed or we punt */
733 		if (item->diskAddress.numSector > 0)
734 			done = RF_FALSE;
735 		else
736 			RF_ASSERT(0);
737 		while (!punt && !done) {
738 			/* verify that a core log exists for this region */
739 			if (!raidPtr->regionInfo[regionID].coreLog) {
740 				/* Attempt to acquire a parity log. If
741 				 * acquisition fails, queue remaining work in
742 				 * data item and move to nextItem. */
743 				if (incomingLog)
744 					if (*incomingLog) {
745 						RF_ASSERT((*incomingLog)->next == NULL);
746 						raidPtr->regionInfo[regionID].coreLog = *incomingLog;
747 						raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
748 						*incomingLog = NULL;
749 					} else
750 						raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
751 				else
752 					raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
753 				/* Note: AcquireParityLog either returns a log
754 				 * or enqueues currentItem */
755 			}
756 			if (!raidPtr->regionInfo[regionID].coreLog)
757 				punt = RF_TRUE;	/* failed to find a core log */
758 			else {
759 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
760 				/* verify that the log has room for new
761 				 * entries */
762 				/* if log is full, dump it to disk and grab a
763 				 * new log */
764 				if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
765 					/* log is full, dump it to disk */
766 					if (DumpParityLogToDisk(finish, item))
767 						punt = RF_TRUE;	/* dump unsuccessful,
768 								 * blocked on
769 								 * reintegration */
770 					else {
771 						/* dump was successful */
772 						if (incomingLog)
773 							if (*incomingLog) {
774 								RF_ASSERT((*incomingLog)->next == NULL);
775 								raidPtr->regionInfo[regionID].coreLog = *incomingLog;
776 								raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
777 								*incomingLog = NULL;
778 							} else
779 								raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
780 						else
781 							raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
782 						/* if a core log is not
783 						 * available, must queue work
784 						 * and return */
785 						if (!raidPtr->regionInfo[regionID].coreLog)
786 							punt = RF_TRUE;	/* blocked on log
787 									 * availability */
788 					}
789 				}
790 			}
791 			/* if we didn't punt on this item, attempt to add a
792 			 * sector to the core log */
793 			if (!punt) {
794 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
795 				/* at this point, we have a core log with
796 				 * enough room for a sector */
797 				/* copy a sector into the log */
798 				log = raidPtr->regionInfo[regionID].coreLog;
799 				RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
800 				logItem = log->numRecords++;
801 				log->records[logItem].parityAddr = item->diskAddress;
802 				RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
803 				RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
804 				log->records[logItem].parityAddr.numSector = 1;
805 				log->records[logItem].operation = item->common->operation;
806 				memcpy((char *)log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), ((char *)item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), (1 << item->common->raidPtr->logBytesPerSector));
807 				item->diskAddress.numSector--;
808 				item->diskAddress.startSector++;
809 				if (item->diskAddress.numSector == 0)
810 					done = RF_TRUE;
811 			}
812 		}
813 
814 		if (!punt) {
815 			/* Processed this item completely, decrement count of
816 			 * items to be processed. */
817 			RF_ASSERT(item->diskAddress.numSector == 0);
818 			RF_LOCK_MUTEX(item->common->mutex);
819 			item->common->cnt--;
820 			if (item->common->cnt == 0)
821 				itemDone = RF_TRUE;
822 			else
823 				itemDone = RF_FALSE;
824 			RF_UNLOCK_MUTEX(item->common->mutex);
825 			if (itemDone) {
826 				/* Finished processing all log data for this
827 				 * IO Return structs to free list and invoke
828 				 * wakeup function. */
829 				timer = item->common->startTime;	/* grab initial value of
830 									 * timer */
831 				RF_ETIMER_STOP(timer);
832 				RF_ETIMER_EVAL(timer);
833 				item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
834 				if (rf_parityLogDebug)
835 					printf("[waking process for region %d]\n", item->regionID);
836 				wakeFunc = item->common->wakeFunc;
837 				wakeArg = item->common->wakeArg;
838 				FreeParityLogCommonData(item->common);
839 				FreeParityLogData(item);
840 				(wakeFunc) (wakeArg, 0);
841 			} else
842 				FreeParityLogData(item);
843 		}
844 	}
845 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
846 	if (rf_parityLogDebug)
847 		printf("[exiting ParityLogAppend]\n");
848 	return (0);
849 }
850 
851 
852 void
853 rf_EnableParityLogging(RF_Raid_t * raidPtr)
854 {
855 	int     regionID;
856 
857 	for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
858 		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
859 		raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
860 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
861 	}
862 	if (rf_parityLogDebug)
863 		printf("[parity logging enabled]\n");
864 }
865 #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
866