xref: /netbsd/sys/dev/raidframe/rf_paritylog.c (revision c4a72b64)
1 /*	$NetBSD: rf_paritylog.c,v 1.9 2002/09/14 17:53:58 oster Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /* Code for manipulating in-core parity logs
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_paritylog.c,v 1.9 2002/09/14 17:53:58 oster Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if RF_INCLUDE_PARITYLOGGING > 0
39 
40 /*
41  * Append-only log for recording parity "update" and "overwrite" records
42  */
43 
44 #include <dev/raidframe/raidframevar.h>
45 
46 #include "rf_threadstuff.h"
47 #include "rf_mcpair.h"
48 #include "rf_raid.h"
49 #include "rf_dag.h"
50 #include "rf_dagfuncs.h"
51 #include "rf_desc.h"
52 #include "rf_layout.h"
53 #include "rf_diskqueue.h"
54 #include "rf_etimer.h"
55 #include "rf_paritylog.h"
56 #include "rf_general.h"
57 #include "rf_map.h"
58 #include "rf_paritylogging.h"
59 #include "rf_paritylogDiskMgr.h"
60 
61 static RF_CommonLogData_t *
62 AllocParityLogCommonData(RF_Raid_t * raidPtr)
63 {
64 	RF_CommonLogData_t *common = NULL;
65 	int     rc;
66 
67 	/* Return a struct for holding common parity log information from the
68 	 * free list (rf_parityLogDiskQueue.freeCommonList).  If the free list
69 	 * is empty, call RF_Malloc to create a new structure. NON-BLOCKING */
70 
71 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
72 	if (raidPtr->parityLogDiskQueue.freeCommonList) {
73 		common = raidPtr->parityLogDiskQueue.freeCommonList;
74 		raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
75 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
76 	} else {
77 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
78 		RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
79 		rc = rf_mutex_init(&common->mutex);
80 		if (rc) {
81 			rf_print_unable_to_init_mutex(__FILE__, __LINE__, rc);
82 			RF_Free(common, sizeof(RF_CommonLogData_t));
83 			common = NULL;
84 		}
85 	}
86 	common->next = NULL;
87 	return (common);
88 }
89 
90 static void
91 FreeParityLogCommonData(RF_CommonLogData_t * common)
92 {
93 	RF_Raid_t *raidPtr;
94 
95 	/* Insert a single struct for holding parity log information (data)
96 	 * into the free list (rf_parityLogDiskQueue.freeCommonList).
97 	 * NON-BLOCKING */
98 
99 	raidPtr = common->raidPtr;
100 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
101 	common->next = raidPtr->parityLogDiskQueue.freeCommonList;
102 	raidPtr->parityLogDiskQueue.freeCommonList = common;
103 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
104 }
105 
106 static RF_ParityLogData_t *
107 AllocParityLogData(RF_Raid_t * raidPtr)
108 {
109 	RF_ParityLogData_t *data = NULL;
110 
111 	/* Return a struct for holding parity log information from the free
112 	 * list (rf_parityLogDiskQueue.freeList).  If the free list is empty,
113 	 * call RF_Malloc to create a new structure. NON-BLOCKING */
114 
115 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
116 	if (raidPtr->parityLogDiskQueue.freeDataList) {
117 		data = raidPtr->parityLogDiskQueue.freeDataList;
118 		raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
119 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
120 	} else {
121 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
122 		RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
123 	}
124 	data->next = NULL;
125 	data->prev = NULL;
126 	return (data);
127 }
128 
129 
130 static void
131 FreeParityLogData(RF_ParityLogData_t * data)
132 {
133 	RF_ParityLogData_t *nextItem;
134 	RF_Raid_t *raidPtr;
135 
136 	/* Insert a linked list of structs for holding parity log information
137 	 * (data) into the free list (parityLogDiskQueue.freeList).
138 	 * NON-BLOCKING */
139 
140 	raidPtr = data->common->raidPtr;
141 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
142 	while (data) {
143 		nextItem = data->next;
144 		data->next = raidPtr->parityLogDiskQueue.freeDataList;
145 		raidPtr->parityLogDiskQueue.freeDataList = data;
146 		data = nextItem;
147 	}
148 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
149 }
150 
151 
152 static void
153 EnqueueParityLogData(
154     RF_ParityLogData_t * data,
155     RF_ParityLogData_t ** head,
156     RF_ParityLogData_t ** tail)
157 {
158 	RF_Raid_t *raidPtr;
159 
160 	/* Insert an in-core parity log (*data) into the head of a disk queue
161 	 * (*head, *tail). NON-BLOCKING */
162 
163 	raidPtr = data->common->raidPtr;
164 	if (rf_parityLogDebug)
165 		printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
166 	RF_ASSERT(data->prev == NULL);
167 	RF_ASSERT(data->next == NULL);
168 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
169 	if (*head) {
170 		/* insert into head of queue */
171 		RF_ASSERT((*head)->prev == NULL);
172 		RF_ASSERT((*tail)->next == NULL);
173 		data->next = *head;
174 		(*head)->prev = data;
175 		*head = data;
176 	} else {
177 		/* insert into empty list */
178 		RF_ASSERT(*head == NULL);
179 		RF_ASSERT(*tail == NULL);
180 		*head = data;
181 		*tail = data;
182 	}
183 	RF_ASSERT((*head)->prev == NULL);
184 	RF_ASSERT((*tail)->next == NULL);
185 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
186 }
187 
188 static RF_ParityLogData_t *
189 DequeueParityLogData(
190     RF_Raid_t * raidPtr,
191     RF_ParityLogData_t ** head,
192     RF_ParityLogData_t ** tail,
193     int ignoreLocks)
194 {
195 	RF_ParityLogData_t *data;
196 
197 	/* Remove and return an in-core parity log from the tail of a disk
198 	 * queue (*head, *tail). NON-BLOCKING */
199 
200 	/* remove from tail, preserving FIFO order */
201 	if (!ignoreLocks)
202 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
203 	data = *tail;
204 	if (data) {
205 		if (*head == *tail) {
206 			/* removing last item from queue */
207 			*head = NULL;
208 			*tail = NULL;
209 		} else {
210 			*tail = (*tail)->prev;
211 			(*tail)->next = NULL;
212 			RF_ASSERT((*head)->prev == NULL);
213 			RF_ASSERT((*tail)->next == NULL);
214 		}
215 		data->next = NULL;
216 		data->prev = NULL;
217 		if (rf_parityLogDebug)
218 			printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
219 	}
220 	if (*head) {
221 		RF_ASSERT((*head)->prev == NULL);
222 		RF_ASSERT((*tail)->next == NULL);
223 	}
224 	if (!ignoreLocks)
225 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
226 	return (data);
227 }
228 
229 
230 static void
231 RequeueParityLogData(
232     RF_ParityLogData_t * data,
233     RF_ParityLogData_t ** head,
234     RF_ParityLogData_t ** tail)
235 {
236 	RF_Raid_t *raidPtr;
237 
238 	/* Insert an in-core parity log (*data) into the tail of a disk queue
239 	 * (*head, *tail). NON-BLOCKING */
240 
241 	raidPtr = data->common->raidPtr;
242 	RF_ASSERT(data);
243 	if (rf_parityLogDebug)
244 		printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
245 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
246 	if (*tail) {
247 		/* append to tail of list */
248 		data->prev = *tail;
249 		data->next = NULL;
250 		(*tail)->next = data;
251 		*tail = data;
252 	} else {
253 		/* inserting into an empty list */
254 		*head = data;
255 		*tail = data;
256 		(*head)->prev = NULL;
257 		(*tail)->next = NULL;
258 	}
259 	RF_ASSERT((*head)->prev == NULL);
260 	RF_ASSERT((*tail)->next == NULL);
261 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
262 }
263 
264 RF_ParityLogData_t *
265 rf_CreateParityLogData(
266     RF_ParityRecordType_t operation,
267     RF_PhysDiskAddr_t * pda,
268     caddr_t bufPtr,
269     RF_Raid_t * raidPtr,
270     int (*wakeFunc) (RF_DagNode_t * node, int status),
271     void *wakeArg,
272     RF_AccTraceEntry_t * tracerec,
273     RF_Etimer_t startTime)
274 {
275 	RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
276 	RF_CommonLogData_t *common;
277 	RF_PhysDiskAddr_t *diskAddress;
278 	int     boundary, offset = 0;
279 
280 	/* Return an initialized struct of info to be logged. Build one item
281 	 * per physical disk address, one item per region.
282 	 *
283 	 * NON-BLOCKING */
284 
285 	diskAddress = pda;
286 	common = AllocParityLogCommonData(raidPtr);
287 	RF_ASSERT(common);
288 
289 	common->operation = operation;
290 	common->bufPtr = bufPtr;
291 	common->raidPtr = raidPtr;
292 	common->wakeFunc = wakeFunc;
293 	common->wakeArg = wakeArg;
294 	common->tracerec = tracerec;
295 	common->startTime = startTime;
296 	common->cnt = 0;
297 
298 	if (rf_parityLogDebug)
299 		printf("[entering CreateParityLogData]\n");
300 	while (diskAddress) {
301 		common->cnt++;
302 		data = AllocParityLogData(raidPtr);
303 		RF_ASSERT(data);
304 		data->common = common;
305 		data->next = NULL;
306 		data->prev = NULL;
307 		data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
308 		if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
309 			/* disk address does not cross a region boundary */
310 			data->diskAddress = *diskAddress;
311 			data->bufOffset = offset;
312 			offset = offset + diskAddress->numSector;
313 			EnqueueParityLogData(data, &resultHead, &resultTail);
314 			/* adjust disk address */
315 			diskAddress = diskAddress->next;
316 		} else {
317 			/* disk address crosses a region boundary */
318 			/* find address where region is crossed */
319 			boundary = 0;
320 			while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
321 				boundary++;
322 
323 			/* enter data before the boundary */
324 			data->diskAddress = *diskAddress;
325 			data->diskAddress.numSector = boundary;
326 			data->bufOffset = offset;
327 			offset += boundary;
328 			EnqueueParityLogData(data, &resultHead, &resultTail);
329 			/* adjust disk address */
330 			diskAddress->startSector += boundary;
331 			diskAddress->numSector -= boundary;
332 		}
333 	}
334 	if (rf_parityLogDebug)
335 		printf("[leaving CreateParityLogData]\n");
336 	return (resultHead);
337 }
338 
339 
340 RF_ParityLogData_t *
341 rf_SearchAndDequeueParityLogData(
342     RF_Raid_t * raidPtr,
343     int regionID,
344     RF_ParityLogData_t ** head,
345     RF_ParityLogData_t ** tail,
346     int ignoreLocks)
347 {
348 	RF_ParityLogData_t *w;
349 
350 	/* Remove and return an in-core parity log from a specified region
351 	 * (regionID). If a matching log is not found, return NULL.
352 	 *
353 	 * NON-BLOCKING. */
354 
355 	/* walk backward through a list, looking for an entry with a matching
356 	 * region ID */
357 	if (!ignoreLocks)
358 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
359 	w = (*tail);
360 	while (w) {
361 		if (w->regionID == regionID) {
362 			/* remove an element from the list */
363 			if (w == *tail) {
364 				if (*head == *tail) {
365 					/* removing only element in the list */
366 					*head = NULL;
367 					*tail = NULL;
368 				} else {
369 					/* removing last item in the list */
370 					*tail = (*tail)->prev;
371 					(*tail)->next = NULL;
372 					RF_ASSERT((*head)->prev == NULL);
373 					RF_ASSERT((*tail)->next == NULL);
374 				}
375 			} else {
376 				if (w == *head) {
377 					/* removing first item in the list */
378 					*head = (*head)->next;
379 					(*head)->prev = NULL;
380 					RF_ASSERT((*head)->prev == NULL);
381 					RF_ASSERT((*tail)->next == NULL);
382 				} else {
383 					/* removing an item from the middle of
384 					 * the list */
385 					w->prev->next = w->next;
386 					w->next->prev = w->prev;
387 					RF_ASSERT((*head)->prev == NULL);
388 					RF_ASSERT((*tail)->next == NULL);
389 				}
390 			}
391 			w->prev = NULL;
392 			w->next = NULL;
393 			if (rf_parityLogDebug)
394 				printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
395 			return (w);
396 		} else
397 			w = w->prev;
398 	}
399 	if (!ignoreLocks)
400 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
401 	return (NULL);
402 }
403 
404 static RF_ParityLogData_t *
405 DequeueMatchingLogData(
406     RF_Raid_t * raidPtr,
407     RF_ParityLogData_t ** head,
408     RF_ParityLogData_t ** tail)
409 {
410 	RF_ParityLogData_t *logDataList, *logData;
411 	int     regionID;
412 
413 	/* Remove and return an in-core parity log from the tail of a disk
414 	 * queue (*head, *tail).  Then remove all matching (identical
415 	 * regionIDs) logData and return as a linked list.
416 	 *
417 	 * NON-BLOCKING */
418 
419 	logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
420 	if (logDataList) {
421 		regionID = logDataList->regionID;
422 		logData = logDataList;
423 		logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
424 		while (logData->next) {
425 			logData = logData->next;
426 			logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
427 		}
428 	}
429 	return (logDataList);
430 }
431 
432 
433 static RF_ParityLog_t *
434 AcquireParityLog(
435     RF_ParityLogData_t * logData,
436     int finish)
437 {
438 	RF_ParityLog_t *log = NULL;
439 	RF_Raid_t *raidPtr;
440 
441 	/* Grab a log buffer from the pool and return it. If no buffers are
442 	 * available, return NULL. NON-BLOCKING */
443 	raidPtr = logData->common->raidPtr;
444 	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
445 	if (raidPtr->parityLogPool.parityLogs) {
446 		log = raidPtr->parityLogPool.parityLogs;
447 		raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
448 		log->regionID = logData->regionID;
449 		log->numRecords = 0;
450 		log->next = NULL;
451 		raidPtr->logsInUse++;
452 		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
453 	} else {
454 		/* no logs available, so place ourselves on the queue of work
455 		 * waiting on log buffers this is done while
456 		 * parityLogPool.mutex is held, to ensure synchronization with
457 		 * ReleaseParityLogs. */
458 		if (rf_parityLogDebug)
459 			printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
460 		if (finish)
461 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
462 		else
463 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
464 	}
465 	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
466 	return (log);
467 }
468 
469 void
470 rf_ReleaseParityLogs(
471     RF_Raid_t * raidPtr,
472     RF_ParityLog_t * firstLog)
473 {
474 	RF_ParityLogData_t *logDataList;
475 	RF_ParityLog_t *log, *lastLog;
476 	int     cnt;
477 
478 	/* Insert a linked list of parity logs (firstLog) to the free list
479 	 * (parityLogPool.parityLogPool)
480 	 *
481 	 * NON-BLOCKING. */
482 
483 	RF_ASSERT(firstLog);
484 
485 	/* Before returning logs to global free list, service all requests
486 	 * which are blocked on logs.  Holding mutexes for parityLogPool and
487 	 * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
488 	RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
489 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
490 	logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
491 	log = firstLog;
492 	if (firstLog)
493 		firstLog = firstLog->next;
494 	log->numRecords = 0;
495 	log->next = NULL;
496 	while (logDataList && log) {
497 		RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
498 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
499 		rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
500 		if (rf_parityLogDebug)
501 			printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
502 		if (log == NULL) {
503 			log = firstLog;
504 			if (firstLog) {
505 				firstLog = firstLog->next;
506 				log->numRecords = 0;
507 				log->next = NULL;
508 			}
509 		}
510 		RF_LOCK_MUTEX(raidPtr->parityLogPool.mutex);
511 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
512 		if (log)
513 			logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
514 	}
515 	/* return remaining logs to pool */
516 	if (log) {
517 		log->next = firstLog;
518 		firstLog = log;
519 	}
520 	if (firstLog) {
521 		lastLog = firstLog;
522 		raidPtr->logsInUse--;
523 		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
524 		while (lastLog->next) {
525 			lastLog = lastLog->next;
526 			raidPtr->logsInUse--;
527 			RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
528 		}
529 		lastLog->next = raidPtr->parityLogPool.parityLogs;
530 		raidPtr->parityLogPool.parityLogs = firstLog;
531 		cnt = 0;
532 		log = raidPtr->parityLogPool.parityLogs;
533 		while (log) {
534 			cnt++;
535 			log = log->next;
536 		}
537 		RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
538 	}
539 	RF_UNLOCK_MUTEX(raidPtr->parityLogPool.mutex);
540 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
541 }
542 
543 static void
544 ReintLog(
545     RF_Raid_t * raidPtr,
546     int regionID,
547     RF_ParityLog_t * log)
548 {
549 	RF_ASSERT(log);
550 
551 	/* Insert an in-core parity log (log) into the disk queue of
552 	 * reintegration work.  Set the flag (reintInProgress) for the
553 	 * specified region (regionID) to indicate that reintegration is in
554 	 * progress for this region. NON-BLOCKING */
555 
556 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
557 	raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;	/* cleared when reint
558 									 * complete */
559 
560 	if (rf_parityLogDebug)
561 		printf("[requesting reintegration of region %d]\n", log->regionID);
562 	/* move record to reintegration queue */
563 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
564 	log->next = raidPtr->parityLogDiskQueue.reintQueue;
565 	raidPtr->parityLogDiskQueue.reintQueue = log;
566 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
567 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
568 	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
569 }
570 
571 static void
572 FlushLog(
573     RF_Raid_t * raidPtr,
574     RF_ParityLog_t * log)
575 {
576 	/* insert a core log (log) into a list of logs
577 	 * (parityLogDiskQueue.flushQueue) waiting to be written to disk.
578 	 * NON-BLOCKING */
579 
580 	RF_ASSERT(log);
581 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
582 	RF_ASSERT(log->next == NULL);
583 	/* move log to flush queue */
584 	RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
585 	log->next = raidPtr->parityLogDiskQueue.flushQueue;
586 	raidPtr->parityLogDiskQueue.flushQueue = log;
587 	RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
588 	RF_SIGNAL_COND(raidPtr->parityLogDiskQueue.cond);
589 }
590 
591 static int
592 DumpParityLogToDisk(
593     int finish,
594     RF_ParityLogData_t * logData)
595 {
596 	int     i, diskCount, regionID = logData->regionID;
597 	RF_ParityLog_t *log;
598 	RF_Raid_t *raidPtr;
599 
600 	raidPtr = logData->common->raidPtr;
601 
602 	/* Move a core log to disk.  If the log disk is full, initiate
603 	 * reintegration.
604 	 *
605 	 * Return (0) if we can enqueue the dump immediately, otherwise return
606 	 * (1) to indicate we are blocked on reintegration and control of the
607 	 * thread should be relinquished.
608 	 *
609 	 * Caller must hold regionInfo[regionID].mutex
610 	 *
611 	 * NON-BLOCKING */
612 
613 	if (rf_parityLogDebug)
614 		printf("[dumping parity log to disk, region %d]\n", regionID);
615 	log = raidPtr->regionInfo[regionID].coreLog;
616 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
617 	RF_ASSERT(log->next == NULL);
618 
619 	/* if reintegration is in progress, must queue work */
620 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
621 	if (raidPtr->regionInfo[regionID].reintInProgress) {
622 		/* Can not proceed since this region is currently being
623 		 * reintegrated. We can not block, so queue remaining work and
624 		 * return */
625 		if (rf_parityLogDebug)
626 			printf("[region %d waiting on reintegration]\n", regionID);
627 		/* XXX not sure about the use of finish - shouldn't this
628 		 * always be "Enqueue"? */
629 		if (finish)
630 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
631 		else
632 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
633 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
634 		return (1);	/* relenquish control of this thread */
635 	}
636 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
637 	raidPtr->regionInfo[regionID].coreLog = NULL;
638 	if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
639 		/* IMPORTANT!! this loop bound assumes region disk holds an
640 		 * integral number of core logs */
641 	{
642 		/* update disk map for this region */
643 		diskCount = raidPtr->regionInfo[regionID].diskCount;
644 		for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
645 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
646 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
647 		}
648 		log->diskOffset = diskCount;
649 		raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
650 		FlushLog(raidPtr, log);
651 	} else {
652 		/* no room for log on disk, send it to disk manager and
653 		 * request reintegration */
654 		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
655 		ReintLog(raidPtr, regionID, log);
656 	}
657 	if (rf_parityLogDebug)
658 		printf("[finished dumping parity log to disk, region %d]\n", regionID);
659 	return (0);
660 }
661 
662 int
663 rf_ParityLogAppend(
664     RF_ParityLogData_t * logData,
665     int finish,
666     RF_ParityLog_t ** incomingLog,
667     int clearReintFlag)
668 {
669 	int     regionID, logItem, itemDone;
670 	RF_ParityLogData_t *item;
671 	int     punt, done = RF_FALSE;
672 	RF_ParityLog_t *log;
673 	RF_Raid_t *raidPtr;
674 	RF_Etimer_t timer;
675 	int     (*wakeFunc) (RF_DagNode_t * node, int status);
676 	void   *wakeArg;
677 
678 	/* Add parity to the appropriate log, one sector at a time. This
679 	 * routine is called is called by dag functions ParityLogUpdateFunc
680 	 * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
681 	 *
682 	 * Parity to be logged is contained in a linked-list (logData).  When
683 	 * this routine returns, every sector in the list will be in one of
684 	 * three places: 1) entered into the parity log 2) queued, waiting on
685 	 * reintegration 3) queued, waiting on a core log
686 	 *
687 	 * Blocked work is passed to the ParityLoggingDiskManager for completion.
688 	 * Later, as conditions which required the block are removed, the work
689 	 * reenters this routine with the "finish" parameter set to "RF_TRUE."
690 	 *
691 	 * NON-BLOCKING */
692 
693 	raidPtr = logData->common->raidPtr;
694 	/* lock the region for the first item in logData */
695 	RF_ASSERT(logData != NULL);
696 	regionID = logData->regionID;
697 	RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
698 	RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
699 
700 	if (clearReintFlag) {
701 		/* Enable flushing for this region.  Holding both locks
702 		 * provides a synchronization barrier with DumpParityLogToDisk */
703 		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);
704 		RF_LOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
705 		RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
706 		raidPtr->regionInfo[regionID].diskCount = 0;
707 		raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
708 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
709 										 * enabled */
710 		RF_UNLOCK_MUTEX(raidPtr->parityLogDiskQueue.mutex);
711 	}
712 	/* process each item in logData */
713 	while (logData) {
714 		/* remove an item from logData */
715 		item = logData;
716 		logData = logData->next;
717 		item->next = NULL;
718 		item->prev = NULL;
719 
720 		if (rf_parityLogDebug)
721 			printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);
722 
723 		/* see if we moved to a new region */
724 		if (regionID != item->regionID) {
725 			RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
726 			regionID = item->regionID;
727 			RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
728 			RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
729 		}
730 		punt = RF_FALSE;/* Set to RF_TRUE if work is blocked.  This
731 				 * can happen in one of two ways: 1) no core
732 				 * log (AcquireParityLog) 2) waiting on
733 				 * reintegration (DumpParityLogToDisk) If punt
734 				 * is RF_TRUE, the dataItem was queued, so
735 				 * skip to next item. */
736 
737 		/* process item, one sector at a time, until all sectors
738 		 * processed or we punt */
739 		if (item->diskAddress.numSector > 0)
740 			done = RF_FALSE;
741 		else
742 			RF_ASSERT(0);
743 		while (!punt && !done) {
744 			/* verify that a core log exists for this region */
745 			if (!raidPtr->regionInfo[regionID].coreLog) {
746 				/* Attempt to acquire a parity log. If
747 				 * acquisition fails, queue remaining work in
748 				 * data item and move to nextItem. */
749 				if (incomingLog)
750 					if (*incomingLog) {
751 						RF_ASSERT((*incomingLog)->next == NULL);
752 						raidPtr->regionInfo[regionID].coreLog = *incomingLog;
753 						raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
754 						*incomingLog = NULL;
755 					} else
756 						raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
757 				else
758 					raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
759 				/* Note: AcquireParityLog either returns a log
760 				 * or enqueues currentItem */
761 			}
762 			if (!raidPtr->regionInfo[regionID].coreLog)
763 				punt = RF_TRUE;	/* failed to find a core log */
764 			else {
765 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
766 				/* verify that the log has room for new
767 				 * entries */
768 				/* if log is full, dump it to disk and grab a
769 				 * new log */
770 				if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
771 					/* log is full, dump it to disk */
772 					if (DumpParityLogToDisk(finish, item))
773 						punt = RF_TRUE;	/* dump unsuccessful,
774 								 * blocked on
775 								 * reintegration */
776 					else {
777 						/* dump was successful */
778 						if (incomingLog)
779 							if (*incomingLog) {
780 								RF_ASSERT((*incomingLog)->next == NULL);
781 								raidPtr->regionInfo[regionID].coreLog = *incomingLog;
782 								raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
783 								*incomingLog = NULL;
784 							} else
785 								raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
786 						else
787 							raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
788 						/* if a core log is not
789 						 * available, must queue work
790 						 * and return */
791 						if (!raidPtr->regionInfo[regionID].coreLog)
792 							punt = RF_TRUE;	/* blocked on log
793 									 * availability */
794 					}
795 				}
796 			}
797 			/* if we didn't punt on this item, attempt to add a
798 			 * sector to the core log */
799 			if (!punt) {
800 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
801 				/* at this point, we have a core log with
802 				 * enough room for a sector */
803 				/* copy a sector into the log */
804 				log = raidPtr->regionInfo[regionID].coreLog;
805 				RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
806 				logItem = log->numRecords++;
807 				log->records[logItem].parityAddr = item->diskAddress;
808 				RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
809 				RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
810 				log->records[logItem].parityAddr.numSector = 1;
811 				log->records[logItem].operation = item->common->operation;
812 				memcpy(log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), (item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), (1 << item->common->raidPtr->logBytesPerSector));
813 				item->diskAddress.numSector--;
814 				item->diskAddress.startSector++;
815 				if (item->diskAddress.numSector == 0)
816 					done = RF_TRUE;
817 			}
818 		}
819 
820 		if (!punt) {
821 			/* Processed this item completely, decrement count of
822 			 * items to be processed. */
823 			RF_ASSERT(item->diskAddress.numSector == 0);
824 			RF_LOCK_MUTEX(item->common->mutex);
825 			item->common->cnt--;
826 			if (item->common->cnt == 0)
827 				itemDone = RF_TRUE;
828 			else
829 				itemDone = RF_FALSE;
830 			RF_UNLOCK_MUTEX(item->common->mutex);
831 			if (itemDone) {
832 				/* Finished processing all log data for this
833 				 * IO Return structs to free list and invoke
834 				 * wakeup function. */
835 				timer = item->common->startTime;	/* grab initial value of
836 									 * timer */
837 				RF_ETIMER_STOP(timer);
838 				RF_ETIMER_EVAL(timer);
839 				item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
840 				if (rf_parityLogDebug)
841 					printf("[waking process for region %d]\n", item->regionID);
842 				wakeFunc = item->common->wakeFunc;
843 				wakeArg = item->common->wakeArg;
844 				FreeParityLogCommonData(item->common);
845 				FreeParityLogData(item);
846 				(wakeFunc) (wakeArg, 0);
847 			} else
848 				FreeParityLogData(item);
849 		}
850 	}
851 	RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
852 	if (rf_parityLogDebug)
853 		printf("[exiting ParityLogAppend]\n");
854 	return (0);
855 }
856 
857 
858 void
859 rf_EnableParityLogging(RF_Raid_t * raidPtr)
860 {
861 	int     regionID;
862 
863 	for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
864 		RF_LOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
865 		raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
866 		RF_UNLOCK_MUTEX(raidPtr->regionInfo[regionID].mutex);
867 	}
868 	if (rf_parityLogDebug)
869 		printf("[parity logging enabled]\n");
870 }
871 #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
872