1 /*	$NetBSD: rf_paritylog.c,v 1.18 2011/05/11 06:03:06 mrg Exp $	*/
2 /*
3  * Copyright (c) 1995 Carnegie-Mellon University.
4  * All rights reserved.
5  *
6  * Author: William V. Courtright II
7  *
8  * Permission to use, copy, modify and distribute this software and
9  * its documentation is hereby granted, provided that both the copyright
10  * notice and this permission notice appear in all copies of the
11  * software, derivative works or modified versions, and any portions
12  * thereof, and that both notices appear in supporting documentation.
13  *
14  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
15  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
16  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
17  *
18  * Carnegie Mellon requests users of this software to return to
19  *
20  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
21  *  School of Computer Science
22  *  Carnegie Mellon University
23  *  Pittsburgh PA 15213-3890
24  *
25  * any improvements or extensions that they make and grant Carnegie the
26  * rights to redistribute these changes.
27  */
28 
29 /* Code for manipulating in-core parity logs
30  *
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: rf_paritylog.c,v 1.18 2011/05/11 06:03:06 mrg Exp $");
35 
36 #include "rf_archs.h"
37 
38 #if RF_INCLUDE_PARITYLOGGING > 0
39 
40 /*
41  * Append-only log for recording parity "update" and "overwrite" records
42  */
43 
44 #include <dev/raidframe/raidframevar.h>
45 
46 #include "rf_threadstuff.h"
47 #include "rf_mcpair.h"
48 #include "rf_raid.h"
49 #include "rf_dag.h"
50 #include "rf_dagfuncs.h"
51 #include "rf_desc.h"
52 #include "rf_layout.h"
53 #include "rf_diskqueue.h"
54 #include "rf_etimer.h"
55 #include "rf_paritylog.h"
56 #include "rf_general.h"
57 #include "rf_map.h"
58 #include "rf_paritylogging.h"
59 #include "rf_paritylogDiskMgr.h"
60 
61 static RF_CommonLogData_t *
AllocParityLogCommonData(RF_Raid_t * raidPtr)62 AllocParityLogCommonData(RF_Raid_t * raidPtr)
63 {
64 	RF_CommonLogData_t *common = NULL;
65 
66 	/* Return a struct for holding common parity log information from the
67 	 * free list (rf_parityLogDiskQueue.freeCommonList).  If the free list
68 	 * is empty, call RF_Malloc to create a new structure. NON-BLOCKING */
69 
70 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
71 	if (raidPtr->parityLogDiskQueue.freeCommonList) {
72 		common = raidPtr->parityLogDiskQueue.freeCommonList;
73 		raidPtr->parityLogDiskQueue.freeCommonList = raidPtr->parityLogDiskQueue.freeCommonList->next;
74 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
75 	} else {
76 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
77 		RF_Malloc(common, sizeof(RF_CommonLogData_t), (RF_CommonLogData_t *));
78 		/* destroy is in rf_paritylogging.c */
79 		rf_init_mutex2(common->mutex, IPL_VM);
80 	}
81 	common->next = NULL;
82 	return (common);
83 }
84 
85 static void
FreeParityLogCommonData(RF_CommonLogData_t * common)86 FreeParityLogCommonData(RF_CommonLogData_t * common)
87 {
88 	RF_Raid_t *raidPtr;
89 
90 	/* Insert a single struct for holding parity log information (data)
91 	 * into the free list (rf_parityLogDiskQueue.freeCommonList).
92 	 * NON-BLOCKING */
93 
94 	raidPtr = common->raidPtr;
95 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
96 	common->next = raidPtr->parityLogDiskQueue.freeCommonList;
97 	raidPtr->parityLogDiskQueue.freeCommonList = common;
98 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
99 }
100 
101 static RF_ParityLogData_t *
AllocParityLogData(RF_Raid_t * raidPtr)102 AllocParityLogData(RF_Raid_t * raidPtr)
103 {
104 	RF_ParityLogData_t *data = NULL;
105 
106 	/* Return a struct for holding parity log information from the free
107 	 * list (rf_parityLogDiskQueue.freeList).  If the free list is empty,
108 	 * call RF_Malloc to create a new structure. NON-BLOCKING */
109 
110 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
111 	if (raidPtr->parityLogDiskQueue.freeDataList) {
112 		data = raidPtr->parityLogDiskQueue.freeDataList;
113 		raidPtr->parityLogDiskQueue.freeDataList = raidPtr->parityLogDiskQueue.freeDataList->next;
114 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
115 	} else {
116 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
117 		RF_Malloc(data, sizeof(RF_ParityLogData_t), (RF_ParityLogData_t *));
118 	}
119 	data->next = NULL;
120 	data->prev = NULL;
121 	return (data);
122 }
123 
124 
125 static void
FreeParityLogData(RF_ParityLogData_t * data)126 FreeParityLogData(RF_ParityLogData_t * data)
127 {
128 	RF_ParityLogData_t *nextItem;
129 	RF_Raid_t *raidPtr;
130 
131 	/* Insert a linked list of structs for holding parity log information
132 	 * (data) into the free list (parityLogDiskQueue.freeList).
133 	 * NON-BLOCKING */
134 
135 	raidPtr = data->common->raidPtr;
136 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
137 	while (data) {
138 		nextItem = data->next;
139 		data->next = raidPtr->parityLogDiskQueue.freeDataList;
140 		raidPtr->parityLogDiskQueue.freeDataList = data;
141 		data = nextItem;
142 	}
143 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
144 }
145 
146 
147 static void
EnqueueParityLogData(RF_ParityLogData_t * data,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)148 EnqueueParityLogData(
149     RF_ParityLogData_t * data,
150     RF_ParityLogData_t ** head,
151     RF_ParityLogData_t ** tail)
152 {
153 	RF_Raid_t *raidPtr;
154 
155 	/* Insert an in-core parity log (*data) into the head of a disk queue
156 	 * (*head, *tail). NON-BLOCKING */
157 
158 	raidPtr = data->common->raidPtr;
159 	if (rf_parityLogDebug)
160 		printf("[enqueueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
161 	RF_ASSERT(data->prev == NULL);
162 	RF_ASSERT(data->next == NULL);
163 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
164 	if (*head) {
165 		/* insert into head of queue */
166 		RF_ASSERT((*head)->prev == NULL);
167 		RF_ASSERT((*tail)->next == NULL);
168 		data->next = *head;
169 		(*head)->prev = data;
170 		*head = data;
171 	} else {
172 		/* insert into empty list */
173 		RF_ASSERT(*head == NULL);
174 		RF_ASSERT(*tail == NULL);
175 		*head = data;
176 		*tail = data;
177 	}
178 	RF_ASSERT((*head)->prev == NULL);
179 	RF_ASSERT((*tail)->next == NULL);
180 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
181 }
182 
183 static RF_ParityLogData_t *
DequeueParityLogData(RF_Raid_t * raidPtr,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail,int ignoreLocks)184 DequeueParityLogData(
185     RF_Raid_t * raidPtr,
186     RF_ParityLogData_t ** head,
187     RF_ParityLogData_t ** tail,
188     int ignoreLocks)
189 {
190 	RF_ParityLogData_t *data;
191 
192 	/* Remove and return an in-core parity log from the tail of a disk
193 	 * queue (*head, *tail). NON-BLOCKING */
194 
195 	/* remove from tail, preserving FIFO order */
196 	if (!ignoreLocks)
197 		rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
198 	data = *tail;
199 	if (data) {
200 		if (*head == *tail) {
201 			/* removing last item from queue */
202 			*head = NULL;
203 			*tail = NULL;
204 		} else {
205 			*tail = (*tail)->prev;
206 			(*tail)->next = NULL;
207 			RF_ASSERT((*head)->prev == NULL);
208 			RF_ASSERT((*tail)->next == NULL);
209 		}
210 		data->next = NULL;
211 		data->prev = NULL;
212 		if (rf_parityLogDebug)
213 			printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
214 	}
215 	if (*head) {
216 		RF_ASSERT((*head)->prev == NULL);
217 		RF_ASSERT((*tail)->next == NULL);
218 	}
219 	if (!ignoreLocks)
220 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
221 	return (data);
222 }
223 
224 
225 static void
RequeueParityLogData(RF_ParityLogData_t * data,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)226 RequeueParityLogData(
227     RF_ParityLogData_t * data,
228     RF_ParityLogData_t ** head,
229     RF_ParityLogData_t ** tail)
230 {
231 	RF_Raid_t *raidPtr;
232 
233 	/* Insert an in-core parity log (*data) into the tail of a disk queue
234 	 * (*head, *tail). NON-BLOCKING */
235 
236 	raidPtr = data->common->raidPtr;
237 	RF_ASSERT(data);
238 	if (rf_parityLogDebug)
239 		printf("[requeueing parity log data, region %d, raidAddress %d, numSector %d]\n", data->regionID, (int) data->diskAddress.raidAddress, (int) data->diskAddress.numSector);
240 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
241 	if (*tail) {
242 		/* append to tail of list */
243 		data->prev = *tail;
244 		data->next = NULL;
245 		(*tail)->next = data;
246 		*tail = data;
247 	} else {
248 		/* inserting into an empty list */
249 		*head = data;
250 		*tail = data;
251 		(*head)->prev = NULL;
252 		(*tail)->next = NULL;
253 	}
254 	RF_ASSERT((*head)->prev == NULL);
255 	RF_ASSERT((*tail)->next == NULL);
256 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
257 }
258 
259 RF_ParityLogData_t *
rf_CreateParityLogData(RF_ParityRecordType_t operation,RF_PhysDiskAddr_t * pda,void * bufPtr,RF_Raid_t * raidPtr,int (* wakeFunc)(RF_DagNode_t * node,int status),void * wakeArg,RF_AccTraceEntry_t * tracerec,RF_Etimer_t startTime)260 rf_CreateParityLogData(
261     RF_ParityRecordType_t operation,
262     RF_PhysDiskAddr_t * pda,
263     void *bufPtr,
264     RF_Raid_t * raidPtr,
265     int (*wakeFunc) (RF_DagNode_t * node, int status),
266     void *wakeArg,
267     RF_AccTraceEntry_t * tracerec,
268     RF_Etimer_t startTime)
269 {
270 	RF_ParityLogData_t *data, *resultHead = NULL, *resultTail = NULL;
271 	RF_CommonLogData_t *common;
272 	RF_PhysDiskAddr_t *diskAddress;
273 	int     boundary, offset = 0;
274 
275 	/* Return an initialized struct of info to be logged. Build one item
276 	 * per physical disk address, one item per region.
277 	 *
278 	 * NON-BLOCKING */
279 
280 	diskAddress = pda;
281 	common = AllocParityLogCommonData(raidPtr);
282 	RF_ASSERT(common);
283 
284 	common->operation = operation;
285 	common->bufPtr = bufPtr;
286 	common->raidPtr = raidPtr;
287 	common->wakeFunc = wakeFunc;
288 	common->wakeArg = wakeArg;
289 	common->tracerec = tracerec;
290 	common->startTime = startTime;
291 	common->cnt = 0;
292 
293 	if (rf_parityLogDebug)
294 		printf("[entering CreateParityLogData]\n");
295 	while (diskAddress) {
296 		common->cnt++;
297 		data = AllocParityLogData(raidPtr);
298 		RF_ASSERT(data);
299 		data->common = common;
300 		data->next = NULL;
301 		data->prev = NULL;
302 		data->regionID = rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector);
303 		if (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + diskAddress->numSector - 1)) {
304 			/* disk address does not cross a region boundary */
305 			data->diskAddress = *diskAddress;
306 			data->bufOffset = offset;
307 			offset = offset + diskAddress->numSector;
308 			EnqueueParityLogData(data, &resultHead, &resultTail);
309 			/* adjust disk address */
310 			diskAddress = diskAddress->next;
311 		} else {
312 			/* disk address crosses a region boundary */
313 			/* find address where region is crossed */
314 			boundary = 0;
315 			while (data->regionID == rf_MapRegionIDParityLogging(raidPtr, diskAddress->startSector + boundary))
316 				boundary++;
317 
318 			/* enter data before the boundary */
319 			data->diskAddress = *diskAddress;
320 			data->diskAddress.numSector = boundary;
321 			data->bufOffset = offset;
322 			offset += boundary;
323 			EnqueueParityLogData(data, &resultHead, &resultTail);
324 			/* adjust disk address */
325 			diskAddress->startSector += boundary;
326 			diskAddress->numSector -= boundary;
327 		}
328 	}
329 	if (rf_parityLogDebug)
330 		printf("[leaving CreateParityLogData]\n");
331 	return (resultHead);
332 }
333 
334 
335 RF_ParityLogData_t *
rf_SearchAndDequeueParityLogData(RF_Raid_t * raidPtr,int regionID,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail,int ignoreLocks)336 rf_SearchAndDequeueParityLogData(
337     RF_Raid_t * raidPtr,
338     int regionID,
339     RF_ParityLogData_t ** head,
340     RF_ParityLogData_t ** tail,
341     int ignoreLocks)
342 {
343 	RF_ParityLogData_t *w;
344 
345 	/* Remove and return an in-core parity log from a specified region
346 	 * (regionID). If a matching log is not found, return NULL.
347 	 *
348 	 * NON-BLOCKING. */
349 
350 	/* walk backward through a list, looking for an entry with a matching
351 	 * region ID */
352 	if (!ignoreLocks)
353 		rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
354 	w = (*tail);
355 	while (w) {
356 		if (w->regionID == regionID) {
357 			/* remove an element from the list */
358 			if (w == *tail) {
359 				if (*head == *tail) {
360 					/* removing only element in the list */
361 					*head = NULL;
362 					*tail = NULL;
363 				} else {
364 					/* removing last item in the list */
365 					*tail = (*tail)->prev;
366 					(*tail)->next = NULL;
367 					RF_ASSERT((*head)->prev == NULL);
368 					RF_ASSERT((*tail)->next == NULL);
369 				}
370 			} else {
371 				if (w == *head) {
372 					/* removing first item in the list */
373 					*head = (*head)->next;
374 					(*head)->prev = NULL;
375 					RF_ASSERT((*head)->prev == NULL);
376 					RF_ASSERT((*tail)->next == NULL);
377 				} else {
378 					/* removing an item from the middle of
379 					 * the list */
380 					w->prev->next = w->next;
381 					w->next->prev = w->prev;
382 					RF_ASSERT((*head)->prev == NULL);
383 					RF_ASSERT((*tail)->next == NULL);
384 				}
385 			}
386 			w->prev = NULL;
387 			w->next = NULL;
388 			if (rf_parityLogDebug)
389 				printf("[dequeueing parity log data, region %d, raidAddress %d, numSector %d]\n", w->regionID, (int) w->diskAddress.raidAddress, (int) w->diskAddress.numSector);
390 			return (w);
391 		} else
392 			w = w->prev;
393 	}
394 	if (!ignoreLocks)
395 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
396 	return (NULL);
397 }
398 
399 static RF_ParityLogData_t *
DequeueMatchingLogData(RF_Raid_t * raidPtr,RF_ParityLogData_t ** head,RF_ParityLogData_t ** tail)400 DequeueMatchingLogData(
401     RF_Raid_t * raidPtr,
402     RF_ParityLogData_t ** head,
403     RF_ParityLogData_t ** tail)
404 {
405 	RF_ParityLogData_t *logDataList, *logData;
406 	int     regionID;
407 
408 	/* Remove and return an in-core parity log from the tail of a disk
409 	 * queue (*head, *tail).  Then remove all matching (identical
410 	 * regionIDs) logData and return as a linked list.
411 	 *
412 	 * NON-BLOCKING */
413 
414 	logDataList = DequeueParityLogData(raidPtr, head, tail, RF_TRUE);
415 	if (logDataList) {
416 		regionID = logDataList->regionID;
417 		logData = logDataList;
418 		logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
419 		while (logData->next) {
420 			logData = logData->next;
421 			logData->next = rf_SearchAndDequeueParityLogData(raidPtr, regionID, head, tail, RF_TRUE);
422 		}
423 	}
424 	return (logDataList);
425 }
426 
427 
428 static RF_ParityLog_t *
AcquireParityLog(RF_ParityLogData_t * logData,int finish)429 AcquireParityLog(
430     RF_ParityLogData_t * logData,
431     int finish)
432 {
433 	RF_ParityLog_t *log = NULL;
434 	RF_Raid_t *raidPtr;
435 
436 	/* Grab a log buffer from the pool and return it. If no buffers are
437 	 * available, return NULL. NON-BLOCKING */
438 	raidPtr = logData->common->raidPtr;
439 	rf_lock_mutex2(raidPtr->parityLogPool.mutex);
440 	if (raidPtr->parityLogPool.parityLogs) {
441 		log = raidPtr->parityLogPool.parityLogs;
442 		raidPtr->parityLogPool.parityLogs = raidPtr->parityLogPool.parityLogs->next;
443 		log->regionID = logData->regionID;
444 		log->numRecords = 0;
445 		log->next = NULL;
446 		raidPtr->logsInUse++;
447 		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
448 	} else {
449 		/* no logs available, so place ourselves on the queue of work
450 		 * waiting on log buffers this is done while
451 		 * parityLogPool.mutex is held, to ensure synchronization with
452 		 * ReleaseParityLogs. */
453 		if (rf_parityLogDebug)
454 			printf("[blocked on log, region %d, finish %d]\n", logData->regionID, finish);
455 		if (finish)
456 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
457 		else
458 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
459 	}
460 	rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
461 	return (log);
462 }
463 
/*
 * Give back a `next'-linked list of parity logs (firstLog).
 *
 * Logs are not returned to the pool straight away: as long as work is
 * queued on logBlockHead/logBlockTail (requests that AcquireParityLog could
 * not satisfy), a released log is handed directly to that work through
 * rf_ParityLogAppend().  Whatever is left over is pushed back onto
 * parityLogPool.parityLogs and raidPtr->logsInUse is decremented once per
 * returned log.
 *
 * Local roles: `log' is the single log currently detached from the list and
 * on offer; `firstLog' is the remainder of the list still to be offered.
 */
void
rf_ReleaseParityLogs(
    RF_Raid_t * raidPtr,
    RF_ParityLog_t * firstLog)
{
	RF_ParityLogData_t *logDataList;
	RF_ParityLog_t *log, *lastLog;
	int     cnt;

	/* Insert a linked list of parity logs (firstLog) to the free list
	 * (parityLogPool.parityLogPool)
	 *
	 * NON-BLOCKING. */

	RF_ASSERT(firstLog);

	/* Before returning logs to global free list, service all requests
	 * which are blocked on logs.  Holding mutexes for parityLogPool and
	 * parityLogDiskQueue forces synchronization with AcquireParityLog(). */
	rf_lock_mutex2(raidPtr->parityLogPool.mutex);
	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
	logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
	/* detach the first log from the list so it can be offered on its own */
	log = firstLog;
	if (firstLog)
		firstLog = firstLog->next;
	/* NOTE(review): `log' is dereferenced here without the NULL check
	 * applied to firstLog just above; this relies on the RF_ASSERT at
	 * the top of the function. */
	log->numRecords = 0;
	log->next = NULL;
	while (logDataList && log) {
		/* Both mutexes are dropped around the append:
		 * rf_ParityLogAppend() takes the region mutex and may itself
		 * call AcquireParityLog()/the queueing routines, which take
		 * these same mutexes. */
		rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
		rf_ParityLogAppend(logDataList, RF_TRUE, &log, RF_FALSE);
		if (rf_parityLogDebug)
			printf("[finishing up buf-blocked log data, region %d]\n", logDataList->regionID);
		if (log == NULL) {
			/* the append consumed the offered log (it clears
			 * *incomingLog when it installs it); detach the next
			 * one, if any remain */
			log = firstLog;
			if (firstLog) {
				firstLog = firstLog->next;
				log->numRecords = 0;
				log->next = NULL;
			}
		}
		rf_lock_mutex2(raidPtr->parityLogPool.mutex);
		rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
		/* only fetch more blocked work if there is still a log to
		 * offer; otherwise `log' is NULL and the loop ends */
		if (log)
			logDataList = DequeueMatchingLogData(raidPtr, &raidPtr->parityLogDiskQueue.logBlockHead, &raidPtr->parityLogDiskQueue.logBlockTail);
	}
	/* return remaining logs to pool */
	if (log) {
		/* re-attach the detached log in front of the remainder */
		log->next = firstLog;
		firstLog = log;
	}
	if (firstLog) {
		/* walk to the end of the list, decrementing logsInUse once
		 * per log being returned */
		lastLog = firstLog;
		raidPtr->logsInUse--;
		RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
		while (lastLog->next) {
			lastLog = lastLog->next;
			raidPtr->logsInUse--;
			RF_ASSERT(raidPtr->logsInUse >= 0 && raidPtr->logsInUse <= raidPtr->numParityLogs);
		}
		/* splice the returned list in front of the pool */
		lastLog->next = raidPtr->parityLogPool.parityLogs;
		raidPtr->parityLogPool.parityLogs = firstLog;
		/* sanity check: logs in the pool plus logs in use must
		 * account for every parity log of the array */
		cnt = 0;
		log = raidPtr->parityLogPool.parityLogs;
		while (log) {
			cnt++;
			log = log->next;
		}
		RF_ASSERT(cnt + raidPtr->logsInUse == raidPtr->numParityLogs);
	}
	rf_unlock_mutex2(raidPtr->parityLogPool.mutex);
	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
}
537 
538 static void
ReintLog(RF_Raid_t * raidPtr,int regionID,RF_ParityLog_t * log)539 ReintLog(
540     RF_Raid_t * raidPtr,
541     int regionID,
542     RF_ParityLog_t * log)
543 {
544 	RF_ASSERT(log);
545 
546 	/* Insert an in-core parity log (log) into the disk queue of
547 	 * reintegration work.  Set the flag (reintInProgress) for the
548 	 * specified region (regionID) to indicate that reintegration is in
549 	 * progress for this region. NON-BLOCKING */
550 
551 	rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
552 	raidPtr->regionInfo[regionID].reintInProgress = RF_TRUE;	/* cleared when reint
553 									 * complete */
554 
555 	if (rf_parityLogDebug)
556 		printf("[requesting reintegration of region %d]\n", log->regionID);
557 	/* move record to reintegration queue */
558 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
559 	log->next = raidPtr->parityLogDiskQueue.reintQueue;
560 	raidPtr->parityLogDiskQueue.reintQueue = log;
561 	rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
562 	rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
563 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
564 }
565 
566 static void
FlushLog(RF_Raid_t * raidPtr,RF_ParityLog_t * log)567 FlushLog(
568     RF_Raid_t * raidPtr,
569     RF_ParityLog_t * log)
570 {
571 	/* insert a core log (log) into a list of logs
572 	 * (parityLogDiskQueue.flushQueue) waiting to be written to disk.
573 	 * NON-BLOCKING */
574 
575 	RF_ASSERT(log);
576 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
577 	RF_ASSERT(log->next == NULL);
578 	/* move log to flush queue */
579 	rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
580 	log->next = raidPtr->parityLogDiskQueue.flushQueue;
581 	raidPtr->parityLogDiskQueue.flushQueue = log;
582 	rf_signal_cond2(raidPtr->parityLogDiskQueue.cond);
583 	rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
584 }
585 
586 static int
DumpParityLogToDisk(int finish,RF_ParityLogData_t * logData)587 DumpParityLogToDisk(
588     int finish,
589     RF_ParityLogData_t * logData)
590 {
591 	int     i, diskCount, regionID = logData->regionID;
592 	RF_ParityLog_t *log;
593 	RF_Raid_t *raidPtr;
594 
595 	raidPtr = logData->common->raidPtr;
596 
597 	/* Move a core log to disk.  If the log disk is full, initiate
598 	 * reintegration.
599 	 *
600 	 * Return (0) if we can enqueue the dump immediately, otherwise return
601 	 * (1) to indicate we are blocked on reintegration and control of the
602 	 * thread should be relinquished.
603 	 *
604 	 * Caller must hold regionInfo[regionID].mutex
605 	 *
606 	 * NON-BLOCKING */
607 
608 	RF_ASSERT(rf_owned_mutex2(raidPtr->regionInfo[regionID].mutex));
609 
610 	if (rf_parityLogDebug)
611 		printf("[dumping parity log to disk, region %d]\n", regionID);
612 	log = raidPtr->regionInfo[regionID].coreLog;
613 	RF_ASSERT(log->numRecords == raidPtr->numSectorsPerLog);
614 	RF_ASSERT(log->next == NULL);
615 
616 	/* if reintegration is in progress, must queue work */
617 	rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
618 	if (raidPtr->regionInfo[regionID].reintInProgress) {
619 		/* Can not proceed since this region is currently being
620 		 * reintegrated. We can not block, so queue remaining work and
621 		 * return */
622 		if (rf_parityLogDebug)
623 			printf("[region %d waiting on reintegration]\n", regionID);
624 		/* XXX not sure about the use of finish - shouldn't this
625 		 * always be "Enqueue"? */
626 		if (finish)
627 			RequeueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
628 		else
629 			EnqueueParityLogData(logData, &raidPtr->parityLogDiskQueue.reintBlockHead, &raidPtr->parityLogDiskQueue.reintBlockTail);
630 		rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
631 		return (1);	/* relenquish control of this thread */
632 	}
633 	rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
634 	raidPtr->regionInfo[regionID].coreLog = NULL;
635 	if ((raidPtr->regionInfo[regionID].diskCount) < raidPtr->regionInfo[regionID].capacity)
636 		/* IMPORTANT!! this loop bound assumes region disk holds an
637 		 * integral number of core logs */
638 	{
639 		/* update disk map for this region */
640 		diskCount = raidPtr->regionInfo[regionID].diskCount;
641 		for (i = 0; i < raidPtr->numSectorsPerLog; i++) {
642 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].operation = log->records[i].operation;
643 			raidPtr->regionInfo[regionID].diskMap[i + diskCount].parityAddr = log->records[i].parityAddr;
644 		}
645 		log->diskOffset = diskCount;
646 		raidPtr->regionInfo[regionID].diskCount += raidPtr->numSectorsPerLog;
647 		FlushLog(raidPtr, log);
648 	} else {
649 		/* no room for log on disk, send it to disk manager and
650 		 * request reintegration */
651 		RF_ASSERT(raidPtr->regionInfo[regionID].diskCount == raidPtr->regionInfo[regionID].capacity);
652 		ReintLog(raidPtr, regionID, log);
653 	}
654 	if (rf_parityLogDebug)
655 		printf("[finished dumping parity log to disk, region %d]\n", regionID);
656 	return (0);
657 }
658 
659 int
rf_ParityLogAppend(RF_ParityLogData_t * logData,int finish,RF_ParityLog_t ** incomingLog,int clearReintFlag)660 rf_ParityLogAppend(
661     RF_ParityLogData_t * logData,
662     int finish,
663     RF_ParityLog_t ** incomingLog,
664     int clearReintFlag)
665 {
666 	int     regionID, logItem, itemDone;
667 	RF_ParityLogData_t *item;
668 	int     punt, done = RF_FALSE;
669 	RF_ParityLog_t *log;
670 	RF_Raid_t *raidPtr;
671 	RF_Etimer_t timer;
672 	int     (*wakeFunc) (RF_DagNode_t * node, int status);
673 	void   *wakeArg;
674 
675 	/* Add parity to the appropriate log, one sector at a time. This
676 	 * routine is called is called by dag functions ParityLogUpdateFunc
677 	 * and ParityLogOverwriteFunc and therefore MUST BE NONBLOCKING.
678 	 *
679 	 * Parity to be logged is contained in a linked-list (logData).  When
680 	 * this routine returns, every sector in the list will be in one of
681 	 * three places: 1) entered into the parity log 2) queued, waiting on
682 	 * reintegration 3) queued, waiting on a core log
683 	 *
684 	 * Blocked work is passed to the ParityLoggingDiskManager for completion.
685 	 * Later, as conditions which required the block are removed, the work
686 	 * reenters this routine with the "finish" parameter set to "RF_TRUE."
687 	 *
688 	 * NON-BLOCKING */
689 
690 	raidPtr = logData->common->raidPtr;
691 	/* lock the region for the first item in logData */
692 	RF_ASSERT(logData != NULL);
693 	regionID = logData->regionID;
694 	rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
695 	RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
696 
697 	if (clearReintFlag) {
698 		/* Enable flushing for this region.  Holding both locks
699 		 * provides a synchronization barrier with DumpParityLogToDisk */
700 		rf_lock_mutex2(raidPtr->regionInfo[regionID].reintMutex);
701 		/* XXXmrg need this? */
702 		rf_lock_mutex2(raidPtr->parityLogDiskQueue.mutex);
703 		RF_ASSERT(raidPtr->regionInfo[regionID].reintInProgress == RF_TRUE);
704 		raidPtr->regionInfo[regionID].diskCount = 0;
705 		raidPtr->regionInfo[regionID].reintInProgress = RF_FALSE;
706 		rf_unlock_mutex2(raidPtr->regionInfo[regionID].reintMutex);	/* flushing is now
707 										 * enabled */
708 		/* XXXmrg need this? */
709 		rf_unlock_mutex2(raidPtr->parityLogDiskQueue.mutex);
710 	}
711 	/* process each item in logData */
712 	while (logData) {
713 		/* remove an item from logData */
714 		item = logData;
715 		logData = logData->next;
716 		item->next = NULL;
717 		item->prev = NULL;
718 
719 		if (rf_parityLogDebug)
720 			printf("[appending parity log data, region %d, raidAddress %d, numSector %d]\n", item->regionID, (int) item->diskAddress.raidAddress, (int) item->diskAddress.numSector);
721 
722 		/* see if we moved to a new region */
723 		if (regionID != item->regionID) {
724 			rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
725 			regionID = item->regionID;
726 			rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
727 			RF_ASSERT(raidPtr->regionInfo[regionID].loggingEnabled);
728 		}
729 		punt = RF_FALSE;/* Set to RF_TRUE if work is blocked.  This
730 				 * can happen in one of two ways: 1) no core
731 				 * log (AcquireParityLog) 2) waiting on
732 				 * reintegration (DumpParityLogToDisk) If punt
733 				 * is RF_TRUE, the dataItem was queued, so
734 				 * skip to next item. */
735 
736 		/* process item, one sector at a time, until all sectors
737 		 * processed or we punt */
738 		if (item->diskAddress.numSector > 0)
739 			done = RF_FALSE;
740 		else
741 			RF_ASSERT(0);
742 		while (!punt && !done) {
743 			/* verify that a core log exists for this region */
744 			if (!raidPtr->regionInfo[regionID].coreLog) {
745 				/* Attempt to acquire a parity log. If
746 				 * acquisition fails, queue remaining work in
747 				 * data item and move to nextItem. */
748 				if (incomingLog)
749 					if (*incomingLog) {
750 						RF_ASSERT((*incomingLog)->next == NULL);
751 						raidPtr->regionInfo[regionID].coreLog = *incomingLog;
752 						raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
753 						*incomingLog = NULL;
754 					} else
755 						raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
756 				else
757 					raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
758 				/* Note: AcquireParityLog either returns a log
759 				 * or enqueues currentItem */
760 			}
761 			if (!raidPtr->regionInfo[regionID].coreLog)
762 				punt = RF_TRUE;	/* failed to find a core log */
763 			else {
764 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
765 				/* verify that the log has room for new
766 				 * entries */
767 				/* if log is full, dump it to disk and grab a
768 				 * new log */
769 				if (raidPtr->regionInfo[regionID].coreLog->numRecords == raidPtr->numSectorsPerLog) {
770 					/* log is full, dump it to disk */
771 					if (DumpParityLogToDisk(finish, item))
772 						punt = RF_TRUE;	/* dump unsuccessful,
773 								 * blocked on
774 								 * reintegration */
775 					else {
776 						/* dump was successful */
777 						if (incomingLog)
778 							if (*incomingLog) {
779 								RF_ASSERT((*incomingLog)->next == NULL);
780 								raidPtr->regionInfo[regionID].coreLog = *incomingLog;
781 								raidPtr->regionInfo[regionID].coreLog->regionID = regionID;
782 								*incomingLog = NULL;
783 							} else
784 								raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
785 						else
786 							raidPtr->regionInfo[regionID].coreLog = AcquireParityLog(item, finish);
787 						/* if a core log is not
788 						 * available, must queue work
789 						 * and return */
790 						if (!raidPtr->regionInfo[regionID].coreLog)
791 							punt = RF_TRUE;	/* blocked on log
792 									 * availability */
793 					}
794 				}
795 			}
796 			/* if we didn't punt on this item, attempt to add a
797 			 * sector to the core log */
798 			if (!punt) {
799 				RF_ASSERT(raidPtr->regionInfo[regionID].coreLog->next == NULL);
800 				/* at this point, we have a core log with
801 				 * enough room for a sector */
802 				/* copy a sector into the log */
803 				log = raidPtr->regionInfo[regionID].coreLog;
804 				RF_ASSERT(log->numRecords < raidPtr->numSectorsPerLog);
805 				logItem = log->numRecords++;
806 				log->records[logItem].parityAddr = item->diskAddress;
807 				RF_ASSERT(log->records[logItem].parityAddr.startSector >= raidPtr->regionInfo[regionID].parityStartAddr);
808 				RF_ASSERT(log->records[logItem].parityAddr.startSector < raidPtr->regionInfo[regionID].parityStartAddr + raidPtr->regionInfo[regionID].numSectorsParity);
809 				log->records[logItem].parityAddr.numSector = 1;
810 				log->records[logItem].operation = item->common->operation;
811 				memcpy((char *)log->bufPtr + (logItem * (1 << item->common->raidPtr->logBytesPerSector)), ((char *)item->common->bufPtr + (item->bufOffset++ * (1 << item->common->raidPtr->logBytesPerSector))), (1 << item->common->raidPtr->logBytesPerSector));
812 				item->diskAddress.numSector--;
813 				item->diskAddress.startSector++;
814 				if (item->diskAddress.numSector == 0)
815 					done = RF_TRUE;
816 			}
817 		}
818 
819 		if (!punt) {
820 			/* Processed this item completely, decrement count of
821 			 * items to be processed. */
822 			RF_ASSERT(item->diskAddress.numSector == 0);
823 			rf_lock_mutex2(item->common->mutex);
824 			item->common->cnt--;
825 			if (item->common->cnt == 0)
826 				itemDone = RF_TRUE;
827 			else
828 				itemDone = RF_FALSE;
829 			rf_unlock_mutex2(item->common->mutex);
830 			if (itemDone) {
831 				/* Finished processing all log data for this
832 				 * IO Return structs to free list and invoke
833 				 * wakeup function. */
834 				timer = item->common->startTime;	/* grab initial value of
835 									 * timer */
836 				RF_ETIMER_STOP(timer);
837 				RF_ETIMER_EVAL(timer);
838 				item->common->tracerec->plog_us += RF_ETIMER_VAL_US(timer);
839 				if (rf_parityLogDebug)
840 					printf("[waking process for region %d]\n", item->regionID);
841 				wakeFunc = item->common->wakeFunc;
842 				wakeArg = item->common->wakeArg;
843 				FreeParityLogCommonData(item->common);
844 				FreeParityLogData(item);
845 				(wakeFunc) (wakeArg, 0);
846 			} else
847 				FreeParityLogData(item);
848 		}
849 	}
850 	rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
851 	if (rf_parityLogDebug)
852 		printf("[exiting ParityLogAppend]\n");
853 	return (0);
854 }
855 
856 
857 void
rf_EnableParityLogging(RF_Raid_t * raidPtr)858 rf_EnableParityLogging(RF_Raid_t * raidPtr)
859 {
860 	int     regionID;
861 
862 	for (regionID = 0; regionID < rf_numParityRegions; regionID++) {
863 		rf_lock_mutex2(raidPtr->regionInfo[regionID].mutex);
864 		raidPtr->regionInfo[regionID].loggingEnabled = RF_TRUE;
865 		rf_unlock_mutex2(raidPtr->regionInfo[regionID].mutex);
866 	}
867 	if (rf_parityLogDebug)
868 		printf("[parity logging enabled]\n");
869 }
870 #endif				/* RF_INCLUDE_PARITYLOGGING > 0 */
871