1 /* BLURB lgpl
2
3 Coda File System
4 Release 6
5
6 Copyright (c) 1987-2016 Carnegie Mellon University
7 Additional copyrights listed below
8
9 This code is distributed "AS IS" without warranty of any kind under
10 the terms of the GNU Library General Public Licence Version 2, as
11 shown in the file LICENSE. The technical and financial contributors to
12 Coda are listed in the file CREDITS.
13
14 Additional copyrights
15 none currently
16
17 #*/
18
19 /*
20 *
21 * RVM log recovery support
22 *
23 */
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <sys/file.h>
29 #include <sys/time.h>
30 #include <errno.h>
31 #include "rvm_private.h"
32
33 #ifdef RVM_LOG_TAIL_BUG
34 #include <rvmtesting.h>
35 extern unsigned long *ClobberAddress;
36 #endif /* RVM_LOG_TAIL_BUG */
37
38 /* global variables */
39
40 extern log_t *default_log; /* default log descriptor ptr */
41 extern list_entry_t seg_root; /* segment list */
42 extern rw_lock_t seg_root_lock; /* segment list lock */
43 extern rvm_bool_t rvm_utlsw; /* true if running in rvmutl */
44 extern char *rvm_errmsg; /* internal error message buffer */
45
46 rvm_bool_t rvm_no_yield = rvm_false; /* inhibit yields in recovery */
47 rvm_length_t rvm_num_nodes; /* number of nodes in change tree */
48 rvm_length_t rvm_max_depth; /* maximum depth of change tree */
49
50 chk_vec_t *rvm_chk_vec = NULL; /* monitor range vector */
51 rvm_length_t rvm_chk_len = 0; /* length of monitor range vector */
52 rvm_monitor_call_t *rvm_monitor = NULL; /* call-back function ptr */
53 rvm_signal_call_t *rvm_chk_sigint; /* SIGINT test call (rvmutl only) */
54 rvm_length_t truncation_times_vec[truncation_times_len]
55 = {truncation_times_dist};
56 rvm_bool_t rvm_no_update; /* no segment or log update if true */
57 rvm_bool_t rvm_replay; /* is replay if true */
58 rvm_bool_t rvm_chk_sum; /* force checksumming of all records */
59 rvm_bool_t rvm_shadow_buf; /* use shadow buffer */
60
61 /* macros & locals */
62
63 #ifndef ZERO
64 #define ZERO 0
65 #else
66 #endif
67
68 /*static rvm_length_t nv_local_max = NV_LOCAL_MAX;*/
69 static struct timeval trunc_start_time;
70 static rvm_length_t last_tree_build_time;
71 static rvm_length_t last_tree_apply_time;
72
73 #define NODES_PER_YIELD 1000000
74 static rvm_length_t num_nodes = NODES_PER_YIELD;
75 /* test if modification range will change monitored addresses */
76 /* nv_addr - vm address */
77 /* nv_len - length of vm range */
78 /* nv_data - nv data in vm */
79 /* nv_offset - offset of data in log */
80 /* rec_hdr - ptr to record header if not null */
81 /* msg - invocation message */
monitor_vmaddr(char * nv_addr,rvm_length_t nv_len,char * nv_data,rvm_offset_t * nv_offset,rec_hdr_t * rec_hdr,char * msg)82 static void monitor_vmaddr(char *nv_addr, rvm_length_t nv_len,
83 char *nv_data, rvm_offset_t *nv_offset,
84 rec_hdr_t *rec_hdr, char *msg)
85 {
86 rvm_length_t last_chk_addr;
87 rvm_length_t last_nv_addr;
88 rvm_length_t i;
89
90 /* check monitored ranges for specified range */
91 for (i=0; i < rvm_chk_len; i++)
92 {
93 if (rvm_chk_sigint != NULL)
94 if ((*rvm_chk_sigint)(NULL)) return; /* test for interrupt */
95
96 last_chk_addr = (rvm_length_t)RVM_ADD_LENGTH_TO_ADDR(
97 rvm_chk_vec[i].vmaddr,rvm_chk_vec[i].length);
98 last_nv_addr =
99 (rvm_length_t)RVM_ADD_LENGTH_TO_ADDR(nv_addr,nv_len);
100
101 if ((((rvm_length_t)rvm_chk_vec[i].vmaddr
102 >= (rvm_length_t)nv_addr)
103 && ((rvm_length_t)rvm_chk_vec[i].vmaddr < last_nv_addr))
104 || ((last_chk_addr > (rvm_length_t)nv_addr)
105 && (last_chk_addr < last_nv_addr))
106 ) {
107
108 /* found modification, call print support */
109 if (nv_data != NULL) /* check bytes offset */
110 nv_data = RVM_ADD_LENGTH_TO_ADDR(nv_data,
111 BYTE_SKEW(nv_addr));
112 (*rvm_monitor)((rvm_length_t)nv_addr,nv_len,nv_data,
113 nv_offset,rec_hdr,i,msg);
114 }
115 }
116
117 return;
118 }
/* allocate log recovery buffers
 *
 * Allocates the main recovery buffer (log_buf->buf) and the auxiliary
 * buffer (log_buf->aux_buf) used for record heads/ends that do not fit
 * in the main buffer.  The lengths must already be set in log->log_buf.
 * Returns RVM_ENO_MEMORY on any allocation failure, RVM_SUCCESS otherwise.
 */
char *tst_buf;                          /* debug temp */
rvm_return_t alloc_log_buf(log)
    log_t           *log;               /* log descriptor */
{
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */

    /* main recovery buffer */
    if ((log_buf->buf=page_alloc(log_buf->length)) == NULL)
        return RVM_ENO_MEMORY;
#ifdef SPECIAL_DEBUG
    /* shadow and test buffers used to cross-check device reads */
    if ((log_buf->shadow_buf=page_alloc(log_buf->length)) == NULL)
        return RVM_ENO_MEMORY;
    if ((tst_buf=page_alloc(log_buf->length)) == NULL)
        return RVM_ENO_MEMORY;
#endif /* SPECIAL_DEBUG */
    log_buf->buf_len = RVM_MK_OFFSET(0,log_buf->length);

    /* auxiliary buffer */
    /* NOTE(review): buffers allocated above are not released on this
       failure path -- presumably the caller treats RVM_ENO_MEMORY as
       fatal or invokes free_log_buf; confirm before changing */
    if ((log_buf->aux_buf=page_alloc(log_buf->aux_length)) == NULL)
        return RVM_ENO_MEMORY;

    /* write-protect the buffers */
    /* I've taken out the mach-specific code, but it might be interesting to
     * implement this feature on other systems using mprotect. Therefore I've
     * `retained' the essence of the original code in this comment -- JH
     *
     * MACH_RVM_PROTECT
     *
     * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_READ);
     *
     * #ifdef SPECIAL_DEBUG
     * protect(log_buf->shadow_buf, log_buf->length, FALSE, VM_PROT_READ);
     * protect(tst_buf, log_buf->length, FALSE, VM_PROT_READ);
     * #endif SPECIAL_DEBUG
     *
     * protect(log_buf->aux_buf, log_buf->aux_length, FALSE, VM_PROT_READ);
     */

    return RVM_SUCCESS;
}
158
159 /* free log recovery buffer */
free_log_buf(log)160 void free_log_buf(log)
161 log_t *log; /* log descriptor */
162 {
163 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
164
165 if (log_buf->buf != NULL)
166 {
167 page_free(log_buf->buf,log_buf->length);
168 log_buf->buf = NULL;
169 log_buf->length = 0;
170 RVM_ZERO_OFFSET(log_buf->buf_len);
171 log_buf->ptr = -1;
172 }
173
174 if (log_buf->aux_buf != NULL)
175 {
176 page_free(log_buf->aux_buf,log_buf->aux_length);
177 log_buf->aux_buf = NULL;
178 log_buf->aux_length = 0;
179 }
180 }
/* init log buffer with desired offset data from log
 *
 * Loads the recovery buffer with log data surrounding *offset.  For a
 * FORWARD scan the buffer is filled from the sector containing *offset
 * toward the end of the device; for REVERSE it is filled with the data
 * ending at the sector boundary above *offset.  On exit log_buf->ptr
 * indexes the byte corresponding to *offset within the buffer.
 * Returns RVM_EIO on a device read error.
 */
rvm_return_t init_buffer(log,offset,direction,synch)
    log_t *log;                         /* log descriptor */
    rvm_offset_t *offset;               /* offset in log to load */
    rvm_bool_t direction;               /* true ==> forward */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t length;                /* length of buffer */
    rvm_offset_t read_len;              /* read length calculation temp */
    rvm_return_t retval = RVM_SUCCESS;  /* return value */

    /* offset must lie within the log; only the truncation thread may
       use the recovery buffer */
    assert(RVM_OFFSET_GEQ(*offset,log->status.log_start));
    assert(RVM_OFFSET_LEQ(*offset,log->dev.num_bytes));
    assert(log->trunc_thread == cthread_self());

    /* calculate buffer read length and ptr */
    log_buf->ptr = OFFSET_TO_SECTOR_INDEX(*offset);
    if (direction == FORWARD)
        {                               /* forward */
        log_buf->offset = CHOP_OFFSET_TO_SECTOR_SIZE(*offset);
        if (RVM_OFFSET_EQL(log_buf->offset,log->dev.num_bytes))
            read_len = log->status.log_size;
        else
            read_len = RVM_SUB_OFFSETS(log->dev.num_bytes,
                                       log_buf->offset);
        }
    else
        {                               /* reverse */
        log_buf->offset = ROUND_OFFSET_TO_SECTOR_SIZE(*offset);
        /* at the very start of the log area, wrap to the device end */
        if (RVM_OFFSET_EQL(log_buf->offset,log->status.log_start))
            log_buf->offset = log->dev.num_bytes;
        if (RVM_OFFSET_EQL(log_buf->offset,log->dev.num_bytes))
            read_len = log->status.log_size;
        else
            read_len = RVM_SUB_OFFSETS(log_buf->offset,
                                       log->status.log_start);
        }

    /* get actual length to read: clamp to the buffer capacity */
    if (RVM_OFFSET_GTR(read_len,log_buf->buf_len))
        length = log_buf->length;
    else
        length = RVM_OFFSET_TO_LENGTH(read_len);

    /* set offset of read for reverse fill: the buffer ends (rather than
       starts) at the computed offset, so back up by the read length and
       re-express ptr relative to the new buffer start */
    if (direction == REVERSE)
        {
        log_buf->offset = RVM_SUB_LENGTH_FROM_OFFSET(log_buf->offset,
                                                     length);
        if (log_buf->ptr == 0)
            log_buf->ptr = length;
        else
            log_buf->ptr += (length-SECTOR_SIZE);
        }

    /* lock device & allow swap if necessary */
    if (synch)
        {
        if (!rvm_no_yield) cthread_yield();
        assert(log->trunc_thread == cthread_self());
        mutex_lock(&log->dev_lock);     /* begin dev_lock crit sec */
        assert(log->trunc_thread == cthread_self());
        }

    /* allow write to buffer */
    /* MACH_RVM_PROTECT
     *
     * protect(log_buf->buf, log_buf->length, FALSE,
     *         VM_PROT_WRITE | VM_PROT_READ);
     */

    /* read data from log device */
    if ((log_buf->r_length=read_dev(&log->dev,&log_buf->offset,
                                    log_buf->buf,length)) < 0)
        {
        retval = RVM_EIO;               /* i/o error */
        log_buf->r_length = 0;          /* buffer invalid */
        }
    assert(log->trunc_thread == cthread_self());

    /* write protect buffer & unlock */
    /* MACH_RVM_PROTECT
     *
     * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_READ);
     *
     * #ifdef SPECIAL_DEBUG
     * / * re-read into shadow buffer & compare * /
     * if (rvm_shadow_buf)
     *     {
     *     ret = vm_protect(task_self_,(vm_address_t)(log_buf->shadow_buf),
     *                      (vm_size_t)(log_buf->length),FALSE,
     *                      VM_PROT_WRITE | VM_PROT_READ);
     *     assert(ret == KERN_SUCCESS);
     *     if ((r_length=read_dev(&log->dev,&log_buf->offset,
     *                            log_buf->shadow_buf,length)) < 0)
     *         {
     *         retval = RVM_EIO;        / * i/o error * /
     *         assert(rvm_false);
     *         }
     *     assert(r_length == length);
     *     assert(r_length == log_buf->r_length);
     *     ret = vm_protect(task_self_,(vm_address_t)(log_buf->shadow_buf),
     *                      (vm_size_t)(log_buf->length),FALSE,VM_PROT_READ);
     *     assert(ret == KERN_SUCCESS);
     *     assert(memcmp(log_buf->buf,log_buf->shadow_buf,length) == 0);
     *     }
     * #endif SPECIAL_DEBUG
     */

    if (synch)
        mutex_unlock(&log->dev_lock);   /* end dev_lock crit sec */
    assert(log->trunc_thread == cthread_self());

    return retval;
}
296 /* refill buffer in scan direction */
refill_buffer(log,direction,synch)297 static rvm_return_t refill_buffer(log,direction,synch)
298 log_t *log; /* log descriptor */
299 rvm_bool_t direction; /* true ==> forward */
300 rvm_bool_t synch; /* true ==> synchronization required */
301 {
302 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
303 rvm_offset_t offset; /* new buffer offset temp */
304
305 /* compute new offset for buffer fill */
306 offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
307
308 /* fill the buffer */
309 return init_buffer(log,&offset,direction,synch);
310 }
311 /* compare buf & shadow buf from gdb */
312 #ifdef DEBUG_GDB
log_buf_cmp(disp)313 int log_buf_cmp(disp)
314 int disp;
315 {
316 log_buf_t *log_buf = &default_log->log_buf;
317 int i;
318
319 if (disp < 0) disp = 0;
320 for (i=disp;i<log_buf->r_length;i++)
321 if (log_buf->buf[i] != log_buf->shadow_buf[i])
322 return i;
323
324 return -1;
325 }
326
327 /* compare with disk */
disk_buf_cmp(buf,disp)328 int disk_buf_cmp(buf,disp)
329 char *buf;
330 int disp;
331 {
332 log_buf_t *log_buf = &default_log->log_buf;
333 int i;
334 int r_length;
335
336 /* allow write to buffer */
337 /* MACH_RVM_PROTECT
338 *
339 * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_WRITE | VM_PROT_READ);
340 */
341
342 /* read buffer from log */
343 if ((r_length=read_dev(&default_log->dev,&log_buf->offset,
344 tst_buf,log_buf->r_length)) < 0)
345 assert(rvm_false); /* i/o error */
346 assert(r_length == log_buf->r_length);
347
348 /* re-protect buffer */
349 /* MACH_RVM_PROTECT
350 *
351 * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_READ);
352 */
353
354 /* compare results */
355 if (disp < 0) disp = 0;
356 for (i=disp;i<log_buf->r_length;i++)
357 if (buf[i] != tst_buf[i])
358 return i;
359
360 return -1;
361 }
/* locate byte in buffer via gdb: return the index of the first
   occurrence of chr in buf[disp..max_len-1], or -1 if absent */
int find_byte(chr,buf,disp,max_len)
    char chr;                           /* byte to locate */
    char *buf;                          /* buffer to search */
    int disp;                           /* starting displacement */
    int max_len;                        /* search limit (exclusive) */
{
    int idx;

    /* negative displacements start the search at the beginning */
    idx = (disp < 0) ? 0 : disp;
    while (idx < max_len)
        {
        if (buf[idx] == chr)
            return idx;
        idx++;
        }

    return -1;
}
378
379 /* locate word in buffer via gdb */
find_word(wrd,buf,disp,max_len)380 int find_word(wrd,buf,disp,max_len)
381 rvm_length_t wrd;
382 rvm_length_t *buf;
383 int disp;
384 int max_len;
385 {
386 int i;
387
388 if (disp < 0) disp = 0;
389 for (i=disp/sizeof(rvm_length_t);i<max_len/sizeof(rvm_length_t);i++)
390 if (wrd == buf[i])
391 return i;
392
393 return -1;
394 }
395
396 /* find word in log buffer via gdb */
find_buf_word(wrd,disp)397 int find_buf_word(wrd,disp)
398 rvm_length_t wrd;
399 int disp;
400 {
401 log_buf_t *log_buf = &default_log->log_buf;
402
403 return find_word(wrd, (rvm_length_t *)log_buf->buf,disp,log_buf->r_length);
404 }
405 #endif /* DEBUG_GDB */
/* load log auxiliary buffer
 *
 * Makes up to `length' bytes starting at *log_offset available in the
 * auxiliary buffer, reusing the currently cached contents when the
 * requested range is already (partially) present.  On exit *aux_ptr is
 * the byte index of the data within aux_buf (-1 ==> offset beyond the
 * device, i.e. a partial record) and *data_len is the number of
 * requested bytes actually available.  Returns RVM_EIO on read error.
 */
rvm_return_t load_aux_buf(log,log_offset,length,aux_ptr,
                          data_len,synch,pre_load)
    log_t *log;                         /* log descriptor */
    rvm_offset_t *log_offset;           /* buffer read offset */
    rvm_length_t length;                /* data length wanted */
    rvm_length_t *aux_ptr;              /* ptr to aux. buf offset */
    rvm_length_t *data_len;             /* ptr to actual data length read */
    rvm_bool_t synch;                   /* true ==> synchronization required */
    rvm_bool_t pre_load;                /* permit pre-loading of range */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t high_offset;           /* end of read area */
    rvm_length_t read_len;              /* buffer read length */
    rvm_return_t retval = RVM_SUCCESS;

    /* only the truncation thread may use the auxiliary buffer */
    assert(log->trunc_thread == cthread_self());

    /* check offset */
    if (RVM_OFFSET_GTR(*log_offset,log->dev.num_bytes))
        {
        *aux_ptr = -1;                  /* out of bounds -- partial record */
        return RVM_SUCCESS;
        }

    /* see if request is already in buffer */
    high_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->aux_offset,
                                           log_buf->aux_rlength);
    if ((RVM_OFFSET_GEQ(*log_offset,log_buf->aux_offset))
        && (RVM_OFFSET_LSS(*log_offset,high_offset)))
        {
        /* yes, have at least some of the data so report how much */
        *aux_ptr = RVM_OFFSET_TO_LENGTH(
            RVM_SUB_OFFSETS(*log_offset,log_buf->aux_offset));
        read_len = RVM_OFFSET_TO_LENGTH(
            RVM_SUB_OFFSETS(high_offset,*log_offset));
        if (read_len < length)
            *data_len = read_len;
        else
            *data_len = length;
        return RVM_SUCCESS;
        }

    /* if less than sector requested, see if pre-load permitted */
    if (pre_load && (length < SECTOR_SIZE))
        read_len = log_buf->aux_length; /* yes, fill buffer */
    else
        read_len = length;              /* no, just do what requested */

    /* determine length and offset for log read (sector aligned) */
    log_buf->aux_offset = CHOP_OFFSET_TO_SECTOR_SIZE(*log_offset);
    high_offset = RVM_ADD_LENGTH_TO_OFFSET(*log_offset,read_len);
    high_offset = ROUND_OFFSET_TO_SECTOR_SIZE(high_offset);
    if (RVM_OFFSET_GTR(high_offset,log->dev.num_bytes))
        high_offset = log->dev.num_bytes; /* don't read past end of log */

    /* report actual length read and ptr into buffer */
    read_len = RVM_OFFSET_TO_LENGTH(
        RVM_SUB_OFFSETS(high_offset,log_buf->aux_offset));
    *aux_ptr = OFFSET_TO_SECTOR_INDEX(*log_offset);
    if (read_len > log_buf->aux_length)
        {
        /* clamp the read to the buffer size and report what will fit */
        if ((read_len >= length)
            && (length <= (log_buf->aux_length-SECTOR_SIZE)))
            *data_len = length;
        else
            *data_len = log_buf->aux_length - *aux_ptr;
        read_len = log_buf->aux_length;
        }
    else
        *data_len = length;

    /* lock device and allow swap if necessary */
    if (synch)
        {
        if (!rvm_no_yield) cthread_yield(); /* allow swap now */
        assert(log->trunc_thread == cthread_self());
        mutex_lock(&log->dev_lock);     /* begin dev_lock crit sec */
        assert(log->trunc_thread == cthread_self());
        }

    /* allow write to buffer */
    /* MACH_RVM_PROTECT
     *
     * protect(log_buf->aux_buf, log_buf->aux_length, FALSE,
     *         VM_PROT_WRITE | VM_PROT_READ);
     */

    /* read new value data from log */
    if ((log_buf->aux_rlength=read_dev(&log->dev,&log_buf->aux_offset,
                                       log_buf->aux_buf,read_len)) < 0)
        {
        retval = RVM_EIO;
        log_buf->aux_rlength = 0;       /* buffer invalid */
        }
    assert(log->trunc_thread == cthread_self());

    /* write protect buffer & unlock */
    /* MACH_RVM_PROTECT
     *
     * protect(log_buf->aux_buf, log_buf->aux_length, FALSE, VM_PROT_READ);
     */

    if (synch)
        mutex_unlock(&log->dev_lock);   /* end dev_lock crit sec */
    assert(log->trunc_thread == cthread_self());

    return retval;
}
514
clear_aux_buf(log)515 void clear_aux_buf(log)
516 log_t *log; /* log descriptor */
517 {
518 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
519
520 RVM_ZERO_OFFSET(log_buf->aux_offset);
521 log_buf->aux_rlength = 0;
522 }
523 /* record header type validation */
chk_hdr_type(rec_hdr)524 static rvm_bool_t chk_hdr_type(rec_hdr)
525 rec_hdr_t *rec_hdr; /* generic record header */
526 {
527 switch (rec_hdr->struct_id)
528 {
529 case trans_hdr_id: /* transaction header */
530 return rvm_true;
531 case log_seg_id: /* log segment dictionary entry */
532 return rvm_true;
533 case log_wrap_id: /* log wrap-aound marker */
534 return rvm_true;
535 default: /* unknown header type */
536 return rvm_false;
537 }
538 }
539
540 /* test if record belongs to currently valid part of log */
chk_hdr_currency(log,rec_hdr)541 rvm_bool_t chk_hdr_currency(log,rec_hdr)
542 log_t *log; /* log descriptor */
543 rec_hdr_t *rec_hdr; /* generic record header */
544 {
545 log_status_t *status = &log->status; /* status descriptor */
546
547 /* be sure record number makes sense */
548 if ((status->first_rec_num != 0) &&
549 (rec_hdr->rec_num < status->first_rec_num))
550 return rvm_false; /* obsolete record */
551
552 /* be sure record written after previous truncation & before this one */
553 if (TIME_LSS(rec_hdr->timestamp,status->prev_trunc)
554 || TIME_GTR(rec_hdr->timestamp,status->last_trunc))
555 return rvm_false; /* obsolete record */
556
557 return rvm_true;
558 }
559
reset_hdr_chks(log)560 void reset_hdr_chks(log)
561 log_t *log; /* log descriptor */
562 {
563 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
564
565 log_buf->prev_rec_num = 0;
566 ZERO_TIME(log_buf->prev_timestamp);
567 }
568 /* test if record is out of sequence in log */
chk_hdr_sequence(log,rec_hdr,direction)569 rvm_bool_t chk_hdr_sequence(log,rec_hdr,direction)
570 log_t *log; /* log descriptor */
571 rec_hdr_t *rec_hdr; /* generic record header */
572 rvm_bool_t direction; /* scan direction */
573 {
574 log_buf_t *log_buf = &log->log_buf; /* recovery buffer descriptor */
575
576 /* check record number closely */
577 if ((log_buf->prev_rec_num != 0) &&
578 (((direction == FORWARD)
579 && (rec_hdr->rec_num != log_buf->prev_rec_num+1))
580 || ((direction == REVERSE)
581 && (rec_hdr->rec_num != log_buf->prev_rec_num-1))))
582 return rvm_false; /* sequence error */
583
584 /* check record write time closely */
585 if ((!TIME_EQL_ZERO(log_buf->prev_timestamp)) &&
586 (((direction == FORWARD)
587 && TIME_LSS(rec_hdr->timestamp,log_buf->prev_timestamp))
588 || ((direction == REVERSE)
589 && TIME_GTR(rec_hdr->timestamp,log_buf->prev_timestamp))))
590 return rvm_false; /* sequence error */
591
592 return rvm_true;
593 }
594 /* record header validation */
chk_hdr(log,rec_hdr,rec_end,direction)595 static rvm_bool_t chk_hdr(log,rec_hdr,rec_end,direction)
596 log_t *log; /* log descriptor */
597 rec_hdr_t *rec_hdr; /* generic record header */
598 rec_end_t *rec_end; /* generic record end marker */
599 rvm_bool_t direction; /* scan direction */
600 {
601
602 /* be sure record type valid */
603 if (!chk_hdr_type(rec_hdr))
604 return rvm_false;
605
606 /* checks for normal operation only */
607 if (!rvm_utlsw)
608 {
609 /* make sure record current */
610 if (chk_hdr_currency(log,rec_hdr) != rvm_true)
611 return rvm_false; /* record obsolete */
612
613 /* make sure record in proper sequence */
614 if (chk_hdr_sequence(log,rec_hdr,direction) != rvm_true)
615 return rvm_false; /* sequence error */
616 }
617
618 /* generic record head/end validation */
619 if ((rec_end != NULL) &&
620 ((rec_end->rec_hdr.struct_id != rec_end_id)
621 || (rec_hdr->struct_id != rec_end->rec_type)
622 || (rec_hdr->rec_num != rec_end->rec_hdr.rec_num)
623 || (rec_hdr->rec_length != rec_end->rec_hdr.rec_length)
624 || (!TIME_EQL(rec_hdr->timestamp,rec_end->rec_hdr.timestamp))))
625 return rvm_false;
626
627 return rvm_true;
628 }
/* log record header validation
 *
 * Full validation of a record header (and optional end marker) at the
 * current scan position.  On success updates log_buf->ptr to the
 * header position (forward scan) or end-marker position (reverse scan)
 * and records this header for sequence checking of the next record.
 */
rvm_bool_t validate_hdr(log,rec_hdr,rec_end,direction)
    log_t *log;                         /* log descriptor */
    rec_hdr_t *rec_hdr;                 /* generic record header */
    rec_end_t *rec_end;                 /* generic record end marker */
    rvm_bool_t direction;               /* scan direction */
{
    log_buf_t *log_buf = &log->log_buf; /* recovery buffer descriptor */

    /* clear sequence checking hide-a-ways if direction reversed */
    if (direction != log_buf->prev_direction)
        reset_hdr_chks(log);

    /* do basic record header checks */
    if (!chk_hdr(log,rec_hdr,rec_end,direction))
        return rvm_false;               /* header invalid */

    /* type-specific validation */
    switch (rec_hdr->struct_id)
        {
      case trans_hdr_id:                /* transaction header */
        break;
      case log_seg_id:                  /* log segment dictionary entry */
        break;
      case log_wrap_id:                 /* log wrap-around marker: has no
                                           end marker, so leave ptr alone */
        goto exit;
      default:                          /* unknown/improper header type */
        return rvm_false;
        }

    /* update buffer ptr and previous record state */
    if (direction == FORWARD)           /* forward, return header position */
        log_buf->ptr = (long)rec_hdr - (long)log_buf->buf;
    else                                /* reverse, return end marker pos. */
        log_buf->ptr = (long)rec_end - (long)log_buf->buf;

  exit:
    /* remember this record for sequence checks on the next one */
    log_buf->prev_rec_num = rec_hdr->rec_num;
    log_buf->prev_timestamp = rec_hdr->timestamp;
    log_buf->prev_direction = direction;

    return rvm_true;
}
/* get next new value range by forward scan of transaction record
   ptr points to next range header
   exits with as much of range in buffer as will fit */
rvm_return_t scan_nv_forward(log,synch)
    log_t *log;                         /* log descriptor */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t offset;                /* offset calculation temp */
    rec_hdr_t *rec_hdr;                 /* temporary cast for record header */
    rvm_return_t retval;                /* return value */

    /* see if new header is entirely within buffer */
    if ((log_buf->ptr+sizeof(rec_hdr_t)) >= log_buf->r_length)
        {
        /* no, refill buffer starting at the current scan position */
        offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                          log_buf->ptr);
        if ((retval=init_buffer(log,&offset,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        }

    /* check header */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
        {
      case nv_range_id: break;          /* a new-value range follows */
      case rec_end_id: return RVM_SUCCESS; /* end of transaction record */

      default: return RVM_SUCCESS;      /* need better reporting */
        }

    /* get whole range in buffer */
    if ((log_buf->ptr+rec_hdr->rec_length) > log_buf->r_length)
        {
        if ((retval=refill_buffer(log,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        }

    return RVM_SUCCESS;
}
/* get previous new value range by reverse scan of transaction record
   ptr points to previous range header; exits with range in buffer */
static rvm_return_t scan_nv_reverse(log,synch)
    log_t *log;                         /* log descriptor */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t *rec_hdr;                 /* temporary cast for record header */
    long len=0;                         /* back displacement to prev. hdr */
    rvm_offset_t offset;                /* offset calculation temp */
    rvm_return_t retval;                /* return value */

    /* get new header position: the sub-record length field of the
       current structure gives the displacement back to the previous
       range header */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
        {
      case rec_end_id:
        len = ((rec_end_t *)rec_hdr)->sub_rec_len;
        break;

      case nv_range_id:
        len = ((nv_range_t *)rec_hdr)->sub_rec_len;
        break;

      default:
        assert(rvm_false);              /* trouble -- log damage? */
        }

    /* see if new header is entirely within buffer */
    if ((log_buf->ptr-len) < 0)
        {
        /* no, refill buffer according to length of data */
        if ((len-sizeof(nv_range_t)) <= NV_LOCAL_MAX)
            {                           /* small, get data into buffer */
            if ((retval=refill_buffer(log,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= len;
            }
        else
            {                           /* large, skip data for now: load
                                           only the range header itself */
            offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                              (log_buf->ptr+sizeof(nv_range_t)));
            offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,len);
            if ((retval=init_buffer(log,&offset,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= sizeof(nv_range_t);
            }
        }
    else log_buf->ptr -= len;

    /* exit pointing to new header; a transaction header means the whole
       record has been traversed */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    if (rec_hdr->struct_id == trans_hdr_id)
        return RVM_SUCCESS;
    assert(rec_hdr->struct_id == nv_range_id);

    return RVM_SUCCESS;
}
/* validate record in buffer in forward scan
 *
 * Checks the record at the current position, locating its end marker
 * either in the main buffer (small records) or via the auxiliary
 * buffer (records larger than the main buffer).  On failure sets
 * log_buf->ptr = -1 (no next record) but still returns RVM_SUCCESS;
 * only device errors produce a non-success return.
 */
static rvm_return_t validate_rec_forward(log,synch)
    log_t *log;                         /* log descriptor */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t *rec_hdr;                 /* temporary cast for next record hdr */
    rec_end_t *rec_end = NULL;          /* temporary cast for record end */
    rvm_offset_t end_offset;            /* temporary for calculating end */
    rvm_return_t retval;
    long tmp_ptr;
    rvm_length_t tmp_len;

    /* see if next header is entirely within buffer */
    if ((log_buf->ptr + MAX_HDR_SIZE) > log_buf->r_length)
        {
        /* no, re-init buffer */
        end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                              log_buf->ptr);
        if ((retval=init_buffer(log,&end_offset,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        }

    /* check header type */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    if (rec_hdr->struct_id == log_wrap_id)
        goto validate;                  /* skip rec_end stuff for wrap */
    if (!chk_hdr(log,rec_hdr,NULL,FORWARD))
        goto no_record;                 /* no next record */

    /* see if record will fit in buffer */
    if ((ROUND_TO_SECTOR_SIZE(rec_hdr->rec_length+sizeof(rec_end_t))
         + SECTOR_SIZE)
        <= log_buf->length)
        {
        /* yes, get whole record in buffer */
        /* NOTE(review): this compares against the buffer capacity
           (length), not the valid data length (r_length) -- confirm
           that is intentional */
        if ((log_buf->ptr+rec_hdr->rec_length+sizeof(rec_end_t))
            > log_buf->length)
            {
            /* refill buffer */
            if ((retval=refill_buffer(log,FORWARD,synch))
                != RVM_SUCCESS) return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            }
        tmp_ptr = log_buf->ptr + rec_hdr->rec_length;
        rec_end = (rec_end_t *)&log_buf->buf[tmp_ptr];
        }
    else
        {
        /* no, won't fit -- read rec_end into aux buffer for validation */
        end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                              log_buf->ptr+rec_hdr->rec_length);

        /* check offset alignment to see if rec_hdr is trash */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(end_offset);
        if (tmp_ptr != CHOP_TO_LENGTH(tmp_ptr))
            goto no_record;             /* end marker alignment wrong */
        retval = load_aux_buf(log, &end_offset, sizeof(rec_end_t),
                              &tmp_ptr, &tmp_len, synch, rvm_false);
        if (retval != RVM_SUCCESS) return retval;
        if (tmp_ptr == -1)
            goto no_record;             /* record end not available */
        rec_end = (rec_end_t *)&log_buf->aux_buf[tmp_ptr];
        }

    /* validate whole record now that end is available */
  validate:
    if (validate_hdr(log,rec_hdr,rec_end,FORWARD))
        return RVM_SUCCESS;

  no_record:                            /* no next record */
    log_buf->ptr = -1;
    return RVM_SUCCESS;
}
/* scan forward from present position at a record structure
   returns updated offset indexed by ptr; -1 ==> no next rec. */
rvm_return_t scan_forward(log,synch)
    log_t *log;                         /* log descriptor */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t *rec_hdr;                 /* cast for next record hdr */
    rvm_return_t retval;

    assert(log_buf->ptr != -1);         /* invalid position */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];

    /* step over the current record according to its type */
    switch (rec_hdr->struct_id)
        {
      case trans_hdr_id: case log_seg_id:
        /* whole record plus its end marker */
        log_buf->ptr += (rec_hdr->rec_length+sizeof(rec_end_t));
        break;
      case rec_end_id:
        log_buf->ptr += sizeof(rec_end_t);
        break;
      case nv_range_id:                 /* scan past remaining ranges */
        DO_FOREVER
            {
            if ((retval=scan_nv_forward(log,synch)) != RVM_SUCCESS)
                return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            switch (rec_hdr->struct_id)
                {
              case nv_range_id:
                log_buf->ptr += rec_hdr->rec_length;
                break;
              case rec_end_id:
                log_buf->ptr += sizeof(rec_end_t);
                goto trans_done;
              default:                  /* validate_rec_forward will handle */
                goto trans_done;
                }
            }
      trans_done:
        break;
      case log_wrap_id:
        /* wrap marker: continue the scan at the start of the log area */
        if ((retval=init_buffer(log,&log->status.log_start,
                                FORWARD,synch))
            != RVM_SUCCESS) return retval;
        break;
      default:
        if (rvm_utlsw)
            {
            log_buf->ptr = -1;          /* utility can handle unknown records */
            return RVM_SUCCESS;
            }
        assert(rvm_false);              /* unknown record type */
        }

    /* validate next record */
    return validate_rec_forward(log,synch);
}
/* scan for wrap marker
 *
 * Loads the last sectors of the log device and searches backward for
 * the wrap marker.  Note: with K&R definitions the parameter order is
 * fixed by the identifier list (log,synch); the declaration order
 * below does not change it.
 */
rvm_return_t scan_wrap_reverse(log,synch)
    rvm_bool_t synch;                   /* true ==> synchronization required */
    log_t *log;                         /* log descriptor */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t *rec_hdr;                 /* temporary cast for record header */
    log_wrap_t *log_wrap;               /* temporary cast for wrap marker */
    long tmp_ptr;                       /* temporary buffer ptr */
    rvm_return_t retval;

    /* load last sectors of log */
    if ((retval=init_buffer(log,&log->dev.num_bytes,
                            REVERSE,synch))
        != RVM_SUCCESS) return retval;

    /* scan for wrap marker */
    /* for the purpose of locating the wrap marker, we use the (duplicated)
       struct_id2 which, while positioned at the end of the record, guarantees
       that we must interpret it first; otherwise, we may possibly
       mis-interpret another field of the record to have a struct_id of
       log_wrap_id ! */
    for (tmp_ptr = (log_buf->ptr - sizeof(log_wrap_t));
         tmp_ptr >= 0; tmp_ptr -= sizeof(rvm_length_t))
        {
        log_wrap = (log_wrap_t *)&log_buf->buf[tmp_ptr];
        if (log_wrap->struct_id2 == log_wrap_id)
            {
            assert( (log_wrap->rec_hdr.struct_id==log_wrap_id) || rvm_utlsw );
            /* XXXX fix this */
#if 0
            if (!((log_wrap->struct_id == log_wrap_id) || rvm_utlsw)) {
                printf("not true!\n");
                assert(0);
            }
#endif
            break;
            }
        }

    /* validate header if tmp_ptr legit */
    if ((tmp_ptr >= 0) && (tmp_ptr < log_buf->r_length))
        {
        log_buf->ptr = tmp_ptr;
        rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
        if (!validate_hdr(log,rec_hdr,NULL,REVERSE))
            log_buf->ptr = -1;
        }
    else
        /* no wrap marker found */
        if (rvm_utlsw)
            log_buf->ptr = -1;          /* utility can deal with it */
        else assert(rvm_false);

    return RVM_SUCCESS;
}
/* validate current record in buffer in reverse scan */
/*
 * Steps log_buf->ptr backward over the rec_end_t marker immediately
 * preceding the current position, then validates the whole record it
 * terminates.  On success log_buf->ptr indexes the record's end marker;
 * if no valid previous record exists, log_buf->ptr is set to -1.
 * Returns RVM_SUCCESS unless a buffer (re)load fails.
 */
rvm_return_t validate_rec_reverse(log,synch)
    rvm_bool_t synch;                   /* true ==> synchronization required */
    log_t *log;                         /* log descriptor */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t *status = &log->status; /* status area */
    rec_end_t *rec_end = NULL;          /* temporary cast for record end */
    rec_hdr_t *rec_hdr;                 /* temporary cast for record header */
    long tmp_ptr;                       /* temporary buffer ptr */
    rvm_length_t tmp_len;
    rvm_offset_t offset;                /* temp for offset calculations */
    rvm_return_t retval;

    /* get previous end marker into buffer */
    if ((long)(log_buf->ptr-sizeof(rec_end_t)) < 0)
    {
        /* marker lies before the current buffer load */
        offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                          log_buf->ptr);
        if (RVM_OFFSET_EQL(offset,status->log_start))
        {
            /* at physical start of log data area: previous record is the
               wrap marker near the end of the log */
            retval=scan_wrap_reverse(log,synch);
            return retval;              /* exit pointing to wrap marker */
        }
        else
        {
            /* reload the buffer ending at the current log offset */
            if ((retval=init_buffer(log,&offset,REVERSE,synch))
                != RVM_SUCCESS) return retval;
        }
    }
    log_buf->ptr -= sizeof(rec_end_t);

    /* check new end marker */
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    if (rec_end->rec_hdr.struct_id != rec_end_id)
        goto no_record;                 /* no next record */
    /* see if record will fit in buffer */
    if ((ROUND_TO_SECTOR_SIZE(rec_end->rec_hdr.rec_length+sizeof(rec_end_t))
         + SECTOR_SIZE) <= log_buf->length)
    {
        /* yes, get whole record in buffer */
        if ((long)(log_buf->ptr - rec_end->rec_hdr.rec_length) < 0)
        {
            /* refill buffer (be sure end marker is included) */
            log_buf->ptr += sizeof(rec_end_t);
            if ((retval=refill_buffer(log,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= sizeof(rec_end_t);
            /* buffer contents moved -- recompute the marker address */
            rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
        }
        tmp_ptr = log_buf->ptr - rec_end->rec_hdr.rec_length;
        rec_hdr = (rec_hdr_t *)&log_buf->buf[tmp_ptr];
    }
    else
    {
        /* no, save rec_end for validation & get header in aux. buffer */
        offset = RVM_SUB_LENGTH_FROM_OFFSET(log_buf->offset,
                                            rec_end->rec_hdr.rec_length);
        offset = RVM_ADD_LENGTH_TO_OFFSET(offset,log_buf->ptr);

        /* check offset alignment to see if rec_end is trash */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(offset);
        if (tmp_ptr != CHOP_TO_LENGTH(tmp_ptr))
            goto no_record;             /* header alignment wrong */
        retval = load_aux_buf(log, &offset, MAX_HDR_SIZE, &tmp_ptr, &tmp_len,
                              synch, rvm_false);
        if (retval != RVM_SUCCESS) return retval;
        if (tmp_ptr == -1)
            goto no_record;             /* record header not available */
        rec_hdr = (rec_hdr_t *)&log_buf->aux_buf[tmp_ptr];
    }

    /* validate whole record now that header is available */
    if (validate_hdr(log,rec_hdr,rec_end,REVERSE))
        return RVM_SUCCESS;

  no_record:
    log_buf->ptr = -1;                  /* no next record */
    return RVM_SUCCESS;
}
/* scan backward from present position at a record structure
   returns index of offset in ptr; -1 ==> no next rec. */
/*
 * The current position must be at a recognizable record structure
 * (asserted below).  Repositions over the current record (including any
 * nv ranges of a transaction) and validates the previous record via
 * validate_rec_reverse.
 */
rvm_return_t scan_reverse(log,synch)
    log_t *log;                         /* log descriptor */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t *status = &log->status; /* status area */
    rec_hdr_t *rec_hdr;                 /* temporary cast for record header */
    rvm_offset_t offset;                /* temp for offset calculations */
    rvm_return_t retval;

    assert(log_buf->ptr != -1);         /* can't reposition from this! */

    /* test if scan starting from tail */
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    if (RVM_OFFSET_EQL(offset,status->prev_log_tail)
        || (rvm_utlsw && RVM_OFFSET_EQL(offset,status->log_tail)))
        return validate_rec_reverse(log,synch);

    /* test if at start of log & must wrap around */
    if ((RVM_OFFSET_EQL(log_buf->offset,status->log_start)) &&
        (log_buf->ptr == 0))
    {
        if ((retval=scan_wrap_reverse(log,synch)) != RVM_SUCCESS)
            return retval;
        return RVM_SUCCESS;             /* exit pointing to wrap marker */
    }

    /* move to previous record end marker */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
    {
      case trans_hdr_id: case log_seg_id:
      case log_wrap_id:
        /* already at a record header; nothing to skip */
        break;
      case rec_end_id:
        if (((rec_end_t *)rec_hdr)->rec_type != trans_hdr_id)
        {                               /* record is always in buffer */
            log_buf->ptr -= rec_hdr->rec_length;
            break;
        }
        /* fall through: end of a transaction -- scan back over its
           nv ranges to reach the transaction header */
      case nv_range_id:                 /* scan past remaining ranges */
        DO_FOREVER
        {
            if ((retval=scan_nv_reverse(log,synch)) != RVM_SUCCESS)
                return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            if (rec_hdr->struct_id == trans_hdr_id)
                break;
        }
        break;
      default:
        {
            if (rvm_utlsw)
            {
                log_buf->ptr = -1;      /* utl can recover */
                return RVM_SUCCESS;
            }
            assert(rvm_false);          /* not at recognizable point in log */
        }
    }

    /* validate new record and set log_buf->ptr */
    return validate_rec_reverse(log,synch);
}
1103 /* Recovery: phase 1 -- locate current log tail from last status block
1104 location */
1105
1106 /* log_wrap status update for tail location */
set_wrap_status(status,rec_hdr)1107 static void set_wrap_status(status,rec_hdr)
1108 log_status_t *status; /* status descriptor */
1109 rec_hdr_t *rec_hdr; /* current record scanned in buffer */
1110 {
1111 status->wrap_time = rec_hdr->timestamp;
1112 status->n_special++;
1113 status->tot_wrap++;
1114 }
/* range checksum computation & check */
/*
 * Computes the checksum of the nv range's data (which may span several
 * buffer loads) and compares it against the checksum recorded in the
 * range header.  *chk_val is set rvm_true iff the sums match.
 * Side effect: advances log_buf->ptr past the range data (rounded to
 * length alignment).  NOTE(review): refill_buffer can replace the buffer
 * contents, so the header fields are copied out before the loop.
 */
static rvm_return_t range_chk_sum(log,nv,chk_val,synch)
    log_t *log;                         /* log descriptor */
    nv_range_t *nv;                     /* range header */
    rvm_bool_t *chk_val;                /* result [out] */
    rvm_bool_t synch;                   /* true ==> synchronization required */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t nv_chk_sum;            /* nv's check sum */
    rvm_length_t chk_sum_temp = 0;      /* check sum temp */
    rvm_length_t nv_length;             /* actual length of data */
    rvm_length_t chk_length;            /* length of check summed range */
    rvm_length_t align_skew;            /* initial alignment skew */
    rvm_return_t retval;                /* return value */

    /* assume failure; copy header fields before the buffer can change */
    (*chk_val) = rvm_false;
    nv_chk_sum = nv->chk_sum;
    nv_length = nv->length;
    align_skew = BYTE_SKEW(RVM_OFFSET_TO_LENGTH(nv->offset));
    log_buf->ptr += sizeof(nv_range_t); /* step past range header to data */

    /* do checksum over as many buffer loads as needed */
    DO_FOREVER
    {
        /* sum whatever part of the range the current load holds */
        chk_length = log_buf->r_length - log_buf->ptr - align_skew;
        if (chk_length > nv_length) chk_length = nv_length;
        chk_sum_temp +=
            chk_sum(&log_buf->buf[log_buf->ptr+align_skew],
                    chk_length);
        nv_length -= chk_length;
        log_buf->ptr += (chk_length+align_skew);
        if (nv_length == 0) break;      /* done */
        if ((retval=refill_buffer(log,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        align_skew = 0;                 /* following buffers have no padding */
    }
    /* realign buffer ptr for the next record structure */
    log_buf->ptr = ROUND_TO_LENGTH(log_buf->ptr);

    /* report result */
    if (nv_chk_sum == chk_sum_temp)
        (*chk_val) = rvm_true;

    return RVM_SUCCESS;
}
/* transaction validation & status update for tail location */
/*
 * Scans forward over all nv ranges of the transaction whose header is at
 * log_buf->ptr, verifying each range's checksum and that the range
 * belongs to this transaction (matching rec_num).  If the transaction is
 * complete and valid, updates the commit statistics in the status area;
 * otherwise sets log_buf->ptr = -1 so the caller stops extending the
 * tail.  Always returns RVM_SUCCESS unless a buffer operation fails.
 */
static rvm_return_t set_trans_status(log,rec_hdr)
    log_t *log;                         /* log descriptor */
    rec_hdr_t *rec_hdr;                 /* current trans record in buffer */
{
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t *status = &log->status; /* status descriptor */
    trans_hdr_t trans_hdr;              /* copy of header */
    long num_ranges = 0;                /* range scan counter */
    nv_range_t *nv;                     /* range header */
    rvm_bool_t chk_val;                 /* checksum test result */
    rvm_return_t retval;                /* return value */

    /* keep copy of header to get status if ranges are OK
       (buffer refills during the scan can overwrite the original) */
    BCOPY((char *)rec_hdr,(char *)&trans_hdr,sizeof(trans_hdr_t));

    /* scan and check sum all ranges */
    log_buf->ptr += sizeof(trans_hdr_t);
    DO_FOREVER
    {
        if ((retval=scan_nv_forward(log,NO_SYNCH)) != RVM_SUCCESS)
            return retval;
        rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
        if (rec_hdr->struct_id == rec_end_id)
            break;                      /* done */
        if (rec_hdr->struct_id != nv_range_id)
            goto bad_record;            /* invalid record */
        nv = (nv_range_t *)rec_hdr;
        if (trans_hdr.rec_hdr.rec_num != nv->rec_hdr.rec_num)
            goto bad_record;            /* wrong transaction */

        /* test range's data check sum */
        if ((retval=range_chk_sum(log,nv,&chk_val,NO_SYNCH))
            != RVM_SUCCESS) return retval;
        if (chk_val != rvm_true) goto bad_record; /* check sum failure */

        num_ranges++;
    }
    /* be sure all ranges are present */
    if (num_ranges != trans_hdr.num_ranges)
        goto bad_record;                /* incomplete */

    /* transaction complete, update status */
    status->last_uname = trans_hdr.uname;
    if (trans_hdr.flags & FLUSH_FLAG)
        status->n_flush_commit++;
    else status->n_no_flush_commit++;
    /* count transactions split across a log wrap: first entry seen
       but last entry flag not set */
    if (((trans_hdr.flags & FIRST_ENTRY_FLAG) != 0)
        && ((trans_hdr.flags & LAST_ENTRY_FLAG) == 0))
        status->n_split++;
    return RVM_SUCCESS;

  bad_record:
    log_buf->ptr = -1;                  /* signal invalid/incomplete trans */
    return RVM_SUCCESS;
}
/* Locate tail, update in-memory copy of status block; always reads forward */
/*
 * Recovery phase 1: starting from the recorded log head, scans forward
 * validating records until an invalid one is found; the valid prefix
 * determines the true log tail.  Updates head/tail offsets, record
 * numbers, timestamps and commit statistics in the in-memory status
 * block.  rvm_utlsw is forced off for the duration so the scanners use
 * strict (non-utility) validation, and last_trunc temporarily holds a
 * fresh timestamp for record validation; both are restored on exit.
 */
rvm_return_t locate_tail(log)
    log_t *log;                         /* log descriptor */
{
    log_status_t *status = &log->status; /* status descriptor */
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t tail;                  /* tail offset */
    rvm_offset_t temp_tail;             /* tail offset temp */
    rvm_length_t last_rec_num = 0;      /* record number of tail record */
    rec_hdr_t *rec_hdr;                 /* current record scanned in buffer */
    struct timeval save_last_trunc;
    struct timeval last_write = status->last_write; /* last write to log */
    rvm_bool_t save_rvm_utlsw = rvm_utlsw;
    rvm_return_t retval = RVM_SUCCESS;  /* return value */

    /* must be called by the truncation thread, before any phase began */
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES) == ZERO);
    status->trunc_state |= RVM_TRUNC_FIND_TAIL;

    /* initialize scanner sequence checking state and buffers */
    rvm_utlsw = rvm_false;
    reset_hdr_chks(log);
    clear_aux_buf(log);

    /* if truncation caught in crash, reset head */
    if (!RVM_OFFSET_EQL_ZERO(status->prev_log_head))
    {
        status->log_head = status->prev_log_head;
        status->last_rec_num = status->next_rec_num-1;
    }

    /* set temporary timestamp for record validation */
    save_last_trunc = status->last_trunc;
    make_uname(&status->last_trunc);
    if (TIME_GTR(save_last_trunc,status->last_trunc))
    {                                   /* date/time wrong! */
        retval = RVM_EINTERNAL;
        rvm_errmsg = ERR_DATE_SKEW;
        goto err_exit;
    }

    /* need to update status: init read buffer at head */
    if ((retval=init_buffer(log,&status->log_head,
                            FORWARD,NO_SYNCH))
        != RVM_SUCCESS) goto err_exit;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);

    /* validate 1st record, none ==> log empty */
    rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
    if (!validate_hdr(log,rec_hdr,NULL,FORWARD))
    {
        /* empty log: tail == head; the *_LOG_TAIL_* blocks are
           debugging aids guarding/shadowing the tail field */
#ifdef RVM_LOG_TAIL_BUG
        unprotect_page__Fi(ClobberAddress);
#endif /* RVM_LOG_TAIL_BUG */
#ifdef RVM_LOG_TAIL_SHADOW
        assert(RVM_OFFSET_EQL(log_tail_shadow,status->log_tail));
#endif /* RVM_LOG_TAIL_SHADOW */
        status->log_tail = status->log_head;
#ifdef RVM_LOG_TAIL_SHADOW
        RVM_ASSIGN_OFFSET(log_tail_shadow,status->log_tail);
#endif /* RVM_LOG_TAIL_SHADOW */
#ifdef RVM_LOG_TAIL_BUG
        protect_page__Fi(ClobberAddress);
#endif /* RVM_LOG_TAIL_BUG */
        clear_log_status(log);
        goto exit;
    }
    /* update status block head info if necessary */
    if (status->first_rec_num == 0)
        status->first_rec_num = rec_hdr->rec_num;
    if (TIME_EQL_ZERO(status->first_write))
        status->first_write = rec_hdr->timestamp;
    if (rec_hdr->struct_id == log_wrap_id)
        status->wrap_time = rec_hdr->timestamp;

    /* locate first transaction, if needed */
    if (TIME_EQL_ZERO(status->first_uname))
        do
        {
            /* update other status data */
            rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
            last_rec_num = rec_hdr->rec_num;
            status->last_write = rec_hdr->timestamp;
            if (rec_hdr->struct_id == log_wrap_id)
                status->wrap_time = rec_hdr->timestamp;

            if (rec_hdr->struct_id == trans_hdr_id)
            {                           /* transaction found */
                status->first_uname = ((trans_hdr_t *)
                                       rec_hdr)->uname;
                status->last_uname = ((trans_hdr_t *)
                                      rec_hdr)->uname;
                break;
            }
            /* NOTE(review): duplicate of the wrap_time update a few
               lines above -- harmless */
            if (rec_hdr->struct_id == log_wrap_id)
                status->wrap_time = rec_hdr->timestamp;
            if ((retval=scan_forward(log,NO_SYNCH)) != RVM_SUCCESS)
                goto err_exit;
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_FIND_TAIL);
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;
        }
        while (log_buf->ptr != -1);     /* tail found, no transactions */

    /* re-init scanner sequence checking state since small logs can cause
       a few records to be rescanned and re-init read buffer at tail
    */
    tail = status->log_tail;
    reset_hdr_chks(log);
    if ((retval=init_buffer(log,&tail,FORWARD,NO_SYNCH))
        != RVM_SUCCESS) goto err_exit;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
    /* see if record at tail is valid, scan until bad record found */
    if ((retval=validate_rec_forward(log,NO_SYNCH)) != RVM_SUCCESS)
        goto err_exit;
    DO_FOREVER
    {
        if (log_buf->ptr == -1) break;  /* tail located */

        /* compute provisional new tail offset, rec_num, timestamp */
        rec_hdr = (rec_hdr_t *)(&log_buf->buf[log_buf->ptr]);
        temp_tail = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                             (log_buf->ptr+rec_hdr->rec_length
                                              +sizeof(rec_end_t)));
        last_rec_num = rec_hdr->rec_num;
        last_write = rec_hdr->timestamp;

        /* type-specific status data recovery */
        switch (rec_hdr->struct_id)
        {
          case log_wrap_id:
            set_wrap_status(status,rec_hdr);
            tail = status->log_start;   /* wrap: tail restarts at log start */
            break;

          case trans_hdr_id:
            if ((retval=set_trans_status(log,rec_hdr)) != RVM_SUCCESS)
                goto err_exit;
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_FIND_TAIL);
            if (log_buf->ptr != -1)
                tail = temp_tail;       /* update if trans OK */
            break;

          case log_seg_id:
            status->n_special++;
            tail = temp_tail;
            break;

          default: assert(rvm_false);   /* error - should have header */
        }

        /* scan to next record */
        if (log_buf->ptr == -1) break;  /* tail located */
        if ((retval=scan_forward(log,NO_SYNCH)) != RVM_SUCCESS)
            goto err_exit;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
        if (rvm_chk_sigint != NULL)     /* test for interrupt */
            if ((*rvm_chk_sigint)(NULL)) goto err_exit;
    }
    /* tail found, update in-memory status */
#ifdef RVM_LOG_TAIL_BUG
    unprotect_page__Fi(ClobberAddress);
#endif /* RVM_LOG_TAIL_BUG */
#ifdef RVM_LOG_TAIL_SHADOW
    assert(RVM_OFFSET_EQL(log_tail_shadow,status->log_tail));
#endif /* RVM_LOG_TAIL_SHADOW */
    status->log_tail = tail;
#ifdef RVM_LOG_TAIL_SHADOW
    RVM_ASSIGN_OFFSET(log_tail_shadow,status->log_tail);
#endif /* RVM_LOG_TAIL_SHADOW */
#ifdef RVM_LOG_TAIL_BUG
    protect_page__Fi(ClobberAddress);
#endif /* RVM_LOG_TAIL_BUG */
    status->last_write = last_write;
    if (RVM_OFFSET_EQL(status->log_head,status->log_tail))
        clear_log_status(log);          /* log empty */
    else
    {                                   /* log not empty */
        status->log_empty = rvm_false;

        if (status->next_rec_num <= last_rec_num)
            status->next_rec_num = last_rec_num+1;
        if (status->last_rec_num != last_rec_num)
            status->last_rec_num = last_rec_num;
    }

  exit:                                 /* success path falls into err_exit */
    status->valid = rvm_true;
  err_exit:                             /* restore saved globals/timestamp */
    rvm_utlsw = save_rvm_utlsw;
    status->last_trunc = save_last_trunc;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
    return retval;
}
1417 /* add segment short id to dictionary */
enter_seg_dict(log,seg_code)1418 rvm_return_t enter_seg_dict(log,seg_code)
1419 log_t *log;
1420 long seg_code;
1421 {
1422 seg_dict_t *seg_dict;
1423 long old_dict_size,new_dict_size;
1424
1425 /* lengthen seg_dict_vec if necessary */
1426 if (log->seg_dict_len < seg_code)
1427 {
1428 new_dict_size = seg_code*sizeof(seg_dict_t);
1429 old_dict_size = log->seg_dict_len*sizeof(seg_dict_t);
1430 log->seg_dict_vec = (seg_dict_t *)
1431 REALLOC((char *)log->seg_dict_vec,new_dict_size);
1432 if (log->seg_dict_vec == NULL)
1433 return RVM_ENO_MEMORY;
1434 (void)BZERO((char *)((long)log->seg_dict_vec+old_dict_size),
1435 new_dict_size-old_dict_size);
1436 log->seg_dict_len = seg_code;
1437 }
1438
1439 /* enter in dictionary if not already defined */
1440 seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(seg_code)];
1441 if (seg_dict->struct_id != seg_dict_id)
1442 {
1443 seg_dict->struct_id = seg_dict_id;
1444 seg_dict->seg_code = seg_code;
1445 seg_dict->seg = NULL;
1446 init_tree_root(&seg_dict->mod_tree);
1447 (void)dev_init(&seg_dict->dev,NULL);
1448 }
1449 return RVM_SUCCESS;
1450 }
/* complete definition of seg_dict entry */
/*
 * Processes a log segment definition record: makes sure a dictionary
 * entry exists for its short id, then looks up the segment by name.
 * If the segment is not currently mapped (recovery / utility case),
 * records the device name and size in the dictionary entry so the
 * device can be opened later.
 */
rvm_return_t def_seg_dict(log,rec_hdr)
    log_t *log;                         /* log descriptor */
    rec_hdr_t *rec_hdr;                 /* log segment definition descriptor
                                           (with log record header) */
{
    log_seg_t *log_seg;                 /* log segment definition descriptor */
    seg_dict_t *seg_dict;               /* segment dictionary entry */
    char *seg_name;                     /* ptr to segment name in seg_dict rec */
    device_t *dev;                      /* device descriptor */
    rvm_return_t retval;

    assert(rec_hdr->struct_id == log_seg_id);
    /* log_seg body immediately follows the generic record header */
    log_seg = (log_seg_t *)RVM_ADD_LENGTH_TO_ADDR(rec_hdr,
                                                  sizeof(rec_hdr_t));

    /* create dictionary entry if necessary */
    if ((retval=enter_seg_dict(log,log_seg->seg_code)) != RVM_SUCCESS)
        return retval;
    seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(log_seg->seg_code)];

    /* if segment not defined, set device name (open later) */
    seg_name = (char *)((rvm_length_t)rec_hdr+LOG_SPECIAL_SIZE);
    seg_dict->seg = seg_lookup(seg_name,&retval);
    if (seg_dict->seg == NULL)
    {
        /* only legal while recovering or running rvmutl */
        assert(log->in_recovery || rvm_utlsw);
        dev = &seg_dict->dev;
        /* name_len+1 leaves room for the terminator; assumes the name
           in the log record is NUL-terminated -- per record format */
        dev->name = malloc(log_seg->name_len+1);
        if (dev->name == NULL)
            return RVM_ENO_MEMORY;
        (void)strcpy(dev->name,seg_name);
        dev->num_bytes = log_seg->num_bytes;
    }

    return RVM_SUCCESS;
}
1488 /* change tree comparator for tree_insert */
cmp_partial_include(node1,node2)1489 static long cmp_partial_include(node1,node2)
1490 dev_region_t *node1;
1491 dev_region_t *node2;
1492 {
1493 return dev_partial_include(&node1->offset,&node1->end_offset,
1494 &node2->offset,&node2->end_offset);
1495 }
1496
1497 /* set length of change tree node from offsets */
set_node_length(node)1498 static void set_node_length(node)
1499 dev_region_t *node; /* change tree node */
1500 {
1501 rvm_offset_t offset_temp; /* offset arithmetic temp */
1502
1503 offset_temp = RVM_SUB_OFFSETS(node->end_offset,node->offset);
1504 assert(RVM_OFFSET_LEQ(offset_temp,node->end_offset)); /* overflow! */
1505 node->length = RVM_OFFSET_TO_LENGTH(offset_temp);
1506
1507 }
change_tree_insert(seg_dict,node)1508 static rvm_return_t change_tree_insert(seg_dict,node)
1509 seg_dict_t *seg_dict; /* seg_dict for this nv */
1510 dev_region_t *node; /* change tree node for this nv */
1511 {
1512 dev_region_t *x_node; /* existing node if conflict */
1513 dev_region_t *split_node; /* ptr to created node, when used */
1514 rvm_length_t log_diff; /* adjustment to log/nv_buf offset */
1515 long cmpval; /* comparison return value */
1516 char *shadow_vmaddr; /* vmaddr of shadowed data */
1517 rvm_length_t shadow_length = 0; /* length of shadowed data */
1518 rvm_length_t shadow_skew = 0; /* byte skew of shadowed data */
1519 char *shadow_ptr = NULL; /* ptr to shadowed data in vm */
1520 rvm_offset_t shadow_offset; /* offset of shadowed data in log */
1521 rvm_return_t retval;
1522
1523 /* try to insert node & see if values already there */
1524 if (node->length == 0) goto free_node; /* eliminate zero-length nodes */
1525
1526 if (num_nodes-- == 0)
1527 {
1528 num_nodes = NODES_PER_YIELD;
1529 if (!(default_log->in_recovery || rvm_utlsw))
1530 {
1531 if (!rvm_no_yield) cthread_yield(); /* allow reschedule */
1532 }
1533 }
1534 assert(default_log->trunc_thread == cthread_self());
1535 assert((default_log->status.trunc_state & RVM_TRUNC_PHASES)
1536 == RVM_TRUNC_BUILD_TREE);
1537
1538 if (tree_insert(&seg_dict->mod_tree,node,cmp_partial_include))
1539 {
1540 if (rvm_chk_len != 0) /* do monitoring */
1541 monitor_vmaddr(node->vmaddr,node->length,node->nv_ptr,
1542 &node->log_offset,NULL,
1543 "change_tree_insert: inserting entire range");
1544 return RVM_SUCCESS; /* no shadowed values */
1545 }
1546 x_node = (dev_region_t *) /* get existing node */
1547 (seg_dict->mod_tree.traverse[seg_dict->mod_tree.level].ptr);
1548
1549 /* some values already there: test existing node spans new */
1550 if (dev_total_include(&node->offset,&node->end_offset,
1551 &x_node->offset,&x_node->end_offset) == 0)
1552 {
1553 if (rvm_chk_len != 0) /* do monitoring */
1554 monitor_vmaddr(node->vmaddr,node->length,NULL,NULL,NULL,
1555 "change_tree_insert: all values shadowed");
1556 goto free_node; /* yes, all values shadowed */
1557 }
1558 /* some shadowed, test if new values span existing node */
1559 if ((cmpval=dev_total_include(&x_node->offset,&x_node->end_offset,
1560 &node->offset,&node->end_offset)) == 0)
1561 if (RVM_OFFSET_LSS(node->offset,x_node->offset))
1562 { /* make node for preceeding values */
1563 if ((split_node=make_dev_region()) == NULL)
1564 return RVM_ENO_MEMORY;
1565 if (node->nv_buf != NULL)
1566 {
1567 assert(RVM_OFFSET_EQL_ZERO(node->log_offset));
1568 assert(node->nv_buf->struct_id == nv_buf_id);
1569 split_node->nv_buf = node->nv_buf;
1570 node->nv_buf->ref_cnt++;
1571 split_node->nv_ptr = node->nv_ptr;
1572 }
1573 else
1574 assert(node->nv_ptr == NULL);
1575
1576 /* complete the new node */
1577 split_node->offset = node->offset;
1578 split_node->end_offset = x_node->offset;
1579 split_node->log_offset = node->log_offset;
1580 split_node->vmaddr = node->vmaddr;
1581 set_node_length(split_node);
1582 node->vmaddr += split_node->length;
1583 node->offset = RVM_ADD_LENGTH_TO_OFFSET(node->offset,
1584 split_node->length);
1585 log_diff = split_node->length +
1586 BYTE_SKEW(RVM_OFFSET_TO_LENGTH(split_node->offset));
1587
1588 if (node->nv_ptr != NULL)
1589 node->nv_ptr = (char *)CHOP_TO_LENGTH(
1590 RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,log_diff));
1591 else
1592 node->log_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
1593 RVM_ADD_LENGTH_TO_OFFSET(split_node->log_offset,
1594 log_diff));
1595
1596 /* insert split node in tree */
1597 if (rvm_chk_len != 0) /* do monitoring */
1598 monitor_vmaddr(split_node->vmaddr,split_node->length,
1599 NULL,NULL,NULL,
1600 "change_tree_insert: inserting split node");
1601 if ((retval=change_tree_insert(seg_dict,split_node))
1602 != RVM_SUCCESS) return retval;
1603 }
1604 /* test if new values follow existing node */
1605 shadow_skew = BYTE_SKEW(RVM_OFFSET_TO_LENGTH(node->offset));
1606 if (cmpval <= 0)
1607 {
1608 /* yes, reset starting offset */
1609 shadow_vmaddr = node->vmaddr;
1610 shadow_length = RVM_OFFSET_TO_LENGTH(
1611 RVM_SUB_OFFSETS(x_node->end_offset,node->offset));
1612 shadow_ptr = node->nv_ptr;
1613 shadow_offset = node->log_offset;
1614 node->offset = x_node->end_offset;
1615 set_node_length(node);
1616 if (node->nv_ptr != NULL) /* adjust buffer pointer */
1617 node->nv_ptr = (char *)CHOP_TO_LENGTH(
1618 RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,
1619 shadow_length+shadow_skew));
1620 else /* adjust log offset */
1621 node->log_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
1622 RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
1623 shadow_length+shadow_skew));
1624 node->vmaddr = RVM_ADD_LENGTH_TO_ADDR(node->vmaddr,
1625 shadow_length);
1626 }
1627 else
1628 /* new values preceed existing node, but don't span it */
1629 { /* reset end offset */
1630 node->end_offset = x_node->offset;
1631 shadow_length = node->length; /* save old length */
1632 set_node_length(node);
1633 shadow_length -= node->length; /* correct for new length */
1634 shadow_vmaddr = RVM_ADD_LENGTH_TO_ADDR(node->vmaddr,
1635 node->length);
1636 if (node->nv_ptr != NULL)
1637 shadow_ptr = (char *)CHOP_TO_LENGTH(
1638 RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,
1639 shadow_length+shadow_skew));
1640 shadow_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
1641 RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
1642 shadow_length+shadow_skew));
1643 }
1644 /* insert modified node */
1645 if (rvm_chk_len != 0) /* do monitoring */
1646 {
1647 if (shadow_length != 0)
1648 monitor_vmaddr(shadow_vmaddr,shadow_length,shadow_ptr,
1649 &shadow_offset,NULL,
1650 "change_tree_insert: values shadowed");
1651 monitor_vmaddr(node->vmaddr,node->length,NULL,NULL,NULL,
1652 "change_tree_insert: inserting non-shadowed values");
1653 }
1654 return change_tree_insert(seg_dict,node);
1655
1656 free_node:
1657 free_dev_region(node);
1658 return RVM_SUCCESS;
1659 }
1660 /* prepare new value record for seg_dict's mod_tree
1661 if new values are <= nv_local_max, they must be in buffer */
do_nv(log,nv)1662 static rvm_return_t do_nv(log,nv)
1663 log_t *log;
1664 nv_range_t *nv;
1665 {
1666 log_status_t *status = &log->status; /* status descriptor */
1667 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
1668 seg_dict_t *seg_dict; /* seg_dict for this nv */
1669 dev_region_t *node; /* change tree node for this nv */
1670 rvm_length_t aligned_len; /* allocation temp */
1671 rvm_offset_t offset; /* monitoring temp */
1672 rvm_bool_t chk_val; /* checksum result */
1673 rvm_return_t retval; /* return value */
1674
1675 assert(log->trunc_thread == cthread_self());
1676 assert((status->trunc_state & RVM_TRUNC_PHASES)
1677 == RVM_TRUNC_BUILD_TREE);
1678 assert(nv->rec_hdr.struct_id == nv_range_id); /* not a nv range header */
1679 assert(TIME_EQL(log_buf->timestamp,nv->rec_hdr.timestamp));
1680
1681 if (rvm_chk_len != 0) /* do monitoring */
1682 {
1683 offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
1684 log_buf->ptr+sizeof(nv_range_t));
1685 monitor_vmaddr(nv->vmaddr, nv->length, NULL, &offset,
1686 &nv->rec_hdr, "do_nv: data from log");
1687 }
1688
1689 if (nv->length == 0) return RVM_SUCCESS; /* ignore null changes */
1690
1691 /* be sure in segment dictionary */
1692 if ((retval=enter_seg_dict(log,nv->seg_code)) != RVM_SUCCESS)
1693 return retval;
1694 seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(nv->seg_code)];
1695
1696 /* make a tree node for changes */
1697 if ((node = make_dev_region()) == NULL) return RVM_ENO_MEMORY;
1698 node->offset = nv->offset;
1699 node->end_offset = RVM_ADD_LENGTH_TO_OFFSET(nv->offset,nv->length);
1700 node->length = nv->length;
1701 node->vmaddr = nv->vmaddr;
1702 /* see if mods small enough to keep in vm */
1703 if (nv->length <= NV_LOCAL_MAX)
1704 { /* yes, get some space for nv */
1705 aligned_len = ALIGNED_LEN(RVM_OFFSET_TO_LENGTH(nv->offset),
1706 nv->length);
1707 if ((node->nv_buf=(nv_buf_t *)malloc(NV_BUF_SIZE(aligned_len)))
1708 == NULL) return RVM_ENO_MEMORY;
1709 node->nv_buf->struct_id = nv_buf_id;
1710 node->nv_buf->alloc_len = NV_BUF_SIZE(aligned_len);
1711 node->nv_buf->ref_cnt = 1;
1712 node->nv_buf->chk_sum = nv->chk_sum;
1713 node->nv_buf->data_len = nv->length;
1714 node->nv_ptr = (char *)&node->nv_buf->buf;
1715 assert(((rvm_length_t)nv+sizeof(nv_range_t))
1716 >= (rvm_length_t)default_log->log_buf.buf);
1717 assert(((rvm_length_t)nv+sizeof(nv_range_t))
1718 < ((rvm_length_t)default_log->log_buf.buf
1719 +default_log->log_buf.r_length));
1720
1721 /* basic BCOPY will not change alignment since buffer padded */
1722 (void)BCOPY(RVM_ADD_LENGTH_TO_ADDR(nv,sizeof(nv_range_t)),
1723 node->nv_ptr,aligned_len);
1724 }
1725 else
1726 /* no, set offset in log for nv's */
1727 node->log_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
1728 (rvm_length_t)nv-(rvm_length_t)log_buf->buf
1729 +sizeof(nv_range_t));
1730
1731 /* put in change tree */
1732 if ((retval=change_tree_insert(seg_dict,node)) != RVM_SUCCESS)
1733 return retval;
1734
1735 /* see if complete check sum test wanted */
1736 if (rvm_chk_sum)
1737 {
1738 if ((retval=range_chk_sum(log,nv,&chk_val,SYNCH))
1739 != RVM_SUCCESS) return retval;
1740 assert(chk_val == rvm_true); /* check sum failure */
1741 if ((retval=scan_nv_reverse(log,SYNCH)) != RVM_SUCCESS)
1742 return retval;
1743 assert(log->trunc_thread == cthread_self());
1744 assert((status->trunc_state & RVM_TRUNC_PHASES)
1745 == RVM_TRUNC_BUILD_TREE);
1746 }
1747
1748 return RVM_SUCCESS;
1749 }
/* scan modifications of transaction in reverse order & build tree */
/*
 * Called with log_buf->ptr at a transaction's end marker.  Scans the
 * transaction's nv ranges backward (newest first, so change_tree_insert
 * can let earlier-seen data shadow later), feeding each range to do_nv
 * unless skip_trans is set.  Verifies range ordering and counts against
 * the transaction header on the way out.
 */
static rvm_return_t do_trans(log,skip_trans)
    log_t *log;                         /* log descriptor */
    rvm_bool_t skip_trans;              /* scan, but ignore if true */
{
    log_status_t *status = &log->status; /* status descriptor */
    log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */

    rec_hdr_t *rec_hdr;                 /* last record header scanned */
    rec_end_t *rec_end;                 /* end marker for transaction */
    trans_hdr_t *trans_hdr;             /* transaction header ptr */
    long num_ranges = 0;                /* ranges processed */
    long prev_range = 0;                /* previous range number */
    rvm_return_t retval;                /* return value */

    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);

    /* remember the transaction's timestamp and scan ranges */
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    assert(rec_end->rec_hdr.struct_id == rec_end_id);
    log_buf->timestamp = rec_end->rec_hdr.timestamp;
    DO_FOREVER
    {
        if ((retval=scan_nv_reverse(log,SYNCH)) != RVM_SUCCESS)
            return retval;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_BUILD_TREE);
        rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];

        /* test for end */
        if (rec_hdr->struct_id == trans_hdr_id)
            break;                      /* done */

        /* check order and process the range */
        assert(rec_hdr->struct_id == nv_range_id);
        /* range numbers must descend by exactly 1 in reverse scan */
        if (prev_range != 0)
            assert(((nv_range_t *)rec_hdr)->range_num == (prev_range-1));
        if (!skip_trans)
            if ((retval=do_nv(log,(nv_range_t *)rec_hdr))
                != RVM_SUCCESS) return retval;

        /* tally ranges processed */
        num_ranges++;
        prev_range = ((nv_range_t *)rec_hdr)->range_num;
    }

    /* sanity checks at the end... */
    trans_hdr = (trans_hdr_t *)rec_hdr;
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    assert(TIME_EQL(trans_hdr->rec_hdr.timestamp,log_buf->timestamp));
    assert(trans_hdr->num_ranges == num_ranges);
    if (num_ranges != 0) assert(prev_range == 1);

    return RVM_SUCCESS;
}
/* log wrap-around validation
 *
 * Called from build_tree when the transaction whose end marker is at
 * log_buf->ptr may have been split by the log's wrap-around point.
 * Verifies that both pieces of a split transaction record are present
 * and mutually consistent (matching uname, consecutive record numbers
 * bracketing the wrap marker).
 *
 *   log            log descriptor; log->log_buf holds the scan state
 *   force_wrap_chk if true, the caller suspects a bad wrap and the
 *                  header must be validated wherever it lies
 *   skip_trans     out: set rvm_true when force_wrap_chk is on and the
 *                  record is an incomplete split, so the caller should
 *                  discard the transaction
 *
 * Returns RVM_SUCCESS when the record is whole (or the split checks
 * out), otherwise propagates the error from the scan/buffer helpers.
 * Side effect: on a verified split, the buffer is re-initialized at the
 * last transaction's end marker and log_buf->split_ok is set so the
 * second half is not re-checked.
 */
static rvm_return_t chk_wrap(log,force_wrap_chk,skip_trans)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      force_wrap_chk;     /* wrap check required if true */
    rvm_bool_t      *skip_trans;        /* set true if bad split */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t    offset;             /* offset temp */
    rvm_offset_t    end_offset;         /* offset of last trans end marker */
    rec_end_t       *rec_end;           /* last record scanned in buffer */
    trans_hdr_t     last_trans_hdr;     /* last transaction record header */
    trans_hdr_t     *trans_hdr;         /* header temporary */
    log_wrap_t      *log_wrap;          /* wrap-around marker */
    long            tmp_ptr;            /* buffer index temp */
    long            data_len;           /* length temporary */
    rvm_return_t    retval;             /* return value */

    *skip_trans = rvm_false;
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    /* back up from the end marker to the record's transaction header */
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,rec_end->rec_hdr.rec_length);

    /* check if transaction header is at start of log data area */
    if (!RVM_OFFSET_EQL(offset,status->log_start) && (!force_wrap_chk))
        return RVM_SUCCESS;             /* no, nothing more needed */

    /* get header */
    if (force_wrap_chk)
        {
        /* header can be anywhere */
        if (RVM_OFFSET_LSS(offset,log_buf->offset))
            {
            /* header lies before the recovery buffer: fetch it via aux_buf */
            retval = load_aux_buf(log,&offset,sizeof(trans_hdr_t),
                                  &tmp_ptr,&data_len,SYNCH,rvm_false);
            if (retval != RVM_SUCCESS) return retval;
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_BUILD_TREE);
            assert(data_len >= sizeof(trans_hdr_t));
            trans_hdr = (trans_hdr_t *)&log_buf->aux_buf[tmp_ptr];
            }
        else
            trans_hdr = (trans_hdr_t *)&log_buf->buf[log_buf->ptr
                                             -rec_end->rec_hdr.rec_length];
        }
    else
        /* header is at start of aux_buf or recovery buffer */
        if (RVM_OFFSET_LSS(offset,log_buf->offset))
            trans_hdr = (trans_hdr_t *)log_buf->aux_buf;
        else
            trans_hdr = (trans_hdr_t *)log_buf->buf;

    /* check for split transaction
       (TRANS_HDR() presumably tests an entry flag bit in *trans_hdr --
       macro defined elsewhere in the module) */
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    if (TRANS_HDR(FIRST_ENTRY_FLAG)
        && TRANS_HDR(LAST_ENTRY_FLAG))
        return RVM_SUCCESS;             /* not split, nothing more needed */

    /* split, see if must check further or skip record */
    assert(TRANS_HDR(FIRST_ENTRY_FLAG) || TRANS_HDR(LAST_ENTRY_FLAG));
    if (!TRANS_HDR(LAST_ENTRY_FLAG))
        {
        if (log_buf->split_ok)
            {                           /* split previously checked */
            log_buf->split_ok = rvm_false;
            return RVM_SUCCESS;
            }
        if (force_wrap_chk)             /* if not last entry, trans not good */
            {
            *skip_trans = rvm_true;
            return RVM_SUCCESS;
            }
        }

    /* must make local copy and scan for first record of transaction;
       remember where the current end marker ends so the buffer can be
       restored after the backwards validation scans below */
    end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                          (log_buf->ptr+sizeof(rec_end_t)));
    (void)BCOPY(trans_hdr,&last_trans_hdr,sizeof(trans_hdr_t));
    if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
        return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);

    /* wrap-around had better be next... */
    assert((long)log_buf->ptr >= 0);
    log_wrap = (log_wrap_t *)&log_buf->buf[log_buf->ptr];
    assert(log_wrap->rec_hdr.struct_id == log_wrap_id);
    assert(log_wrap->rec_hdr.rec_num == (last_trans_hdr.rec_hdr.rec_num-1));

    /* now scan for first record of transaction */
    if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
        return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    assert((long)log_buf->ptr >= 0);
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    assert(rec_end->rec_hdr.struct_id == rec_end_id);
    /* check if the header is the first record of last transaction */
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,rec_end->rec_hdr.rec_length);
    if (RVM_OFFSET_LSS(offset,log_buf->offset))
        {
        /* header is in aux_buf */
        tmp_ptr = OFFSET_TO_SECTOR_INDEX(offset);
        trans_hdr = (trans_hdr_t *)&log_buf->aux_buf[tmp_ptr];
        }
    else
        {
        /* header is in recovery buffer */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(offset,
                                                       log_buf->offset));
        assert(tmp_ptr >= 0);
        trans_hdr = (trans_hdr_t *)&log_buf->buf[tmp_ptr];
        }

    /* sanity checks... the two halves must share a uname and the first
       half's record number must precede the wrap marker's */
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    assert(TRANS_HDR(FIRST_ENTRY_FLAG));
    assert(TIME_EQL(trans_hdr->uname,last_trans_hdr.uname));
    assert(trans_hdr->rec_hdr.rec_num == (last_trans_hdr.rec_hdr.rec_num-2));

    /* all is well, restore last transaction record */
    log_buf->prev_rec_num = 0;
    ZERO_TIME(log_buf->prev_timestamp);
    if ((retval=init_buffer(log,&end_offset,REVERSE,SYNCH))
        != RVM_SUCCESS) return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    log_buf->ptr -= sizeof(rec_end_t);
    log_buf->split_ok = rvm_true;       /* don't re-check this split */

    return RVM_SUCCESS;
    }
/* Recovery: phase 2 -- build modification trees, and
   construct dictionary of segment short names
*/
#define X(a)
/* build_tree
 *
 * Scans the previous epoch of the log in reverse, from its tail back to
 * its head, processing each record's end marker:
 *   - trans_hdr_id: validate a possible wrap-split (chk_wrap), then
 *     enter the transaction's new-value ranges into the per-segment
 *     modification trees (do_trans)
 *   - log_seg_id: enter the segment short-name definition into the
 *     segment dictionary (def_seg_dict)
 *   - log_wrap_id: note that a wrap check is needed for the next
 *     transaction unless the split was already verified
 *
 * Returns RVM_SUCCESS, or the first error from the scan/processing
 * helpers.  Also returns RVM_SUCCESS early if the rvmutl interrupt
 * hook fires.  The X() macro above expands to nothing; the X(...)
 * calls below are compiled-out trace points.
 */
static rvm_return_t build_tree(log)
    log_t           *log;               /* log descriptor */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_return_t    retval;             /* return value */
    rvm_offset_t    tail;               /* tail offset temp */
    rec_end_t       *rec_end;           /* last record scanned in buffer */
    rvm_length_t    trans_cnt = 0;      /* transactions processed */
    rvm_bool_t      force_wrap_chk = rvm_false; /* true if suspect bad wrap */
    rvm_bool_t      skip_trans;         /* true if bad wrap trans to be skipped */

    /* phase transition: FIND_TAIL (or ZERO) --> BUILD_TREE */
    assert(log->trunc_thread == cthread_self());
    assert(((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL)
           || ((status->trunc_state & RVM_TRUNC_PHASES) == ZERO));
    status->trunc_state = (status->trunc_state & (~RVM_TRUNC_FIND_TAIL))
        | RVM_TRUNC_BUILD_TREE;

    /* reset sequence checks and init scan buffers */
    X(reset_hdr)
    reset_hdr_chks(log);
    X(clear_aux)
    clear_aux_buf(log);
    X(init_buf)
    /* an empty previous epoch scans forward from log_start; otherwise
       start a reverse scan at the previous epoch's tail */
    if (RVM_OFFSET_EQL(status->prev_log_tail, status->log_start))
        retval = init_buffer(log,&status->log_start, FORWARD,SYNCH);
    else
        retval = init_buffer(log,&status->prev_log_tail, REVERSE,SYNCH);
    assert(retval == RVM_SUCCESS);
    assert(log->trunc_thread == cthread_self());
    X(done_init_buf)
    /* scan in reverse from tail to find records for uncommitted changes */
    num_nodes = NODES_PER_YIELD;
    log_buf->split_ok = rvm_false;      /* split records not checked yet */
    tail = status->prev_log_tail;       /* use previous epoch tail */
    while (!RVM_OFFSET_EQL(tail,status->prev_log_head))
        {
        X(start loop)
        if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
            return retval;
        X(done scan_reverse)
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_BUILD_TREE);
        if (rvm_chk_sigint != NULL)     /* test for interrupt */
            if ((*rvm_chk_sigint)(NULL)) return RVM_SUCCESS;
        assert((long)log_buf->ptr >= 0); /* log damage, invalid record */

        /* check type of end marker, do type-dependent processing */
        rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
        if (rec_end->rec_hdr.struct_id == log_wrap_id)
            {
            X(log_wrap)
            /* passed the wrap marker: next transaction needs a wrap
               check unless its split was already verified */
            if (!log_buf->split_ok)
                force_wrap_chk = rvm_true;
            }
        else
            {
            X(else)
            assert(rec_end->rec_hdr.struct_id == rec_end_id);
            switch (rec_end->rec_type)
                {
              case trans_hdr_id:        /* process transaction */
                X( trans_hdr_id: chk_wrap)
                if ((retval=chk_wrap(log,force_wrap_chk,&skip_trans))
                    != RVM_SUCCESS) return retval;
                force_wrap_chk = rvm_false;
                X( trans_hdr_id: do_trans)
                if ((retval=do_trans(log,skip_trans)) != RVM_SUCCESS)
                    return retval;
                X( trans_hdr_id: end)
                trans_cnt++;
                break;
              case log_seg_id:          /* enter seg short id in dictionary */
                X( log_seg_id: def_seg_dict)
                if ((retval=def_seg_dict(log,(rec_hdr_t *)
                                         RVM_SUB_LENGTH_FROM_ADDR(rec_end,
                                             rec_end->rec_hdr.rec_length)))
                    != RVM_SUCCESS) return retval;
                X( log_seg_id: done)
                log_buf->ptr -= rec_end->rec_hdr.rec_length;
                break;
              default: assert(rvm_false); /* trouble, log damage? */
                }
            }

        /* update local tail ptr */
        tail = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
        }

    /* leave buffer unprotected for later phases */
    /* MACH_RVM_PROTECT
     *
     * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_WRITE | VM_PROT_READ);
     */

    return RVM_SUCCESS;
    }
/* pre-scan change tree to see how much to read into buffer */
pre_scan(log,tree)2048 static dev_region_t *pre_scan(log,tree)
2049 log_t *log; /* log descriptor */
2050 tree_root_t *tree; /* current tree root */
2051 {
2052 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
2053 dev_region_t *last_node = NULL;
2054 dev_region_t *node; /* current change tree node */
2055 rvm_offset_t temp;
2056
2057 /* find node with least offset */
2058 node = (dev_region_t *)tree->root;
2059 /* XXX - Can node ever be NULL? If so, last_node can be random */
2060 /* I currently believe it must be NON-null */
2061 assert(node != NULL);
2062 while (node != NULL)
2063 {
2064 assert(node->links.node.struct_id == dev_region_id);
2065 last_node = node;
2066 node = (dev_region_t *)node->links.node.lss;
2067 }
2068 log_buf->offset = CHOP_OFFSET_TO_SECTOR_SIZE(last_node->offset);
2069
2070 /* scan for maximum offset node that will fit in buffer */
2071 node = (dev_region_t *)tree->root;
2072 while (node != NULL)
2073 {
2074 assert(node->links.node.struct_id == dev_region_id);
2075
2076 /* compute buffer extension for this node */
2077 temp = RVM_SUB_OFFSETS(node->end_offset,log_buf->offset);
2078 temp = ROUND_OFFSET_TO_SECTOR_SIZE(temp);
2079
2080 /* see if will fit in log buffer */
2081 if (RVM_OFFSET_GTR(temp,log_buf->buf_len))
2082 node = (dev_region_t *)node->links.node.lss; /* try smaller */
2083 else
2084 {
2085 /* see if there's another that will also fit */
2086 last_node = node;
2087 node = (dev_region_t *)node->links.node.gtr;
2088 }
2089 }
2090
2091 return last_node;
2092 }
/* merge large node disk-resident new values with segment data
 *
 * Used for nodes whose new-value data was too big to keep in memory and
 * therefore still lives in the log (node->log_offset != 0).  Streams
 * the data from the log (via aux_buf) into the segment image held in
 * log_buf->buf, flushing the buffer to the segment device each time it
 * fills; node fields (length, vmaddr, offset, log_offset) are advanced
 * as data is consumed.
 *
 *   log      log descriptor; log->cur_seg_dev is the target segment
 *   node     change-tree node to merge (must be dev_region_id)
 *   preload  true if the final partial sector of the modified region
 *            has already been read from the segment
 *
 * Returns RVM_SUCCESS, RVM_EIO on device errors, or an error from
 * load_aux_buf.
 */
static rvm_return_t disk_merge(log,node,preload)
    log_t           *log;               /* log descriptor */
    dev_region_t    *node;              /* node to merge */
    rvm_bool_t      preload;            /* end sector preload done if true */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t    data_len=0;         /* actual nv data length read */
    rvm_length_t    buf_ptr;            /* log buffer ptr */
    rvm_length_t    aux_ptr;            /* aux buffer ptr
                                           (compensates for sector alignment) */
    rvm_length_t    tmp_ptr;            /* temporary buffer ptr */
    long            rw_length;          /* actual i/o transfer length */
    rvm_offset_t    end_offset;         /* end offset temporary */
    rvm_return_t    retval;             /* return value */
    rvm_bool_t      was_preloaded = preload; /* save preload state */

    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_APPLY);
    assert(node->links.node.struct_id == dev_region_id);

    /* set log buffer pointer and end offset */
    end_offset = CHOP_OFFSET_TO_SECTOR_SIZE(node->end_offset);
    buf_ptr = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(node->offset,
                                                   log_buf->offset));
    node->log_offset = RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
                                                BYTE_SKEW(buf_ptr));
    DO_FOREVER
        {                               /* fill log buffer from aux buf */
        while ((buf_ptr < log_buf->length)
               && (node->length != 0))
            {
            /* see how much to get in this pass & load aux_buf:
               either enough to fill the log buffer, or all that the
               node has left, whichever is smaller */
            if ((log_buf->length-buf_ptr) < node->length)
                rw_length = log_buf->length-buf_ptr; /* fill log_buf */
            else
                rw_length = node->length; /* get all remaining */
            if ((retval=load_aux_buf(log,&node->log_offset,rw_length,
                                     &aux_ptr,&data_len,SYNCH,rvm_true))
                != RVM_SUCCESS) return retval;
            /* sanity checks and monitoring */
            assert((aux_ptr+data_len) <= log_buf->aux_rlength);
            assert((buf_ptr+data_len) <= log_buf->length);
            assert(BYTE_SKEW(aux_ptr) == BYTE_SKEW(node->vmaddr));
            assert((long)(node->length-data_len) >= 0);
            if (rvm_chk_len != 0)
                monitor_vmaddr(node->vmaddr,data_len,
                               &log_buf->aux_buf[aux_ptr],NULL,NULL,
                               "disk_merge: data read from log:");

            /* preload of last modified segment sector: when this pass
               crosses the node's final (partial) sector and it hasn't
               been preloaded, read that sector from the segment so the
               unmodified tail bytes are preserved on write-back */
            if (RVM_OFFSET_GTR(RVM_ADD_LENGTH_TO_OFFSET(
                                   node->offset,data_len),end_offset)
                && (!preload))
                {
                /* must load last sector of mods from segment */
                tmp_ptr = CHOP_TO_SECTOR_SIZE(buf_ptr+data_len);
                if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
                    {
                    cthread_yield();    /* allow reschedule */
                    assert(log->trunc_thread == cthread_self());
                    }
                if ((rw_length=read_dev(log->cur_seg_dev,&end_offset,
                                        &log_buf->buf[tmp_ptr],SECTOR_SIZE)) < 0)
                    return RVM_EIO;
                assert(log->trunc_thread == cthread_self());
                assert((status->trunc_state & RVM_TRUNC_PHASES)
                       == RVM_TRUNC_APPLY);
                assert(rw_length == SECTOR_SIZE);
                preload = rvm_true;

                /* monitor data from last sector */
                if (rvm_chk_len != 0)
                    monitor_vmaddr(node->vmaddr,data_len,
                                   &log_buf->buf[buf_ptr],NULL,NULL,
                                   "disk_merge: data read from segment:");
                }

            /* copy to segment (in log buffer) */
            (void)BCOPY(&log_buf->aux_buf[aux_ptr],
                        &log_buf->buf[buf_ptr],data_len);

            /* tally bytes merged & do monitoring */
            if (rvm_chk_len != 0)
                {
                monitor_vmaddr(node->vmaddr,data_len,
                               &log_buf->buf[buf_ptr],NULL,NULL,
                               "disk_merge: data merged to segment:");
                }
            /* advance the node past the data just consumed */
            node->length -= data_len;
            node->vmaddr += data_len;
            node->log_offset =
                RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,data_len);
            node->offset =
                RVM_ADD_LENGTH_TO_OFFSET(node->offset,data_len);
            buf_ptr += data_len;
            /* if done, set final write length */
            if (node->length == 0)
                {
                assert(RVM_OFFSET_EQL(node->offset,
                                      node->end_offset));
                end_offset =
                    RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,buf_ptr);
                assert(RVM_OFFSET_EQL(end_offset,node->end_offset));
                if (!was_preloaded)
                    log_buf->r_length = ROUND_TO_SECTOR_SIZE(buf_ptr);
                return RVM_SUCCESS;
                }
            }

        /* buffer is full: write it to the segment & monitor, then
           continue the outer loop with an empty buffer */
        assert(buf_ptr == log_buf->length);
        if ((rw_length=write_dev(log->cur_seg_dev,&log_buf->offset,
                                 log_buf->buf,log_buf->length,
                                 SYNCH))
            < 0) return RVM_EIO;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        assert(rw_length == log_buf->length);
        if (rvm_chk_len != 0)
            monitor_vmaddr(node->vmaddr-data_len,data_len,
                           &log_buf->buf[buf_ptr-data_len],NULL,NULL,
                           "disk_merge: data written to segment:");
        if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
            {
            cthread_yield();            /* allow reschedule */
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            }
        log_buf->offset =
            RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,buf_ptr);
        buf_ptr = 0;
        assert(OFFSET_TO_SECTOR_INDEX(log_buf->offset) == 0);
        }
    }
2232 /* merge node's new values with segment data in buffer */
merge_node(log,node,preload)2233 static rvm_return_t merge_node(log,node,preload)
2234 log_t *log; /* log descriptor */
2235 dev_region_t *node; /* current change tree node */
2236 rvm_bool_t preload; /* end sector preload done if true */
2237 {
2238 log_buf_t *log_buf = &log->log_buf; /* log buffer descriptor */
2239 rvm_length_t temp;
2240 rvm_return_t retval; /* return value */
2241
2242 /* do monitoring and merge node data into segment */
2243 if (RVM_OFFSET_EQL_ZERO(node->log_offset))
2244 { /* data in node */
2245 if (rvm_chk_len != ZERO)
2246 monitor_vmaddr(node->vmaddr,node->length,
2247 node->nv_ptr,NULL,NULL,
2248 "merge_node: data copied from node:");
2249 temp = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(node->offset,
2250 log_buf->offset));
2251 assert((temp+node->length) <= log_buf->r_length);
2252 dest_aligned_bcopy(node->nv_ptr,&log_buf->buf[temp],
2253 node->length);
2254 }
2255 else /* data on disk -- use aux_buf */
2256 if ((retval=disk_merge(log,node,preload)) != RVM_SUCCESS)
2257 return retval;
2258
2259 /* free node and check for yield */
2260 (void) free_dev_region(node);
2261 if (num_nodes-- == 0)
2262 {
2263 num_nodes = NODES_PER_YIELD;
2264 if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
2265 {
2266 cthread_yield(); /* allow reschedule */
2267 assert(log->trunc_thread == cthread_self());
2268 }
2269 }
2270
2271 return RVM_SUCCESS;
2272 }
2273
/* update_seg
 *
 * Applies one segment's change tree to its backing device.  Repeatedly
 * pre-scans the tree for a window of nodes that fits in the log buffer,
 * reads the corresponding segment data into the buffer, merges the
 * nodes' new values into it, and writes the buffer back.  The segment's
 * dev_lock is held for the duration unless running in recovery/rvmutl.
 *
 *   log       log descriptor
 *   seg_dict  dictionary entry whose mod_tree holds the changes
 *   seg_dev   open device descriptor for the segment (must not be the
 *             log device itself)
 *
 * Returns RVM_SUCCESS, RVM_EIO on device errors, or an error from
 * merge_node.  On interrupt (rvm_chk_sigint) exits early with the
 * current retval.
 */
static rvm_return_t update_seg(log,seg_dict,seg_dev)
    log_t           *log;               /* log descriptor */
    seg_dict_t      *seg_dict;          /* segment dictionary entry */
    device_t        *seg_dev;           /* segment device descriptor */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    long            r_length;           /* length of data transfered */
    rvm_bool_t      preload;            /* end sector preload done if true */
    char            *addr=NULL;         /* monitoring address */
    rvm_offset_t    temp;               /* offset temporary */
    dev_region_t    *node;              /* current node */
    dev_region_t    *last_node;         /* last node before buffer write */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */
    long            nodes_done = 0;

    /* sanity checks and initializations */
    assert(&log->dev != seg_dev);
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_APPLY);
    rvm_num_nodes = seg_dict->mod_tree.n_nodes;
    rvm_max_depth = seg_dict->mod_tree.max_depth;
    clear_aux_buf(log);

    /* process the change tree */
    if (!(log->in_recovery || rvm_utlsw)) /* begin segment dev_lock crit sec
                                           */
        {
        mutex_lock(&seg_dict->seg->dev_lock);
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        }
    while (seg_dict->mod_tree.root != NULL)
        {
        /* pre-scan tree to determine how to fill buffer;
           also sets log_buf->offset to the window's aligned start */
        last_node = pre_scan(log,&seg_dict->mod_tree);

        /* initialize buffer with segment data */
        temp = RVM_SUB_OFFSETS(last_node->end_offset,
                               log_buf->offset);
        temp = ROUND_OFFSET_TO_SECTOR_SIZE(temp);
        if (RVM_OFFSET_LEQ(temp,log_buf->buf_len))
            {
            /* node(s) fit in log buffer */
            log_buf->r_length = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(
                                                         last_node->end_offset,
                                                         log_buf->offset));
            log_buf->r_length =
                ROUND_TO_SECTOR_SIZE(log_buf->r_length);
            assert(log_buf->r_length <= log_buf->length);
            preload = rvm_true;
            }
        else
            {
            /* single node bigger than the whole buffer: read just one
               sector here and let disk_merge stream the rest */
            log_buf->r_length = SECTOR_SIZE; /* very large node!! */
            preload = rvm_false;
            }
        /* allow reschedule & do the read */
        if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
            {
            cthread_yield();
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            }
        if ((r_length=read_dev(seg_dev,&log_buf->offset,
                               log_buf->buf,log_buf->r_length)) < 0)
            {
            retval = RVM_EIO;
            goto err_exit;
            }
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        assert(r_length == log_buf->r_length);

        /* merge selected nodes into buffer */
        num_nodes = NODES_PER_YIELD;
        UNLINK_NODES_OF(seg_dict->mod_tree,dev_region_t,node)
            {
            assert(node->links.node.struct_id == dev_region_id);
            nodes_done++;

            /* do monitoring */
            if (rvm_chk_len != 0)
                {
                temp = log_buf->offset;
                addr = (char *)CHOP_TO_SECTOR_SIZE(node->vmaddr);
                monitor_vmaddr(addr,log_buf->r_length,log_buf->buf,
                               &log_buf->offset,NULL,
                               "update_seg: data read from segment:");
                }

            /* merge data (merge_node frees the node) */
            if ((retval=merge_node(log,node,preload))
                != RVM_SUCCESS) goto err_exit;
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;
            if (node == last_node) break; /* window exhausted */
            }

        /* update the segment on disk */
        if ((r_length=write_dev(seg_dev,&log_buf->offset,log_buf->buf,
                                log_buf->r_length,rvm_true)) < 0)
            {
            retval = RVM_EIO;
            goto err_exit;
            }
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        assert(r_length == log_buf->r_length);
        /* do monitoring */
        if (rvm_chk_len != 0)
            {
            if (!RVM_OFFSET_EQL(temp,log_buf->offset))
                addr=RVM_ADD_LENGTH_TO_ADDR(addr,RVM_OFFSET_TO_LENGTH(
                                                RVM_SUB_OFFSETS(log_buf->offset,temp)));
            monitor_vmaddr(addr,log_buf->r_length,log_buf->buf,
                           &log_buf->offset,NULL,
                           "update_seg: data written to segment:");
            }
        }

    /* tree checks and cleanup after unlinking */
    assert(nodes_done == rvm_num_nodes);
    assert(seg_dict->mod_tree.n_nodes == 0);

  err_exit:
    if (!(log->in_recovery || rvm_utlsw)) /* end segment dev_lock crit sec */
        {
        mutex_unlock(&seg_dict->seg->dev_lock);
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        }
    return retval;
    }
2414 /* Recovery: phase 3 -- apply modifications to segments */
apply_mods(log)2415 static rvm_return_t apply_mods(log)
2416 log_t *log; /* log descriptor */
2417 {
2418 log_status_t *status = &log->status; /* status descriptor */
2419 seg_dict_t *seg_dict; /* current segment dictionary entry */
2420 device_t *seg_dev; /* segment device descriptor */
2421 rvm_return_t retval = RVM_SUCCESS; /* return value */
2422 long i; /* loop counter */
2423 rvm_length_t flags = O_RDWR;
2424
2425 assert(log->trunc_thread == cthread_self());
2426 assert((status->trunc_state & RVM_TRUNC_PHASES)
2427 == RVM_TRUNC_BUILD_TREE);
2428 status->trunc_state = (status->trunc_state & ~RVM_TRUNC_BUILD_TREE)
2429 | RVM_TRUNC_APPLY;
2430
2431 /* iterate through segment dictionary */
2432 for (i=0;i<log->seg_dict_len;i++)
2433 {
2434 seg_dict = &log->seg_dict_vec[i];
2435 assert(seg_dict->struct_id == seg_dict_id);
2436
2437 if (seg_dict->mod_tree.root == NULL)
2438 continue; /* no changes to this seg */
2439
2440 /* open device and get characteristics if necessary */
2441 if (log->in_recovery)
2442 {
2443 seg_dev = &seg_dict->dev;
2444 if (rvm_no_update) flags = O_RDONLY;
2445 if (open_dev(seg_dev,flags,0) < 0)
2446 return RVM_EIO;
2447 assert(log->trunc_thread == cthread_self());
2448 if (set_dev_char(seg_dev,&seg_dev->num_bytes) < 0)
2449 {
2450 close_dev(seg_dev);
2451 return RVM_EIO;
2452 }
2453 assert(log->trunc_thread == cthread_self());
2454 assert((status->trunc_state & RVM_TRUNC_PHASES)
2455 == RVM_TRUNC_APPLY);
2456 }
2457 else
2458 {
2459 assert(seg_dict->seg->links.struct_id == seg_id);
2460 seg_dev = &(seg_dict->seg->dev); /* already open */
2461 }
2462 log->cur_seg_dev = seg_dev;
2463
2464 /* read segment data and merge new values */
2465 if ((retval=update_seg(log,seg_dict,seg_dev))
2466 != RVM_SUCCESS) return retval;
2467 assert(log->trunc_thread == cthread_self());
2468 assert((status->trunc_state & RVM_TRUNC_PHASES)
2469 == RVM_TRUNC_APPLY);
2470
2471 /* close segment device if in recovery */
2472 if (log->in_recovery)
2473 if (close_dev(seg_dev) < 0)
2474 return RVM_EIO;
2475 }
2476
2477 /* re-protect buffer */
2478 /* MACH_RVM_PROTECT
2479 *
2480 * protect(log->log_buf.buf, log->log_buf.length, FALSE, VM_PROT_READ);
2481 */
2482
2483 return retval;
2484 }
/* Recovery: phase 4 -- update head/tail of log
 *
 * Commits the truncation by rewriting the log status block: clears the
 * previous-epoch pointers (or the whole status if the log is now
 * empty), installs the new first record number, and folds the timing
 * statistics gathered during this truncation into the histograms.
 * Holds log->dev_lock while touching the on-disk status.
 *
 * Transitions trunc_state APPLY --> UPDATE.  Returns RVM_SUCCESS,
 * RVM_EIO if gettimeofday fails, or the write_log_status error.
 */
static rvm_return_t status_update(log, new_1st_rec_num)
    log_t           *log;               /* log descriptor */
    rvm_length_t    new_1st_rec_num;    /* record number of new log head */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    struct timeval  end_time;           /* end of action time temp */
    int             kretval;            /* kernel (gettimeofday) return value */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */

    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_APPLY);
    status->trunc_state = (status->trunc_state & ~RVM_TRUNC_APPLY)
        | RVM_TRUNC_UPDATE;

    /* update the status block on disk */
    CRITICAL(log->dev_lock,             /* begin log device lock crit sec */
        {
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_UPDATE);
        status->prev_trunc = status->last_trunc;

        if (RVM_OFFSET_EQL(status->log_head,status->log_tail))
            clear_log_status(log);      /* log empty */
        else
            {
            /* previous epoch fully applied: forget it */
            RVM_ZERO_OFFSET(status->prev_log_head);
            RVM_ZERO_OFFSET(status->prev_log_tail);
            status->first_rec_num = new_1st_rec_num;
            }

        /* end timings; on clock failure skip straight to the status
           asserts -- kretval is tested again after the crit sec */
        kretval= gettimeofday(&end_time,(struct timezone *)NULL);
        if (kretval != 0) goto err_exit;
        end_time = sub_times(&end_time,&trunc_start_time);
        status->tot_truncation_time =
            add_times(&status->tot_truncation_time,&end_time);
        status->last_truncation_time = round_time(&end_time);
        enter_histogram(status->last_truncation_time,
                        log->status.tot_truncation_times,
                        truncation_times_vec,truncation_times_len);
        status->last_tree_build_time = last_tree_build_time;
        enter_histogram(last_tree_build_time,
                        log->status.tot_tree_build_times,
                        truncation_times_vec,truncation_times_len);
        status->last_tree_apply_time = last_tree_apply_time;
        enter_histogram(last_tree_apply_time,
                        log->status.tot_tree_apply_times,
                        truncation_times_vec,truncation_times_len);

        retval = write_log_status(log,NULL);
      err_exit:;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_UPDATE);
        });                             /* end log device lock crit sec */
    if (kretval != 0) return RVM_EIO;
    if (retval != RVM_SUCCESS) return retval;

    if (log->in_recovery && (!rvm_utlsw)) /* do recovery-only processing */
        {
        /* kill segment dictionary */
        free_seg_dict_vec(log);

        log->in_recovery = rvm_false;   /* crash recovery is complete */
        }

    return retval;
    }
2556 /* switch truncation epochs */
new_epoch(log,count)2557 static rvm_return_t new_epoch(log,count)
2558 log_t *log; /* log descriptor */
2559 rvm_length_t *count; /* ptr to statistics counter */
2560 {
2561 log_status_t *status = &log->status; /* log status descriptor */
2562 rvm_return_t retval = RVM_SUCCESS;
2563
2564 /* be sure last records in truncation are in log */
2565 assert(log->trunc_thread == cthread_self());
2566 if (sync_dev(&log->dev) < 0)
2567 return RVM_EIO;
2568 assert(log->trunc_thread == cthread_self());
2569
2570 /* count truncations & accumulate statistics */
2571 (*count)++;
2572 copy_log_stats(log);
2573
2574 /* set up head/tail pointers for truncation */
2575 status->prev_log_head = status->log_head;
2576 status->log_head = status->log_tail;
2577 status->prev_log_tail = status->log_tail;
2578 status->last_rec_num = status->next_rec_num-1;
2579
2580 /* set epoch time stamp and write status block */
2581 make_uname(&status->last_trunc);
2582 if ((retval=write_log_status(log,NULL)) != RVM_SUCCESS)
2583 return retval;
2584 assert(log->trunc_thread == cthread_self());
2585
2586 /* restore log segment definitions */
2587 retval = define_all_segs(log);
2588 assert(log->trunc_thread == cthread_self());
2589 return retval;
2590 }
2591
/* recover committed state from log
 *
 * Top-level truncation/recovery driver.  Under the truncation lock it:
 *   phase 1: locates the log tail (crash recovery only) and, if the
 *            log is non-empty, switches epochs (new_epoch);
 *   phase 2: builds the per-segment modification trees (build_tree);
 *   phase 3: applies the trees to the segments (apply_mods);
 *   phase 4: commits the new head/tail in the status block
 *            (status_update).
 *
 *   log        log descriptor
 *   count      statistics counter to increment per truncation
 *   is_daemon  true when called from the truncation daemon thread --
 *              triggers the flush_flag signalling protocol below
 *   flag       truncation type flag, installed in status->trunc_state
 *
 * Returns RVM_SUCCESS or the first phase error.  The X(...) calls are
 * compiled-out trace points (see the X macro defined earlier; #undef'd
 * just after this function).
 */
rvm_return_t log_recover(log,count,is_daemon,flag)
    log_t           *log;               /* log descriptor */
    rvm_length_t    *count;             /* ptr to statistics counter */
    rvm_bool_t      is_daemon;          /* true if called by daemon */
    rvm_length_t    flag;               /* truncation type flag */
    {
    log_status_t    *status = &log->status; /* log status descriptor */
    log_daemon_t    *daemon = &log->daemon; /* log daemon descriptor */
    struct timeval  end_time;           /* end of action time temp */
    struct timeval  tmp_time;           /* local timing temp */
    int             kretval;            /* kernel call return value */
    rvm_bool_t      do_truncation = rvm_false;
    rvm_return_t    retval = RVM_SUCCESS;
    rvm_length_t    new_1st_rec_num=0;
    X(start)
    CRITICAL(log->truncation_lock,      /* begin truncation lock crit sec */
        {
        /* capture truncation thread & flag for checking */
        assert(log->trunc_thread == (cthread_t)NULL);
        assert(status->trunc_state == ZERO);
        log->trunc_thread = cthread_self();
        status->trunc_state = flag;
        X(dev_lock)
        CRITICAL(log->dev_lock,         /* begin dev_lock crit sec */
            {
            /* process statistics */
            assert(log->trunc_thread == cthread_self());
            kretval= gettimeofday(&trunc_start_time,
                                  (struct timezone *)NULL);
            if (kretval != 0)
                {
                retval = RVM_EIO;
                goto err_exit1;
                }
            last_tree_build_time = 0;
            last_tree_apply_time = 0;
            X(in_recovery)
            /* phase 1: locate tail & start new epoch */
            if (log->in_recovery)
                {
                if ((retval=locate_tail(log)) != RVM_SUCCESS)
                    goto err_exit1;
                assert((status->trunc_state & RVM_TRUNC_PHASES)
                       == RVM_TRUNC_FIND_TAIL);
                }
            assert(log->trunc_thread == cthread_self());
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit1;
            /* see if truncation actually needed */
            if (RVM_OFFSET_EQL(status->log_tail,status->log_head))
                status->log_empty = rvm_true;
            else
                {
                status->log_empty = rvm_false;
                do_truncation = rvm_true;
                new_1st_rec_num = status->next_rec_num;

                /* switch epochs */
                if ((retval=new_epoch(log,count)) != RVM_SUCCESS)
                    goto err_exit1;
                assert(log->trunc_thread == cthread_self());
                }

            X(err_exit1)
          err_exit1:;
            /* signal `initiate_truncation' that the first part is done */
            if (is_daemon)
                {
                mutex_lock(&daemon->lock);
                assert(log->daemon.thread == cthread_self());
                assert(daemon->state == truncating);
                assert((status->trunc_state & RVM_ASYNC_TRUNCATE) != 0);
                condition_signal(&daemon->flush_flag);
                mutex_unlock(&daemon->lock);
                }
            });                         /* end dev_lock crit sec */

        if (retval != RVM_SUCCESS) goto err_exit;
        if (rvm_chk_sigint != NULL)     /* test for interrupt */
            if ((*rvm_chk_sigint)(NULL)) goto err_exit;
        /* do log scan if truncation actually needed */
        if (do_truncation)
            {
            X(do_trunc)
            /* build tree and time */
            kretval= gettimeofday(&tmp_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            X(build_tree)
            if ((retval=build_tree(log)) != RVM_SUCCESS) /* phase 2 */
                assert(0);              /* return retval; */
            X(build_tree done)
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_BUILD_TREE);

            kretval= gettimeofday(&end_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            end_time = sub_times(&end_time,&tmp_time);
            last_tree_build_time = round_time(&end_time);
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;

            /* apply tree and time */
            kretval= gettimeofday(&tmp_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            X(apply_mods)
            if ((retval=apply_mods(log)) != RVM_SUCCESS) /* phase 3 */
                goto err_exit;
            X(apply_mods end)
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            kretval= gettimeofday(&end_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            end_time = sub_times(&end_time,&tmp_time);
            last_tree_apply_time = round_time(&end_time);
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;
            }
        else
            /* nothing to truncate: jump straight to the APPLY phase so
               status_update's phase assertion holds */
            status->trunc_state =
                (status->trunc_state & ~RVM_TRUNC_PHASES)
                | RVM_TRUNC_APPLY;
        X(status_upd)
        /* always update the status */
        retval = status_update(log, new_1st_rec_num); /* phase 4 */
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_UPDATE);
        /* wake up any threads waiting on a truncation */
      err_exit:
        assert(log->trunc_thread == cthread_self());
        CRITICAL(daemon->lock,          /* begin daemon->lock crit sec */
            {
            assert(log->trunc_thread == cthread_self());
            if (is_daemon)
                {
                assert(log->daemon.thread == cthread_self());
                assert((status->trunc_state & RVM_ASYNC_TRUNCATE) != 0);
                assert(daemon->state == truncating);
                if (retval != RVM_SUCCESS)
                    daemon->state = error;
                }
            assert(log->trunc_thread == cthread_self());
            });                         /* end daemon->lock crit sec */

        /* release the truncation-thread claim taken at entry */
        log->trunc_thread = (cthread_t)NULL;
        status->trunc_state = ZERO;
        });                             /* end truncation lock crit sec */

    return retval;
    }
2745 #undef X
2746
2747
2748 /* rvm_truncate */
rvm_truncate()2749 rvm_return_t rvm_truncate()
2750 {
2751 rvm_return_t retval;
2752
2753 /* initial checks */
2754 if (bad_init())
2755 return RVM_EINIT;
2756 if (default_log == NULL)
2757 return RVM_ELOG;
2758
2759 /* flush any queued records */
2760 if ((retval=flush_log(default_log,
2761 &default_log->status.n_flush))
2762 != RVM_SUCCESS) return retval;
2763
2764 /* do truncation */
2765 retval = log_recover(default_log,
2766 &default_log->status.tot_rvm_truncate,
2767 rvm_false,RVM_TRUNCATE_CALL);
2768 return retval;
2769 }
2770
2771
2772 /* map & flush <--> truncation synchronization functions */
2773
2774 /* initiate asynchronous truncation */
initiate_truncation(log,threshold)2775 rvm_bool_t initiate_truncation(log,threshold)
2776 log_t *log; /* log descriptor */
2777 rvm_length_t threshold; /* log % truncation threshold */
2778 {
2779 log_daemon_t *daemon = &log->daemon; /* daemon control descriptor */
2780 rvm_bool_t did_init = rvm_false; /* true if initiated truncation */
2781
2782 /* test threshold for asynch truncation */
2783 if (!daemon->truncate || threshold < daemon->truncate)
2784 return rvm_false;
2785
2786 /* trigger a truncation if log at threshold */
2787 CRITICAL(daemon->lock, /* begin daemon->lock crit sec */
2788 {
2789 /* wake up daemon if idle */
2790 if (daemon->state == rvm_idle)
2791 {
2792 did_init = rvm_true;
2793 daemon->state = truncating;
2794 condition_signal(&daemon->code);
2795 condition_wait(&daemon->flush_flag,&daemon->lock);
2796 }
2797 }); /* end daemon->lock crit sec */
2798
2799 return did_init;
2800 }
/* wait_for_truncation -- block the caller until a truncation has
 * processed all log records written at or before *time_stamp.
 *
 * log:        log descriptor
 * time_stamp: time threshold; NULL means any completed truncation
 *             satisfies the wait
 *
 * If the truncation daemon is not in use (or is idle), falls back to a
 * synchronous truncation performed by the calling thread.  If the
 * daemon reported an error, returns RVM_EINTERNAL.  Otherwise keeps
 * re-triggering the daemon until status->last_trunc has caught up with
 * *time_stamp.
 *
 * NOTE(review): the gotos below jump only to labels inside the same
 * CRITICAL block, so the daemon lock is always released by the macro.
 */
rvm_return_t wait_for_truncation(log,time_stamp)
    log_t *log;                          /* log descriptor */
    struct timeval *time_stamp;          /* time threshold */
    {
    log_daemon_t *daemon = &log->daemon; /* daemon control descriptor */
    log_status_t *status = &log->status; /* log status descriptor */
    rvm_bool_t force_trunc = rvm_false;  /* do synchronous truncation */
    rvm_bool_t exit_sw = rvm_false;      /* true when the wait is satisfied */
    rvm_return_t retval = RVM_SUCCESS;

    while (!exit_sw)
        {
        CRITICAL(daemon->lock,           /* begin daemon lock crit sec */
            {
            /* synchronous truncation if daemon not in use */
            if ((daemon->truncate == 0) || (daemon->state == rvm_idle))
                {
                force_trunc = rvm_true;
                goto exit_wait;
                }

            /* wait for concurrent truncation completion; re-test the
               state after every wakeup (guards spurious wakeups) */
            while (daemon->state == truncating)
                {
                condition_wait(&daemon->wake_up,&daemon->lock);
                }
            if (daemon->state == error)
                {
                retval = RVM_EINTERNAL;  /* quit if daemon had error */
                goto exit_wait;
                }

            /* see if records up to time threshold have been processed */
            if ((time_stamp == NULL) ||
                (TIME_GEQ(status->last_trunc,*time_stamp)))
                goto exit_wait;          /* yes, exit */

            /* no, must trigger another truncation and loop to wait
               for it to complete */
            daemon->state = truncating;
            condition_signal(&daemon->code);
            goto exit_crit_sec;

            exit_wait: exit_sw = rvm_true;
            exit_crit_sec:;
            });                          /* end daemon lock crit sec */
        }

    /* do synchronous truncation in this thread (daemon unavailable) */
    if (force_trunc)
        retval = log_recover(log,&log->status.tot_sync_truncation,
                             rvm_false,RVM_SYNC_TRUNCATE);

    return retval;
    }
/* log_daemon -- truncation daemon thread body.
 *
 * arg: the log_t* this daemon serves (passed through the thread-start
 *      interface as void*).
 *
 * Loops forever: advertises itself idle (waking any waiters on
 * daemon->wake_up), sleeps on daemon->code until a request arrives,
 * then dispatches on the requested state.  Exits the thread on a
 * terminate request or on a truncation error.
 */
void log_daemon(void *arg)
    {
    log_t *log = arg;                    /* log descriptor */
    log_daemon_t *daemon = &log->daemon; /* daemon control descriptor */
    daemon_state_t state;                /* daemon state code */
    /* retval is unused when cthread_exit ignores its argument on some
       thread packages -- hence the attribute */
    rvm_return_t retval __attribute__((unused));

#ifdef RVM_USELWP
    PRE_Concurrent(1);
#endif

    DO_FOREVER
        {
        /* wait to be awakened by request */
        CRITICAL(daemon->lock,           /* begin daemon lock crit sec */
            {
            daemon->state = rvm_idle;
            /* tell wait_for_truncation et al. that we went idle */
            condition_broadcast(&daemon->wake_up);
            while (daemon->state == rvm_idle) {
                condition_wait(&daemon->code, &daemon->lock);
            }
            state = daemon->state;       /* end daemon lock crit sec */
            });

        /* process request */
        switch (state)
            {
          case truncating:               /* do a truncation */
            retval = log_recover(log,&log->status.tot_async_truncation,
                                 rvm_true,RVM_ASYNC_TRUNCATE);

            /* re-read the state: it may have changed while truncating */
            CRITICAL(daemon->lock, state = daemon->state);
            if (state == error)
                cthread_exit(retval);    /* error -- return code */
            if (state != terminate) break;
            /* fallthrough: terminate was requested during truncation */

          case terminate:
#ifdef RVM_USELWP
            daemon->thread = NULL;
#endif
            cthread_exit(RVM_SUCCESS);   /* normal exit */

          default: assert(rvm_false);    /* error */
            }
        }
    }
2903