1 /* BLURB lgpl
2 
3                            Coda File System
4                               Release 6
5 
6           Copyright (c) 1987-2016 Carnegie Mellon University
7                   Additional copyrights listed below
8 
9 This  code  is  distributed "AS IS" without warranty of any kind under
10 the  terms of the  GNU  Library General Public Licence  Version 2,  as
11 shown in the file LICENSE. The technical and financial contributors to
12 Coda are listed in the file CREDITS.
13 
14                         Additional copyrights
15                            none currently
16 
17 #*/
18 
19 /*
20 *
21 *                       RVM log recovery support
22 *
23 */
24 #include <unistd.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <fcntl.h>
28 #include <sys/file.h>
29 #include <sys/time.h>
30 #include <errno.h>
31 #include "rvm_private.h"
32 
33 #ifdef RVM_LOG_TAIL_BUG
34 #include <rvmtesting.h>
35 extern unsigned long *ClobberAddress;
36 #endif /* RVM_LOG_TAIL_BUG */
37 
38 /* global variables */
39 
40 extern log_t        *default_log;       /* default log descriptor ptr */
41 extern list_entry_t seg_root;           /* segment list */
42 extern rw_lock_t    seg_root_lock;      /* segment list lock */
43 extern rvm_bool_t   rvm_utlsw;          /* true if running in rvmutl */
44 extern char         *rvm_errmsg;        /* internal error message buffer */
45 
46 rvm_bool_t          rvm_no_yield = rvm_false; /* inhibit yields in recovery */
47 rvm_length_t        rvm_num_nodes;      /* number of nodes in change tree */
48 rvm_length_t        rvm_max_depth;      /* maximum depth of change tree */
49 
50 chk_vec_t           *rvm_chk_vec = NULL; /* monitor range vector */
51 rvm_length_t        rvm_chk_len = 0;    /* length of monitor range vector */
52 rvm_monitor_call_t  *rvm_monitor = NULL; /* call-back function ptr */
53 rvm_signal_call_t   *rvm_chk_sigint;    /* SIGINT test call (rvmutl only) */
54 rvm_length_t        truncation_times_vec[truncation_times_len]
55                                          = {truncation_times_dist};
56 rvm_bool_t          rvm_no_update;      /* no segment or log update if true */
57 rvm_bool_t          rvm_replay;         /* is replay if true */
58 rvm_bool_t          rvm_chk_sum;        /* force checksumming of all records */
59 rvm_bool_t          rvm_shadow_buf;     /* use shadow buffer */
60 
61 /* macros & locals */
62 
63 #ifndef ZERO
64 #define ZERO 0
65 #else
66 #endif
67 
68 /*static rvm_length_t     nv_local_max = NV_LOCAL_MAX;*/
69 static struct timeval   trunc_start_time;
70 static rvm_length_t     last_tree_build_time;
71 static rvm_length_t     last_tree_apply_time;
72 
73 #define NODES_PER_YIELD 1000000
74 static rvm_length_t num_nodes = NODES_PER_YIELD;
75 /* test if modification range will change monitored addresses */
76 /* nv_addr   - vm address */
77 /* nv_len    - length of vm range */
78 /* nv_data   - nv data in vm */
79 /* nv_offset - offset of data in log */
80 /* rec_hdr   - ptr to record header if not null */
81 /* msg       - invocation message */
monitor_vmaddr(char * nv_addr,rvm_length_t nv_len,char * nv_data,rvm_offset_t * nv_offset,rec_hdr_t * rec_hdr,char * msg)82 static void monitor_vmaddr(char *nv_addr, rvm_length_t nv_len,
83                            char *nv_data, rvm_offset_t *nv_offset,
84                            rec_hdr_t *rec_hdr, char *msg)
85 {
86     rvm_length_t    last_chk_addr;
87     rvm_length_t    last_nv_addr;
88     rvm_length_t    i;
89 
90     /* check monitored ranges for specified range */
91     for (i=0; i < rvm_chk_len; i++)
92     {
93         if (rvm_chk_sigint != NULL)
94             if ((*rvm_chk_sigint)(NULL)) return; /* test for interrupt */
95 
96         last_chk_addr = (rvm_length_t)RVM_ADD_LENGTH_TO_ADDR(
97                          rvm_chk_vec[i].vmaddr,rvm_chk_vec[i].length);
98         last_nv_addr =
99             (rvm_length_t)RVM_ADD_LENGTH_TO_ADDR(nv_addr,nv_len);
100 
101         if ((((rvm_length_t)rvm_chk_vec[i].vmaddr
102               >= (rvm_length_t)nv_addr)
103              && ((rvm_length_t)rvm_chk_vec[i].vmaddr < last_nv_addr))
104             ||  ((last_chk_addr > (rvm_length_t)nv_addr)
105              && (last_chk_addr < last_nv_addr))
106             ) {
107 
108             /* found modification, call print support */
109             if (nv_data != NULL)        /* check bytes offset */
110                 nv_data = RVM_ADD_LENGTH_TO_ADDR(nv_data,
111                               BYTE_SKEW(nv_addr));
112             (*rvm_monitor)((rvm_length_t)nv_addr,nv_len,nv_data,
113                            nv_offset,rec_hdr,i,msg);
114         }
115     }
116 
117     return;
118 }
119 /* allocate log recovery buffers */
120 char                *tst_buf;           /* debug temp */
alloc_log_buf(log)121 rvm_return_t alloc_log_buf(log)
122     log_t           *log;               /* log descriptor */
123     {
124     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
125 
126     if ((log_buf->buf=page_alloc(log_buf->length)) == NULL)
127         return RVM_ENO_MEMORY;
128 #ifdef SPECIAL_DEBUG
129     if ((log_buf->shadow_buf=page_alloc(log_buf->length)) == NULL)
130         return RVM_ENO_MEMORY;
131     if ((tst_buf=page_alloc(log_buf->length)) == NULL)
132         return RVM_ENO_MEMORY;
133 #endif /* SPECIAL_DEBUG */
134     log_buf->buf_len = RVM_MK_OFFSET(0,log_buf->length);
135 
136     if ((log_buf->aux_buf=page_alloc(log_buf->aux_length)) == NULL)
137         return RVM_ENO_MEMORY;
138 
139     /* write-protect the buffers */
140 /* I've taken out the mach-specific code, but it might be interesting to
141  * implement this feature on other systems using mprotect. Therefore I've
142  * `retained' the essence of the original code in this comment -- JH
143  *
144  * MACH_RVM_PROTECT
145  *
146  * protect(log_buf->buf,        log_buf->length,     FALSE, VM_PROT_READ);
147  *
148  * #ifdef SPECIAL_DEBUG
149  * protect(log_buf->shadow_buf, log_buf->length,     FALSE, VM_PROT_READ);
150  * protect(tst_buf,             log_buf->length,     FALSE, VM_PROT_READ);
151  * #endif SPECIAL_DEBUG
152  *
153  * protect(log_buf->aux_buf,    log_buf->aux_length, FALSE, VM_PROT_READ);
154  */
155 
156     return RVM_SUCCESS;
157     }
158 
159 /* free log recovery buffer */
free_log_buf(log)160 void free_log_buf(log)
161     log_t           *log;               /* log descriptor */
162     {
163     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
164 
165     if (log_buf->buf != NULL)
166         {
167         page_free(log_buf->buf,log_buf->length);
168         log_buf->buf = NULL;
169         log_buf->length = 0;
170         RVM_ZERO_OFFSET(log_buf->buf_len);
171         log_buf->ptr = -1;
172         }
173 
174     if (log_buf->aux_buf != NULL)
175         {
176         page_free(log_buf->aux_buf,log_buf->aux_length);
177         log_buf->aux_buf = NULL;
178         log_buf->aux_length = 0;
179         }
180     }
/* init log buffer with desired offset data from log
   loads the recovery buffer with data surrounding *offset and sets
   log_buf->ptr to index *offset within the buffer; returns RVM_EIO on
   device read failure, RVM_SUCCESS otherwise */
rvm_return_t init_buffer(log,offset,direction,synch)
    log_t           *log;               /* log descriptor */
    rvm_offset_t    *offset;            /* offset in log to load */
    rvm_bool_t      direction;          /* true ==> forward */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t    length;             /* length of buffer */
    rvm_offset_t    read_len;           /* read length calculation temp */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */

    /* offset must lie within the log device, and only the truncation
       thread may manipulate the recovery buffer */
    assert(RVM_OFFSET_GEQ(*offset,log->status.log_start));
    assert(RVM_OFFSET_LEQ(*offset,log->dev.num_bytes));
    assert(log->trunc_thread == cthread_self());

    /* calculate buffer read length and ptr; device i/o is done on whole
       sectors, so ptr starts as the byte index of *offset within its
       sector and is adjusted below for reverse fills */
    log_buf->ptr = OFFSET_TO_SECTOR_INDEX(*offset);
    if (direction == FORWARD)
        {                               /* forward: read from the sector
                                           containing *offset towards the
                                           end of the log device */
        log_buf->offset = CHOP_OFFSET_TO_SECTOR_SIZE(*offset);
        if (RVM_OFFSET_EQL(log_buf->offset,log->dev.num_bytes))
            read_len = log->status.log_size; /* at device end */
        else
            read_len = RVM_SUB_OFFSETS(log->dev.num_bytes,
                                       log_buf->offset);
        }
    else
        {                               /* reverse: read region ending at
                                           the sector boundary at or above
                                           *offset */
        log_buf->offset = ROUND_OFFSET_TO_SECTOR_SIZE(*offset);
        if (RVM_OFFSET_EQL(log_buf->offset,log->status.log_start))
            log_buf->offset = log->dev.num_bytes; /* wrap to device end */
        if (RVM_OFFSET_EQL(log_buf->offset,log->dev.num_bytes))
            read_len = log->status.log_size;
        else
            read_len = RVM_SUB_OFFSETS(log_buf->offset,
                                       log->status.log_start);
        }

    /* get actual length to read: at most one full buffer */
    if (RVM_OFFSET_GTR(read_len,log_buf->buf_len))
        length = log_buf->length;
    else
        length = RVM_OFFSET_TO_LENGTH(read_len);
    /* set offset of read for reverse fill: buffer ends at the offset
       computed above, and ptr indexes from the buffer's far end */
    if (direction == REVERSE)
        {
        log_buf->offset = RVM_SUB_LENGTH_FROM_OFFSET(log_buf->offset,
                                                     length);
        if (log_buf->ptr == 0)
            log_buf->ptr = length;
        else
            log_buf->ptr += (length-SECTOR_SIZE);
        }

    /* lock device & allow swap if necessary */
    if (synch)
        {
        if (!rvm_no_yield) cthread_yield();
        assert(log->trunc_thread == cthread_self());
        mutex_lock(&log->dev_lock); /* begin dev_lock crit sec */
        assert(log->trunc_thread == cthread_self());
        }

    /* allow write to buffer */
/* MACH_RVM_PROTECT
 *
 * protect(log_buf->buf, log_buf->length, FALSE,
 *         VM_PROT_WRITE | VM_PROT_READ);
 */

    /* read data from log device; r_length records how many bytes of the
       buffer now hold valid log data */
    if ((log_buf->r_length=read_dev(&log->dev,&log_buf->offset,
                                   log_buf->buf,length)) < 0)
        {
        retval = RVM_EIO;               /* i/o error */
        log_buf->r_length = 0;          /* buffer invalid */
        }
    assert(log->trunc_thread == cthread_self());

    /* write protect buffer & unlock */
/* MACH_RVM_PROTECT
 *
 * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_READ);
 *
 * #ifdef SPECIAL_DEBUG
 * / * re-read into shadow buffer & compare * /
 * if (rvm_shadow_buf)
 * {
 *     ret = vm_protect(task_self_,(vm_address_t)(log_buf->shadow_buf),
 *                      (vm_size_t)(log_buf->length),FALSE,
 *                      VM_PROT_WRITE | VM_PROT_READ);
 *     assert(ret == KERN_SUCCESS);
 *     if ((r_length=read_dev(&log->dev,&log_buf->offset,
 *                            log_buf->shadow_buf,length)) < 0)
 *     {
 *         retval = RVM_EIO;               / * i/o error * /
 *         assert(rvm_false);
 *     }
 *     assert(r_length == length);
 *     assert(r_length == log_buf->r_length);
 *     ret = vm_protect(task_self_,(vm_address_t)(log_buf->shadow_buf),
 *                      (vm_size_t)(log_buf->length),FALSE,VM_PROT_READ);
 *     assert(ret == KERN_SUCCESS);
 *     assert(memcmp(log_buf->buf,log_buf->shadow_buf,length) == 0);
 * }
 * #endif SPECIAL_DEBUG
 */

    if (synch)
        mutex_unlock(&log->dev_lock);   /* end dev_lock crit sec */
    assert(log->trunc_thread == cthread_self());

    return retval;
    }
296 /* refill buffer in scan direction */
refill_buffer(log,direction,synch)297 static rvm_return_t refill_buffer(log,direction,synch)
298     log_t           *log;               /* log descriptor */
299     rvm_bool_t      direction;          /* true ==> forward */
300     rvm_bool_t      synch;              /* true ==> synchronization required */
301     {
302     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
303     rvm_offset_t    offset;             /* new buffer offset temp */
304 
305     /* compute new offset for buffer fill */
306     offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
307 
308     /* fill the buffer */
309     return init_buffer(log,&offset,direction,synch);
310     }
311 /* compare buf & shadow buf from gdb */
312 #ifdef DEBUG_GDB
log_buf_cmp(disp)313 int log_buf_cmp(disp)
314     int             disp;
315     {
316     log_buf_t       *log_buf = &default_log->log_buf;
317     int             i;
318 
319     if (disp < 0) disp = 0;
320     for (i=disp;i<log_buf->r_length;i++)
321         if (log_buf->buf[i] != log_buf->shadow_buf[i])
322             return i;
323 
324     return -1;
325     }
326 
327 /* compare with disk */
disk_buf_cmp(buf,disp)328 int disk_buf_cmp(buf,disp)
329     char            *buf;
330     int             disp;
331     {
332     log_buf_t       *log_buf = &default_log->log_buf;
333     int             i;
334     int             r_length;
335 
336     /* allow write to buffer */
337 /* MACH_RVM_PROTECT
338  *
339  * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_WRITE | VM_PROT_READ);
340  */
341 
342     /* read buffer from log */
343     if ((r_length=read_dev(&default_log->dev,&log_buf->offset,
344                            tst_buf,log_buf->r_length)) < 0)
345         assert(rvm_false);          /* i/o error */
346     assert(r_length == log_buf->r_length);
347 
348     /* re-protect buffer */
349 /* MACH_RVM_PROTECT
350  *
351  * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_READ);
352  */
353 
354     /* compare results */
355     if (disp < 0) disp = 0;
356     for (i=disp;i<log_buf->r_length;i++)
357         if (buf[i] != tst_buf[i])
358             return i;
359 
360     return -1;
361     }
/* locate byte in buffer via gdb */
int find_byte(char chr, char *buf, int disp, int max_len)
{
    int i;

    /* scan forward from disp (clamped to 0); return the index of the
       first occurrence of chr before max_len, or -1 if absent */
    for (i = (disp < 0) ? 0 : disp; i < max_len; i++)
        if (buf[i] == chr)
            return i;

    return -1;
}
378 
379 /* locate word in buffer via gdb */
find_word(wrd,buf,disp,max_len)380 int find_word(wrd,buf,disp,max_len)
381     rvm_length_t    wrd;
382     rvm_length_t    *buf;
383     int             disp;
384     int             max_len;
385     {
386     int             i;
387 
388     if (disp < 0) disp = 0;
389     for (i=disp/sizeof(rvm_length_t);i<max_len/sizeof(rvm_length_t);i++)
390         if (wrd == buf[i])
391             return i;
392 
393     return -1;
394     }
395 
396 /* find word in log buffer via gdb */
find_buf_word(wrd,disp)397 int find_buf_word(wrd,disp)
398     rvm_length_t    wrd;
399     int             disp;
400     {
401     log_buf_t       *log_buf = &default_log->log_buf;
402 
403     return find_word(wrd, (rvm_length_t *)log_buf->buf,disp,log_buf->r_length);
404     }
405 #endif /* DEBUG_GDB */
/* load log auxiliary buffer
   makes up to `length' bytes at *log_offset available in aux_buf;
   *aux_ptr gets the index of the data within aux_buf (-1 if the offset
   is beyond the device, i.e. a partial record) and *data_len the number
   of bytes actually available; returns RVM_EIO on read failure */
rvm_return_t load_aux_buf(log,log_offset,length,aux_ptr,
                                 data_len,synch,pre_load)
    log_t           *log;               /* log descriptor */
    rvm_offset_t    *log_offset;        /* buffer read offset */
    rvm_length_t    length;             /* data length wanted */
    rvm_length_t    *aux_ptr;           /* ptr to aux. buf offset */
    rvm_length_t    *data_len;          /* ptr to actual data length read */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    rvm_bool_t      pre_load;           /* permit pre-loading of range */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t    high_offset;        /* end of read area */
    rvm_length_t    read_len;           /* buffer read length */
    rvm_return_t    retval = RVM_SUCCESS;

    assert(log->trunc_thread == cthread_self());

    /* check offset */
    if (RVM_OFFSET_GTR(*log_offset,log->dev.num_bytes))
        {
        *aux_ptr = -1;                  /* out of bounds -- partial record */
        return RVM_SUCCESS;
        }

    /* see if request is already in buffer (cache hit on the previously
       loaded region) */
    high_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->aux_offset,
                                           log_buf->aux_rlength);
    if ((RVM_OFFSET_GEQ(*log_offset,log_buf->aux_offset))
        && (RVM_OFFSET_LSS(*log_offset,high_offset)))
        {
        /* yes, have at least some of the data so report how much */
        *aux_ptr = RVM_OFFSET_TO_LENGTH(
                     RVM_SUB_OFFSETS(*log_offset,log_buf->aux_offset));
        read_len = RVM_OFFSET_TO_LENGTH(
                       RVM_SUB_OFFSETS(high_offset,*log_offset));
        if (read_len < length)
            *data_len = read_len;
        else
            *data_len = length;
        return RVM_SUCCESS;
        }

    /* if less than sector requested, see if pre-load permitted */
    if (pre_load && (length < SECTOR_SIZE))
        read_len = log_buf->aux_length; /* yes, fill buffer */
    else
        read_len = length;              /* no, just do what requested */
    /* determine length and offset for log read; device i/o must be
       sector-aligned at both ends */
    log_buf->aux_offset = CHOP_OFFSET_TO_SECTOR_SIZE(*log_offset);
    high_offset = RVM_ADD_LENGTH_TO_OFFSET(*log_offset,read_len);
    high_offset = ROUND_OFFSET_TO_SECTOR_SIZE(high_offset);
    if (RVM_OFFSET_GTR(high_offset,log->dev.num_bytes))
        high_offset = log->dev.num_bytes; /* don't read past end of log */

    /* report actual length read and ptr into buffer */
    read_len = RVM_OFFSET_TO_LENGTH(
                RVM_SUB_OFFSETS(high_offset,log_buf->aux_offset));
    *aux_ptr = OFFSET_TO_SECTOR_INDEX(*log_offset);
    if (read_len > log_buf->aux_length)
        {
        /* aligned region exceeds the buffer: clamp the read and report
           as much of the requested data as will fit */
        if ((read_len >= length)
            && (length <= (log_buf->aux_length-SECTOR_SIZE)))
            *data_len = length;
        else
            *data_len = log_buf->aux_length - *aux_ptr;
        read_len = log_buf->aux_length;
        }
    else
        *data_len = length;

    /* lock device and allow swap if necessary */
    if (synch)
        {
        if (!rvm_no_yield) cthread_yield(); /* allow swap now */
        assert(log->trunc_thread == cthread_self());
        mutex_lock(&log->dev_lock); /* begin dev_lock crit sec */
        assert(log->trunc_thread == cthread_self());
        }

    /* allow write to buffer */
/* MACH_RVM_PROTECT
 *
 * protect(log_buf->aux_buf, log_buf->aux_length, FALSE,
 *         VM_PROT_WRITE | VM_PROT_READ);
 */

    /* read new value data from log; aux_rlength records how much of
       aux_buf now holds valid data */
    if ((log_buf->aux_rlength=read_dev(&log->dev,&log_buf->aux_offset,
                 log_buf->aux_buf,read_len)) < 0)
        {
        retval = RVM_EIO;
        log_buf->aux_rlength = 0;
        }
    assert(log->trunc_thread == cthread_self());

    /* write protect buffer & unlock */
/* MACH_RVM_PROTECT
 *
 * protect(log_buf->aux_buf, log_buf->aux_length, FALSE, VM_PROT_READ);
 */

    if (synch)
        mutex_unlock(&log->dev_lock);   /* end dev_lock crit sec */
    assert(log->trunc_thread == cthread_self());

    return retval;
    }
514 
clear_aux_buf(log)515 void clear_aux_buf(log)
516     log_t           *log;               /* log descriptor */
517     {
518     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
519 
520     RVM_ZERO_OFFSET(log_buf->aux_offset);
521     log_buf->aux_rlength = 0;
522     }
523 /* record header type validation */
chk_hdr_type(rec_hdr)524 static rvm_bool_t chk_hdr_type(rec_hdr)
525     rec_hdr_t       *rec_hdr;           /* generic record header */
526     {
527     switch (rec_hdr->struct_id)
528         {
529       case trans_hdr_id:                /* transaction header */
530         return rvm_true;
531       case log_seg_id:                  /* log segment dictionary entry */
532         return rvm_true;
533       case log_wrap_id:                 /* log wrap-aound marker */
534         return rvm_true;
535       default:                          /* unknown header type */
536         return rvm_false;
537         }
538     }
539 
540 /* test if record belongs to currently valid part of log */
chk_hdr_currency(log,rec_hdr)541 rvm_bool_t chk_hdr_currency(log,rec_hdr)
542     log_t           *log;               /* log descriptor */
543     rec_hdr_t       *rec_hdr;           /* generic record header */
544     {
545     log_status_t    *status = &log->status; /* status descriptor */
546 
547     /* be sure record number makes sense */
548     if ((status->first_rec_num != 0) &&
549         (rec_hdr->rec_num < status->first_rec_num))
550         return rvm_false;               /* obsolete record */
551 
552     /* be sure record written after previous truncation & before this one */
553     if (TIME_LSS(rec_hdr->timestamp,status->prev_trunc)
554         || TIME_GTR(rec_hdr->timestamp,status->last_trunc))
555         return rvm_false;                   /* obsolete record */
556 
557     return rvm_true;
558     }
559 
reset_hdr_chks(log)560 void reset_hdr_chks(log)
561     log_t           *log;               /* log descriptor */
562     {
563     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
564 
565     log_buf->prev_rec_num = 0;
566     ZERO_TIME(log_buf->prev_timestamp);
567     }
568 /* test if record is out of sequence in log */
chk_hdr_sequence(log,rec_hdr,direction)569 rvm_bool_t chk_hdr_sequence(log,rec_hdr,direction)
570     log_t           *log;               /* log descriptor */
571     rec_hdr_t       *rec_hdr;           /* generic record header */
572     rvm_bool_t      direction;          /* scan direction */
573     {
574     log_buf_t       *log_buf = &log->log_buf; /* recovery buffer descriptor */
575 
576     /* check record number closely */
577     if ((log_buf->prev_rec_num != 0) &&
578         (((direction == FORWARD)
579           && (rec_hdr->rec_num != log_buf->prev_rec_num+1))
580          || ((direction == REVERSE)
581              && (rec_hdr->rec_num != log_buf->prev_rec_num-1))))
582         return rvm_false;                   /* sequence error */
583 
584     /* check record write time closely */
585     if ((!TIME_EQL_ZERO(log_buf->prev_timestamp)) &&
586         (((direction == FORWARD)
587           && TIME_LSS(rec_hdr->timestamp,log_buf->prev_timestamp))
588          || ((direction == REVERSE)
589              && TIME_GTR(rec_hdr->timestamp,log_buf->prev_timestamp))))
590         return rvm_false;                   /* sequence error */
591 
592     return rvm_true;
593     }
594 /* record header validation */
chk_hdr(log,rec_hdr,rec_end,direction)595 static rvm_bool_t chk_hdr(log,rec_hdr,rec_end,direction)
596     log_t           *log;               /* log descriptor */
597     rec_hdr_t       *rec_hdr;           /* generic record header */
598     rec_end_t       *rec_end;           /* generic record end marker */
599     rvm_bool_t      direction;          /* scan direction */
600     {
601 
602     /* be sure record type valid */
603     if (!chk_hdr_type(rec_hdr))
604         return rvm_false;
605 
606     /* checks for normal operation only */
607     if (!rvm_utlsw)
608         {
609         /* make sure record current */
610         if (chk_hdr_currency(log,rec_hdr) != rvm_true)
611             return rvm_false;               /* record obsolete */
612 
613         /* make sure record in proper sequence */
614         if (chk_hdr_sequence(log,rec_hdr,direction) != rvm_true)
615             return rvm_false;               /* sequence error */
616         }
617 
618     /* generic record head/end validation */
619     if ((rec_end != NULL) &&
620         ((rec_end->rec_hdr.struct_id != rec_end_id)
621         || (rec_hdr->struct_id != rec_end->rec_type)
622         || (rec_hdr->rec_num != rec_end->rec_hdr.rec_num)
623         || (rec_hdr->rec_length != rec_end->rec_hdr.rec_length)
624         || (!TIME_EQL(rec_hdr->timestamp,rec_end->rec_hdr.timestamp))))
625         return rvm_false;
626 
627     return rvm_true;
628     }
629 /* log record header validation */
validate_hdr(log,rec_hdr,rec_end,direction)630 rvm_bool_t validate_hdr(log,rec_hdr,rec_end,direction)
631     log_t           *log;               /* log descriptor */
632     rec_hdr_t       *rec_hdr;           /* generic record header */
633     rec_end_t       *rec_end;           /* generic record end marker */
634     rvm_bool_t      direction;          /* scan direction */
635     {
636     log_buf_t       *log_buf = &log->log_buf; /* recovery buffer descriptor */
637 
638     /* clear sequence checking hide-a-ways if direction reversed */
639     if (direction != log_buf->prev_direction)
640         reset_hdr_chks(log);
641 
642     /* do basic record header checks */
643     if (!chk_hdr(log,rec_hdr,rec_end,direction))
644         return rvm_false;               /* header invalid */
645 
646     /* type-specific validation */
647     switch (rec_hdr->struct_id)
648         {
649       case trans_hdr_id:                /* transaction header */
650         break;
651       case log_seg_id:                  /* log segment dictionary entry */
652         break;
653       case log_wrap_id:                 /* log wrap-aound marker */
654         goto exit;
655       default:                          /* unknown/improper header type */
656         return rvm_false;
657         }
658 
659     /* update buffer ptr and previous record state */
660     if (direction == FORWARD)           /* forward, return header position */
661         log_buf->ptr = (long)rec_hdr - (long)log_buf->buf;
662     else                                /* reverse, return end marker pos. */
663         log_buf->ptr = (long)rec_end - (long)log_buf->buf;
664 
665   exit:
666     log_buf->prev_rec_num = rec_hdr->rec_num;
667     log_buf->prev_timestamp = rec_hdr->timestamp;
668     log_buf->prev_direction = direction;
669 
670     return rvm_true;
671     }
672 /* get next new value range by forward scan of transaction record
673    ptr points to next range header
674    exits with as much of range in buffer as will fit */
scan_nv_forward(log,synch)675 rvm_return_t scan_nv_forward(log,synch)
676     log_t           *log;               /* log descriptor */
677     rvm_bool_t      synch;              /* true ==> synchronization required */
678     {
679     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
680     rvm_offset_t    offset;             /* offset calculation temp */
681     rec_hdr_t       *rec_hdr;           /* temporary cast for record header */
682     rvm_return_t    retval;             /* return value */
683 
684     /* see if new header is entirely within buffer */
685     if ((log_buf->ptr+sizeof(rec_hdr_t)) >= log_buf->r_length)
686         {
687         /* no, refill buffer */
688         offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
689                                           log_buf->ptr);
690         if ((retval=init_buffer(log,&offset,FORWARD,synch))
691             != RVM_SUCCESS) return retval;
692         }
693 
694     /* check header */
695     rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
696     switch (rec_hdr->struct_id)
697         {
698       case nv_range_id:     break;
699       case rec_end_id:      return RVM_SUCCESS;
700 
701       default:              return RVM_SUCCESS; /* need better reporting */
702         }
703 
704     /* get whole range in buffer */
705     if ((log_buf->ptr+rec_hdr->rec_length) > log_buf->r_length)
706         {
707         if ((retval=refill_buffer(log,FORWARD,synch))
708             != RVM_SUCCESS) return retval;
709         }
710 
711     return RVM_SUCCESS;
712     }
/* get previous new value range by reverse scan of transaction record
   ptr points to previous range header; exits with range in buffer
   (for large ranges only the header, not the data, is guaranteed
   resident) */
static rvm_return_t scan_nv_reverse(log,synch)
    log_t          *log;                /* log descriptor */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t       *rec_hdr;           /* temporary cast for record header */
    long            len=0;                /* back displacement to prev. hdr */
    rvm_offset_t    offset;             /* offset calculation temp */
    rvm_return_t    retval;             /* return value */

    /* get back-displacement to the previous sub-record from whichever
       structure the buffer ptr currently indexes */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
    {
      case rec_end_id:
        len = ((rec_end_t *)rec_hdr)->sub_rec_len;
        break;

      case nv_range_id:
        len = ((nv_range_t *)rec_hdr)->sub_rec_len;
        break;

      default:
        assert(rvm_false);               /* trouble -- log damage? */
    }

    /* see if new header is entirely within buffer */
    if ((log_buf->ptr-len) < 0)
        {
        /* no, refill buffer according to length of data */
        if ((len-sizeof(nv_range_t)) <= NV_LOCAL_MAX)
            {                           /* small, get data into buffer */
            if ((retval=refill_buffer(log,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= len;
            }
        else
            {                           /* large, skip data for now:
                                           position the buffer so only the
                                           nv_range header (not its data)
                                           need be resident */
            offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                         (log_buf->ptr+sizeof(nv_range_t)));
            offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,len);
            if ((retval=init_buffer(log,&offset,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= sizeof(nv_range_t);
            }
        }
    else log_buf->ptr -= len;
    /* exit pointing to new header: either the transaction header (scan
       complete) or another nv_range; anything else is log damage */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    if (rec_hdr->struct_id == trans_hdr_id)
        return RVM_SUCCESS;
    assert(rec_hdr->struct_id == nv_range_id);

    return RVM_SUCCESS;
    }
/* validate record in buffer in forward scan
   Checks that a plausible record starts at log_buf->ptr: the header
   must pass chk_hdr(), and the matching end marker -- read from the
   main buffer, or the aux buffer when the record exceeds the buffer
   capacity -- must pass validate_hdr().  On success log_buf->ptr is
   left addressing the record header; when no valid record follows,
   log_buf->ptr is set to -1.  Returns RVM_SUCCESS, or an error code
   from buffer i/o. */
static rvm_return_t validate_rec_forward(log,synch)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t       *rec_hdr;           /* temporary cast for next record hdr */
    rec_end_t       *rec_end = NULL;    /* temporary cast for record end */
    rvm_offset_t    end_offset;         /* temporary for calculating end */
    rvm_return_t    retval;
    long            tmp_ptr;
    rvm_length_t    tmp_len;

    /* see if next header is entirely within buffer */
    if ((log_buf->ptr + MAX_HDR_SIZE) > log_buf->r_length)
        {
        /* no, re-init buffer at the current log offset */
        end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                              log_buf->ptr);
        if ((retval=init_buffer(log,&end_offset,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        }

    /* check header type */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    if (rec_hdr->struct_id == log_wrap_id)
        goto validate;                  /* skip rec_end stuff for wrap
                                           (rec_end stays NULL) */
    if (!chk_hdr(log,rec_hdr,NULL,FORWARD))
        goto no_record;                 /* no next record */

    /* see if record will fit in buffer */
    if ((ROUND_TO_SECTOR_SIZE(rec_hdr->rec_length+sizeof(rec_end_t))
         + SECTOR_SIZE)
        <= log_buf->length)
        {
        /* yes, get whole record in buffer */
        if ((log_buf->ptr+rec_hdr->rec_length+sizeof(rec_end_t))
            > log_buf->length)
            {
            /* refill buffer; rec_hdr must be recomputed because the
               buffer contents shift */
            if ((retval=refill_buffer(log,FORWARD,synch))
                != RVM_SUCCESS) return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            }
        tmp_ptr = log_buf->ptr + rec_hdr->rec_length;
        rec_end = (rec_end_t *)&log_buf->buf[tmp_ptr];
        }
    else
        {
        /* no, won't fit -- read rec_end into aux buffer for validation */
        end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                         log_buf->ptr+rec_hdr->rec_length);

        /* check offset alignment to see if rec_hdr is trash */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(end_offset);
        if (tmp_ptr != CHOP_TO_LENGTH(tmp_ptr))
            goto no_record;             /* end marker alignment wrong */
        retval = load_aux_buf(log, &end_offset, sizeof(rec_end_t),
                              &tmp_ptr, &tmp_len, synch, rvm_false);
        if (retval != RVM_SUCCESS) return retval;
        if (tmp_ptr == -1)
            goto no_record;             /* record end not available */
        rec_end = (rec_end_t *)&log_buf->aux_buf[tmp_ptr];
        }

    /* validate whole record now that end is available */
  validate:
    if (validate_hdr(log,rec_hdr,rec_end,FORWARD))
        return RVM_SUCCESS;

  no_record:                            /* no next record */
    log_buf->ptr = -1;
    return RVM_SUCCESS;
    }
/* scan forward from present position at a record structure
   returns updated offset indexed by ptr; -1 ==> no next rec.
   log_buf->ptr must address a known record structure on entry; the
   pointer is advanced past that record (scanning all nv ranges for
   transactions, re-initializing the buffer at log start for wrap
   markers), and the record found there is then validated by
   validate_rec_forward. */
rvm_return_t scan_forward(log,synch)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t       *rec_hdr;           /* cast for next record hdr */
    rvm_return_t    retval;

    assert(log_buf->ptr != -1);         /* invalid position */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
        {
      case trans_hdr_id: case log_seg_id:
        /* skip whole record body plus its end marker at once */
        log_buf->ptr += (rec_hdr->rec_length+sizeof(rec_end_t));
        break;
      case rec_end_id:
        log_buf->ptr += sizeof(rec_end_t);
        break;
      case nv_range_id:                 /* scan past remaining ranges */
        DO_FOREVER
            {
            if ((retval=scan_nv_forward(log,synch)) != RVM_SUCCESS)
                return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            switch (rec_hdr->struct_id)
                {
              case nv_range_id:         /* more ranges: keep scanning */
                log_buf->ptr += rec_hdr->rec_length;
                break;
              case rec_end_id:          /* end of transaction */
                log_buf->ptr += sizeof(rec_end_t);
                goto trans_done;
              default:                  /* validate_rec_forward will handle */
                goto trans_done;
                }
            }
trans_done:
        break;
      case log_wrap_id:
        /* log wrapped around: resume scanning from log start */
        if ((retval=init_buffer(log,&log->status.log_start,
                                FORWARD,synch))
                != RVM_SUCCESS) return retval;
        break;
      default:
        if (rvm_utlsw)
            {
            log_buf->ptr = -1;          /* utility can handle unknown records */
            return RVM_SUCCESS;
            }
        assert(rvm_false);                  /* unknown record type */
        }

    /* validate next record */
    return validate_rec_forward(log,synch);
    }
/* scan for wrap marker
   Loads the last sectors of the log device and searches backward,
   word by word, for a log_wrap record.  On success log_buf->ptr
   addresses the validated wrap marker; on failure log_buf->ptr is
   set to -1 (rvmutl only) or the process asserts.
   NOTE(review): the K&R declarations below list synch before log;
   binding follows the identifier list (log,synch), so callers are
   unaffected -- but the ordering is confusing. */
rvm_return_t scan_wrap_reverse(log,synch)
    rvm_bool_t      synch;              /* true ==> synchronization required */
    log_t           *log;               /* log descriptor */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rec_hdr_t       *rec_hdr;           /* temporary cast for record header */
    log_wrap_t      *log_wrap;          /* temporary cast for wrap marker */
    long            tmp_ptr;            /* temporary buffer ptr */
    rvm_return_t    retval;

    /* load last sectors of log */
    if ((retval=init_buffer(log,&log->dev.num_bytes,
                            REVERSE,synch))
        != RVM_SUCCESS) return retval;

    /* scan for wrap marker */
    /* for the purpose of locating the wrap marker, we use the (duplicated)
       struct_id2 which, while positioned at the end of the record, guarantees
       that we must interpret it first, otherwise, we may possibly
       mis-interpret other field of the record to have a struct_id of
       log_wrap_id ! */
    for (tmp_ptr = (log_buf->ptr - sizeof(log_wrap_t));
         tmp_ptr >= 0; tmp_ptr -= sizeof(rvm_length_t))
        {
        log_wrap = (log_wrap_t *)&log_buf->buf[tmp_ptr];
        if (log_wrap->struct_id2 == log_wrap_id)
            {
            /* leading struct_id should agree unless running in rvmutl */
            assert( (log_wrap->rec_hdr.struct_id==log_wrap_id) || rvm_utlsw );
            /* XXXX fix this */
#if 0
            if (!((log_wrap->struct_id == log_wrap_id) || rvm_utlsw)) {
                printf("not true!\n");
                assert(0);
            }
#endif
            break;
            }
        }

    /* validate header if tmp_ptr legit */
    if ((tmp_ptr >= 0) && (tmp_ptr < log_buf->r_length))
        {
        log_buf->ptr = tmp_ptr;
        rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
        if (!validate_hdr(log,rec_hdr,NULL,REVERSE))
            log_buf->ptr = -1;
        }
    else
        /* no wrap marker found */
        if (rvm_utlsw)
            log_buf->ptr = -1;          /* utility can deal with it */
        else assert(rvm_false);

    return RVM_SUCCESS;
    }
/* validate current record in buffer in reverse scan
   Backs log_buf->ptr over the preceding end marker and checks that a
   complete, valid record precedes it: the end marker's struct_id must
   be rec_end_id and the matching header -- read from the main buffer,
   or the aux buffer when the record exceeds the buffer capacity --
   must pass validate_hdr().  At the physical start of the log the
   search is delegated to scan_wrap_reverse.  On success log_buf->ptr
   addresses the end marker; otherwise it is set to -1.
   NOTE(review): as with scan_wrap_reverse, the K&R declarations list
   synch before log but binding follows the identifier list (log,synch). */
rvm_return_t validate_rec_reverse(log,synch)
     rvm_bool_t      synch;              /* true ==> synchronization required */
     log_t           *log;               /* log descriptor */
{
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t    *status = &log->status; /* status area */
    rec_end_t       *rec_end = NULL;    /* temporary cast for record end */
    rec_hdr_t       *rec_hdr;           /* temporary cast for record header */
    long            tmp_ptr;            /* temporary buffer ptr */
    rvm_length_t    tmp_len;
    rvm_offset_t    offset;             /* temp for offset calculations */
    rvm_return_t    retval;

    /* get previous end marker into buffer */
    if ((long)(log_buf->ptr-sizeof(rec_end_t)) < 0)
        {
        offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                          log_buf->ptr);
        if (RVM_OFFSET_EQL(offset,status->log_start))
            {
            /* at physical log start: previous record is the wrap marker */
            retval=scan_wrap_reverse(log,synch);
            return retval;              /* exit pointing to wrap marker */
            }
        else
            {
            if ((retval=init_buffer(log,&offset,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            }
        }
    log_buf->ptr -= sizeof(rec_end_t);

    /* check new end marker */
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    if (rec_end->rec_hdr.struct_id != rec_end_id)
        goto no_record;             /* no next record */
    /* see if record will fit in buffer */
    if ((ROUND_TO_SECTOR_SIZE(rec_end->rec_hdr.rec_length+sizeof(rec_end_t))
        + SECTOR_SIZE) <= log_buf->length)
        {
        /* yes, get whole record in buffer */
        if ((long)(log_buf->ptr - rec_end->rec_hdr.rec_length) < 0)
            {
            /* refill buffer (be sure end marker is included) */
            log_buf->ptr += sizeof(rec_end_t);
            if ((retval=refill_buffer(log,REVERSE,synch))
                != RVM_SUCCESS) return retval;
            log_buf->ptr -= sizeof(rec_end_t);
            rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
            }
        tmp_ptr = log_buf->ptr - rec_end->rec_hdr.rec_length;
        rec_hdr = (rec_hdr_t *)&log_buf->buf[tmp_ptr];
        }
    else
        {
        /* no, save rec_end for validation & get header in aux. buffer */
        offset = RVM_SUB_LENGTH_FROM_OFFSET(log_buf->offset,
                                            rec_end->rec_hdr.rec_length);
        offset = RVM_ADD_LENGTH_TO_OFFSET(offset,log_buf->ptr);

        /* check offset alignment to see if rec_end is trash */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(offset);
        if (tmp_ptr != CHOP_TO_LENGTH(tmp_ptr))
            goto no_record;             /* header alignment wrong */
        retval = load_aux_buf(log, &offset, MAX_HDR_SIZE, &tmp_ptr, &tmp_len,
                              synch, rvm_false);
        if (retval != RVM_SUCCESS) return retval;
        if (tmp_ptr == -1)
            goto no_record;             /* record header not available */
        rec_hdr = (rec_hdr_t *)&log_buf->aux_buf[tmp_ptr];
        }

    /* validate whole record now that header is available */
    if (validate_hdr(log,rec_hdr,rec_end,REVERSE))
        return RVM_SUCCESS;

no_record:
    log_buf->ptr = -1;               /* no next record */
    return RVM_SUCCESS;
    }
/* scan backward from present position at a record structure
   returns index of offset in ptr; -1 ==> no next rec.
   Backs log_buf->ptr over the current record according to its type
   (scanning nv ranges in reverse for transactions, wrapping via
   scan_wrap_reverse at log start) and then validates the preceding
   record with validate_rec_reverse. */
rvm_return_t scan_reverse(log,synch)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t    *status = &log->status; /* status area */
    rec_hdr_t       *rec_hdr;           /* temporary cast for record header */
    rvm_offset_t    offset;             /* temp for offset calculations */
    rvm_return_t    retval;

    assert(log_buf->ptr != -1);         /* can't reposition from this! */

    /* test if scan starting from tail */
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    if (RVM_OFFSET_EQL(offset,status->prev_log_tail)
        || (rvm_utlsw && RVM_OFFSET_EQL(offset,status->log_tail)))
        return validate_rec_reverse(log,synch);

    /* test if at start of log & must wrap around */
    if ((RVM_OFFSET_EQL(log_buf->offset,status->log_start)) &&
        (log_buf->ptr == 0))
        {
        if ((retval=scan_wrap_reverse(log,synch)) != RVM_SUCCESS)
            return retval;
        return RVM_SUCCESS;             /* exit pointing to wrap marker */
        }

    /* move to previous record end marker */
    rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
    switch (rec_hdr->struct_id)
        {
      case trans_hdr_id: case log_seg_id:
      case log_wrap_id:
        break;
      case rec_end_id:
        if (((rec_end_t *)rec_hdr)->rec_type != trans_hdr_id)
            {                           /* record is always in buffer */
            log_buf->ptr -= rec_hdr->rec_length;
            break;
            }
        /* fallthrough: end of a transaction -- scan its ranges in reverse */
      case nv_range_id:                 /* scan past remaining ranges */
        DO_FOREVER
            {
            if ((retval=scan_nv_reverse(log,synch)) != RVM_SUCCESS)
                return retval;
            rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];
            if (rec_hdr->struct_id == trans_hdr_id)
                break;
            }
        break;
      default:
            {
            if (rvm_utlsw)
                {
                log_buf->ptr = -1;      /* utl can recover */
                return RVM_SUCCESS;
                }
            assert(rvm_false);          /* not at recognizable point in log */
            }
        }

    /* validate new record and set log_buf->ptr */
    return validate_rec_reverse(log,synch);
    }
1103 /* Recovery: phase 1 -- locate current log tail from last status block
1104      location */
1105 
1106 /* log_wrap status update for tail location */
set_wrap_status(status,rec_hdr)1107 static void set_wrap_status(status,rec_hdr)
1108     log_status_t    *status;            /* status descriptor */
1109     rec_hdr_t       *rec_hdr;           /* current record scanned in buffer */
1110     {
1111     status->wrap_time = rec_hdr->timestamp;
1112     status->n_special++;
1113     status->tot_wrap++;
1114     }
/* range checksum computation & check
   Advances log_buf->ptr past the nv_range header at the current
   position and check sums the range's new-value data, refilling the
   log buffer as many times as needed for large ranges.  Sets *chk_val
   to rvm_true iff the computed sum equals nv->chk_sum.  On exit
   log_buf->ptr is rounded up to the next length boundary past the
   data.  Returns RVM_SUCCESS or a buffer i/o error. */
static rvm_return_t range_chk_sum(log,nv,chk_val,synch)
    log_t           *log;               /* log descriptor */
    nv_range_t      *nv;                /* range header */
    rvm_bool_t      *chk_val;           /* result [out] */
    rvm_bool_t      synch;              /* true ==> synchronization required */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t    nv_chk_sum;         /* nv's check sum */
    rvm_length_t    chk_sum_temp = 0;   /* check sum temp */
    rvm_length_t    nv_length;          /* actual length of data */
    rvm_length_t    chk_length;         /* length of check summed range */
    rvm_length_t    align_skew;         /* initial alignment skew */
    rvm_return_t    retval;             /* return value */

    (*chk_val) = rvm_false;
    /* copy header fields before ptr moves past the header */
    nv_chk_sum = nv->chk_sum;
    nv_length = nv->length;
    align_skew = BYTE_SKEW(RVM_OFFSET_TO_LENGTH(nv->offset));
    log_buf->ptr += sizeof(nv_range_t);

    /* do checksum over as many buffer loads as needed */
    DO_FOREVER
        {
        /* sum whatever part of the data is in the current buffer load */
        chk_length = log_buf->r_length - log_buf->ptr - align_skew;
        if (chk_length > nv_length) chk_length = nv_length;
        chk_sum_temp +=
            chk_sum(&log_buf->buf[log_buf->ptr+align_skew],
                    chk_length);
        nv_length -= chk_length;
        log_buf->ptr += (chk_length+align_skew);
        if (nv_length == 0) break;  /* done */
        if ((retval=refill_buffer(log,FORWARD,synch))
            != RVM_SUCCESS) return retval;
        align_skew = 0;             /* following buffers have no padding */
        }
    log_buf->ptr = ROUND_TO_LENGTH(log_buf->ptr);

    /* report result */
    if (nv_chk_sum == chk_sum_temp)
        (*chk_val) = rvm_true;

    return RVM_SUCCESS;
    }
/* transaction validation & status update for tail location
   Scans forward over all nv ranges of the transaction whose header is
   at log_buf->ptr, verifying each range's data checksum, that each
   range belongs to this transaction, and that the expected number of
   ranges is present.  On success updates commit counters and
   last_uname in the log status.  On any inconsistency sets
   log_buf->ptr = -1 (an incomplete/corrupt transaction ends the tail
   search) and still returns RVM_SUCCESS; i/o errors are returned. */
static rvm_return_t set_trans_status(log,rec_hdr)
    log_t           *log;               /* log descriptor */
    rec_hdr_t        *rec_hdr;           /* current trans record in buffer */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    log_status_t    *status = &log->status;   /* status descriptor */
    trans_hdr_t     trans_hdr;          /* copy of header */
    long            num_ranges = 0;     /* range scan counter */
    nv_range_t      *nv;                /* range header */
    rvm_bool_t      chk_val;            /* checksum test result */
    rvm_return_t    retval;             /* return value */

    /* keep copy of header to get status if ranges are OK
       (the buffer copy can be overwritten by later refills) */
    BCOPY((char *)rec_hdr,(char *)&trans_hdr,sizeof(trans_hdr_t));

    /* scan and check sum all ranges */
    log_buf->ptr += sizeof(trans_hdr_t);
    DO_FOREVER
        {
        if ((retval=scan_nv_forward(log,NO_SYNCH)) != RVM_SUCCESS)
            return retval;
        rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
        if (rec_hdr->struct_id == rec_end_id)
            break;                      /* done */
        if (rec_hdr->struct_id != nv_range_id)
            goto bad_record;            /* invalid record */
        nv = (nv_range_t *)rec_hdr;
        if (trans_hdr.rec_hdr.rec_num != nv->rec_hdr.rec_num)
            goto bad_record;            /* wrong transaction */

        /* test range's data check sum */
        if ((retval=range_chk_sum(log,nv,&chk_val,NO_SYNCH))
            != RVM_SUCCESS) return retval;
        if (chk_val != rvm_true) goto bad_record; /* check sum failure */

        num_ranges++;
        }
    /* be sure all ranges are present */
    if (num_ranges != trans_hdr.num_ranges)
        goto bad_record;                /* incomplete */

    /* transaction complete, update status */
    status->last_uname = trans_hdr.uname;
    if (trans_hdr.flags & FLUSH_FLAG)
        status->n_flush_commit++;
    else status->n_no_flush_commit++;
    /* split transaction: first entry seen without its last entry */
    if (((trans_hdr.flags & FIRST_ENTRY_FLAG) != 0)
        && ((trans_hdr.flags & LAST_ENTRY_FLAG) == 0))
        status->n_split++;
    return RVM_SUCCESS;

bad_record:
    log_buf->ptr = -1;
    return RVM_SUCCESS;
    }
1215 /* Locate tail, update in-memory copy of status block; always reads forward */
locate_tail(log)1216 rvm_return_t locate_tail(log)
1217     log_t           *log;               /* log descriptor */
1218     {
1219     log_status_t    *status = &log->status;   /* status descriptor */
1220     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
1221     rvm_offset_t    tail;               /* tail offset */
1222     rvm_offset_t    temp_tail;          /* tail offset temp */
1223     rvm_length_t    last_rec_num = 0;   /* record number of tail record */
1224     rec_hdr_t       *rec_hdr;           /* current record scanned in buffer */
1225     struct timeval  save_last_trunc;
1226     struct timeval  last_write = status->last_write; /* last write to log */
1227     rvm_bool_t      save_rvm_utlsw = rvm_utlsw;
1228     rvm_return_t    retval = RVM_SUCCESS; /* return value */
1229 
1230     assert(log->trunc_thread == cthread_self());
1231     assert((status->trunc_state & RVM_TRUNC_PHASES) == ZERO);
1232     status->trunc_state |= RVM_TRUNC_FIND_TAIL;
1233 
1234     /* initialize scanner sequence checking state and buffers */
1235     rvm_utlsw = rvm_false;
1236     reset_hdr_chks(log);
1237     clear_aux_buf(log);
1238 
1239     /* if truncation caught in crash, reset head */
1240     if (!RVM_OFFSET_EQL_ZERO(status->prev_log_head))
1241         {
1242         status->log_head = status->prev_log_head;
1243         status->last_rec_num = status->next_rec_num-1;
1244         }
1245 
1246     /* set temporary timestamp for record validation */
1247     save_last_trunc = status->last_trunc;
1248     make_uname(&status->last_trunc);
1249     if (TIME_GTR(save_last_trunc,status->last_trunc))
1250         {                               /* date/time wrong! */
1251         retval = RVM_EINTERNAL;
1252         rvm_errmsg = ERR_DATE_SKEW;
1253         goto err_exit;
1254         }
1255 
1256     /* need to update status: init read buffer at head */
1257     if ((retval=init_buffer(log,&status->log_head,
1258                             FORWARD,NO_SYNCH))
1259         != RVM_SUCCESS) goto err_exit;
1260     assert(log->trunc_thread == cthread_self());
1261     assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
1262 
1263     /* validate 1st record, none ==> log empty */
1264     rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
1265     if (!validate_hdr(log,rec_hdr,NULL,FORWARD))
1266         {
1267 #ifdef RVM_LOG_TAIL_BUG
1268         unprotect_page__Fi(ClobberAddress);
1269 #endif /* RVM_LOG_TAIL_BUG */
1270 #ifdef RVM_LOG_TAIL_SHADOW
1271 	assert(RVM_OFFSET_EQL(log_tail_shadow,status->log_tail));
1272 #endif /* RVM_LOG_TAIL_SHADOW */
1273         status->log_tail = status->log_head;
1274 #ifdef RVM_LOG_TAIL_SHADOW
1275 	RVM_ASSIGN_OFFSET(log_tail_shadow,status->log_tail);
1276 #endif /* RVM_LOG_TAIL_SHADOW */
1277 #ifdef RVM_LOG_TAIL_BUG
1278         protect_page__Fi(ClobberAddress);
1279 #endif /* RVM_LOG_TAIL_BUG */
1280         clear_log_status(log);
1281         goto exit;
1282         }
1283     /* update status block head info if necessary */
1284     if (status->first_rec_num == 0)
1285         status->first_rec_num = rec_hdr->rec_num;
1286     if (TIME_EQL_ZERO(status->first_write))
1287         status->first_write = rec_hdr->timestamp;
1288     if (rec_hdr->struct_id == log_wrap_id)
1289         status->wrap_time = rec_hdr->timestamp;
1290 
1291     /* locate first transaction, if needed */
1292     if (TIME_EQL_ZERO(status->first_uname))
1293         do
1294             {
1295             /* update other status data */
1296             rec_hdr = (rec_hdr_t *)&(log_buf->buf[log_buf->ptr]);
1297             last_rec_num = rec_hdr->rec_num;
1298             status->last_write = rec_hdr->timestamp;
1299             if (rec_hdr->struct_id == log_wrap_id)
1300                 status->wrap_time = rec_hdr->timestamp;
1301 
1302             if (rec_hdr->struct_id == trans_hdr_id)
1303                 {                       /* transaction found */
1304                 status->first_uname = ((trans_hdr_t *)
1305                                        rec_hdr)->uname;
1306                 status->last_uname = ((trans_hdr_t *)
1307                                       rec_hdr)->uname;
1308                 break;
1309                 }
1310             if (rec_hdr->struct_id == log_wrap_id)
1311                 status->wrap_time = rec_hdr->timestamp;
1312             if ((retval=scan_forward(log,NO_SYNCH)) != RVM_SUCCESS)
1313                 goto err_exit;
1314             assert(log->trunc_thread == cthread_self());
1315             assert((status->trunc_state & RVM_TRUNC_PHASES)
1316                    == RVM_TRUNC_FIND_TAIL);
1317             if (rvm_chk_sigint != NULL) /* test for interrupt */
1318                 if ((*rvm_chk_sigint)(NULL)) goto err_exit;
1319             }
1320             while (log_buf->ptr != -1); /* tail found, no transactions */
1321 
1322     /* re-init scanner sequence checking state since small logs can cause
1323        a few records to be rescanned and re-init read buffer at tail
1324     */
1325     tail = status->log_tail;
1326     reset_hdr_chks(log);
1327     if ((retval=init_buffer(log,&tail,FORWARD,NO_SYNCH))
1328         != RVM_SUCCESS) goto err_exit;
1329     assert(log->trunc_thread == cthread_self());
1330     assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
1331     /* see if record at tail is valid, scan until bad record found */
1332     if ((retval=validate_rec_forward(log,NO_SYNCH)) != RVM_SUCCESS)
1333         goto err_exit;
1334     DO_FOREVER
1335         {
1336         if (log_buf->ptr == -1) break; /* tail located */
1337 
1338         /* compute provisional new tail offset, rec_num, timestamp */
1339         rec_hdr = (rec_hdr_t *)(&log_buf->buf[log_buf->ptr]);
1340         temp_tail = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
1341                         (log_buf->ptr+rec_hdr->rec_length
1342                         +sizeof(rec_end_t)));
1343         last_rec_num = rec_hdr->rec_num;
1344         last_write = rec_hdr->timestamp;
1345 
1346         /* type-specific status data recovery */
1347         switch (rec_hdr->struct_id)
1348             {
1349           case log_wrap_id:
1350             set_wrap_status(status,rec_hdr);
1351             tail = status->log_start;
1352             break;
1353 
1354           case trans_hdr_id:
1355             if ((retval=set_trans_status(log,rec_hdr)) != RVM_SUCCESS)
1356                 goto err_exit;
1357             assert(log->trunc_thread == cthread_self());
1358             assert((status->trunc_state & RVM_TRUNC_PHASES)
1359                    == RVM_TRUNC_FIND_TAIL);
1360             if (log_buf->ptr != -1)
1361                 tail = temp_tail;       /* update if trans OK */
1362             break;
1363 
1364           case log_seg_id:
1365             status->n_special++;
1366             tail = temp_tail;
1367             break;
1368 
1369           default:  assert(rvm_false);  /* error - should have header */
1370             }
1371 
1372         /* scan to next record */
1373         if (log_buf->ptr == -1) break; /* tail located */
1374         if ((retval=scan_forward(log,NO_SYNCH)) != RVM_SUCCESS)
1375             goto err_exit;
1376         assert(log->trunc_thread == cthread_self());
1377         assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
1378         if (rvm_chk_sigint != NULL)     /* test for interrupt */
1379             if ((*rvm_chk_sigint)(NULL)) goto err_exit;
1380         }
1381     /* tail found, update in-memory status */
1382 #ifdef RVM_LOG_TAIL_BUG
1383     unprotect_page__Fi(ClobberAddress);
1384 #endif /* RVM_LOG_TAIL_BUG */
1385 #ifdef RVM_LOG_TAIL_SHADOW
1386     assert(RVM_OFFSET_EQL(log_tail_shadow,status->log_tail));
1387 #endif /* RVM_LOG_TAIL_SHADOW */
1388     status->log_tail = tail;
1389 #ifdef RVM_LOG_TAIL_SHADOW
1390 	RVM_ASSIGN_OFFSET(log_tail_shadow,status->log_tail);
1391 #endif /* RVM_LOG_TAIL_SHADOW */
1392 #ifdef RVM_LOG_TAIL_BUG
1393     protect_page__Fi(ClobberAddress);
1394 #endif /* RVM_LOG_TAIL_BUG */
1395     status->last_write = last_write;
1396     if (RVM_OFFSET_EQL(status->log_head,status->log_tail))
1397         clear_log_status(log);          /* log empty */
1398     else
1399         {                               /* log not empty */
1400         status->log_empty = rvm_false;
1401 
1402         if (status->next_rec_num <= last_rec_num)
1403             status->next_rec_num = last_rec_num+1;
1404         if (status->last_rec_num != last_rec_num)
1405             status->last_rec_num = last_rec_num;
1406         }
1407 
1408 exit:
1409     status->valid = rvm_true;
1410 err_exit:
1411     rvm_utlsw = save_rvm_utlsw;
1412     status->last_trunc = save_last_trunc;
1413     assert(log->trunc_thread == cthread_self());
1414     assert((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL);
1415     return retval;
1416     }
1417 /* add segment short id to dictionary */
enter_seg_dict(log,seg_code)1418 rvm_return_t enter_seg_dict(log,seg_code)
1419     log_t           *log;
1420     long            seg_code;
1421     {
1422     seg_dict_t      *seg_dict;
1423     long            old_dict_size,new_dict_size;
1424 
1425     /* lengthen seg_dict_vec if necessary */
1426     if (log->seg_dict_len < seg_code)
1427         {
1428         new_dict_size = seg_code*sizeof(seg_dict_t);
1429         old_dict_size = log->seg_dict_len*sizeof(seg_dict_t);
1430         log->seg_dict_vec = (seg_dict_t *)
1431             REALLOC((char *)log->seg_dict_vec,new_dict_size);
1432         if (log->seg_dict_vec == NULL)
1433             return RVM_ENO_MEMORY;
1434         (void)BZERO((char *)((long)log->seg_dict_vec+old_dict_size),
1435                              new_dict_size-old_dict_size);
1436         log->seg_dict_len = seg_code;
1437         }
1438 
1439     /* enter in dictionary if not already defined */
1440     seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(seg_code)];
1441     if (seg_dict->struct_id != seg_dict_id)
1442         {
1443         seg_dict->struct_id = seg_dict_id;
1444         seg_dict->seg_code = seg_code;
1445         seg_dict->seg = NULL;
1446         init_tree_root(&seg_dict->mod_tree);
1447         (void)dev_init(&seg_dict->dev,NULL);
1448         }
1449     return RVM_SUCCESS;
1450     }
/* complete definition of seg_dict entry
   Given a log segment definition record, ensures a dictionary entry
   exists for its short id and, when the segment is not already mapped
   (seg_lookup fails), records the segment's device name and size for
   later opening.  Returns RVM_SUCCESS, RVM_ENO_MEMORY, or an error
   from enter_seg_dict. */
rvm_return_t def_seg_dict(log,rec_hdr)
    log_t           *log;               /* log descriptor */
    rec_hdr_t       *rec_hdr;           /* log segment definition descriptor
                                           (with log record header) */
    {
    log_seg_t       *log_seg;           /* log segment definition descriptor */
    seg_dict_t      *seg_dict;          /* segment dictionary entry */
    char            *seg_name;          /* ptr to segment name in seg_dict rec */
    device_t        *dev;               /* device descriptor */
    rvm_return_t    retval;

    assert(rec_hdr->struct_id == log_seg_id);
    /* log_seg body immediately follows the record header */
    log_seg = (log_seg_t *)RVM_ADD_LENGTH_TO_ADDR(rec_hdr,
                                                  sizeof(rec_hdr_t));

    /* create dictionary entry if necessary */
    if ((retval=enter_seg_dict(log,log_seg->seg_code)) != RVM_SUCCESS)
        return retval;
    seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(log_seg->seg_code)];

    /* if segment not defined, set device name (open later) */
    seg_name = (char *)((rvm_length_t)rec_hdr+LOG_SPECIAL_SIZE);
    seg_dict->seg = seg_lookup(seg_name,&retval);
    if (seg_dict->seg == NULL)
        {
        assert(log->in_recovery || rvm_utlsw);
        dev = &seg_dict->dev;
        /* assumes name_len >= strlen(seg_name) as written by the
           logger -- TODO(review): confirm against record writer */
        dev->name = malloc(log_seg->name_len+1);
        if (dev->name == NULL)
            return RVM_ENO_MEMORY;
        (void)strcpy(dev->name,seg_name);
        dev->num_bytes = log_seg->num_bytes;
        }

    return RVM_SUCCESS;
    }
1488 /* change tree comparator for tree_insert */
cmp_partial_include(node1,node2)1489 static long cmp_partial_include(node1,node2)
1490     dev_region_t    *node1;
1491     dev_region_t    *node2;
1492     {
1493     return dev_partial_include(&node1->offset,&node1->end_offset,
1494                                &node2->offset,&node2->end_offset);
1495     }
1496 
1497 /* set length of change tree node from offsets */
set_node_length(node)1498 static void set_node_length(node)
1499     dev_region_t    *node;              /* change tree node */
1500     {
1501     rvm_offset_t    offset_temp;        /* offset arithmetic temp */
1502 
1503     offset_temp = RVM_SUB_OFFSETS(node->end_offset,node->offset);
1504     assert(RVM_OFFSET_LEQ(offset_temp,node->end_offset)); /* overflow! */
1505     node->length = RVM_OFFSET_TO_LENGTH(offset_temp);
1506 
1507     }
/* insert a new-value node into seg_dict's modification tree (mod_tree)

   The tree is built while scanning the log in reverse (tail to head),
   so ranges already in the tree hold newer values that shadow node's
   values.  On overlap with an existing node, the shadowed portion of
   node is trimmed away and the remainder re-inserted; when node's
   values extend on both sides of the existing node, a second node is
   split off for the leading portion.  Zero-length and fully-shadowed
   nodes are freed.  Recursive: a trimmed node may overlap further
   tree nodes.  Returns RVM_SUCCESS, or RVM_ENO_MEMORY if a split
   node cannot be allocated. */
static rvm_return_t change_tree_insert(seg_dict,node)
    seg_dict_t      *seg_dict;          /* seg_dict for this nv */
    dev_region_t    *node;              /* change tree node for this nv */
    {
    dev_region_t    *x_node;            /* existing node if conflict */
    dev_region_t    *split_node;        /* ptr to created node, when used */
    rvm_length_t    log_diff;           /* adjustment to log/nv_buf offset */
    long            cmpval;             /* comparison return value */
    char            *shadow_vmaddr;     /* vmaddr of shadowed data */
    rvm_length_t    shadow_length = 0;  /* length of shadowed data */
    rvm_length_t    shadow_skew = 0;    /* byte skew of shadowed data */
    char            *shadow_ptr = NULL; /* ptr to shadowed data in vm */
    rvm_offset_t    shadow_offset;      /* offset of shadowed data in log */
    rvm_return_t    retval;

    /* try to insert node & see if values already there */
    if (node->length == 0) goto free_node; /* eliminate zero-length nodes */

    /* throttle: yield every NODES_PER_YIELD insertions, except during
       crash recovery, under rvmutl, or when yields are inhibited */
    if (num_nodes-- == 0)
        {
        num_nodes = NODES_PER_YIELD;
        if (!(default_log->in_recovery || rvm_utlsw))
            {
            if (!rvm_no_yield) cthread_yield(); /* allow reschedule */
            }
        }
    assert(default_log->trunc_thread == cthread_self());
    assert((default_log->status.trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);

    /* tree_insert succeeds only when node conflicts with nothing present */
    if (tree_insert(&seg_dict->mod_tree,node,cmp_partial_include))
        {
        if (rvm_chk_len != 0)           /* do monitoring */
            monitor_vmaddr(node->vmaddr,node->length,node->nv_ptr,
                           &node->log_offset,NULL,
                           "change_tree_insert: inserting entire range");
        return RVM_SUCCESS;             /* no shadowed values */
        }
    x_node = (dev_region_t *)           /* get existing node */
        (seg_dict->mod_tree.traverse[seg_dict->mod_tree.level].ptr);

    /* some values already there: test existing node spans new */
    if (dev_total_include(&node->offset,&node->end_offset,
                          &x_node->offset,&x_node->end_offset) == 0)
        {
        if (rvm_chk_len != 0)           /* do monitoring */
            monitor_vmaddr(node->vmaddr,node->length,NULL,NULL,NULL,
                           "change_tree_insert: all values shadowed");
        goto free_node;                 /* yes, all values shadowed */
        }
    /* some shadowed, test if new values span existing node */
    if ((cmpval=dev_total_include(&x_node->offset,&x_node->end_offset,
                          &node->offset,&node->end_offset)) == 0)
        if (RVM_OFFSET_LSS(node->offset,x_node->offset))
            {                           /* make node for preceeding values */
            if ((split_node=make_dev_region()) == NULL)
                return RVM_ENO_MEMORY;
            /* share node's in-vm value buffer by reference, if present */
            if (node->nv_buf != NULL)
                {
                assert(RVM_OFFSET_EQL_ZERO(node->log_offset));
                assert(node->nv_buf->struct_id == nv_buf_id);
                split_node->nv_buf = node->nv_buf;
                node->nv_buf->ref_cnt++;
                split_node->nv_ptr = node->nv_ptr;
                }
            else
                assert(node->nv_ptr == NULL);

            /* complete the new node */
            split_node->offset = node->offset;
            split_node->end_offset = x_node->offset;
            split_node->log_offset = node->log_offset;
            split_node->vmaddr = node->vmaddr;
            set_node_length(split_node);
            /* advance node past the split-off leading range */
            node->vmaddr += split_node->length;
            node->offset = RVM_ADD_LENGTH_TO_OFFSET(node->offset,
                                                  split_node->length);
            log_diff = split_node->length +
                BYTE_SKEW(RVM_OFFSET_TO_LENGTH(split_node->offset));

            /* adjust node's data source: vm buffer ptr or log offset */
            if (node->nv_ptr != NULL)
                node->nv_ptr = (char *)CHOP_TO_LENGTH(
                        RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,log_diff));
            else
                node->log_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
                    RVM_ADD_LENGTH_TO_OFFSET(split_node->log_offset,
                                             log_diff));

            /* insert split node in tree */
            if (rvm_chk_len != 0)       /* do monitoring */
                monitor_vmaddr(split_node->vmaddr,split_node->length,
                               NULL,NULL,NULL,
                               "change_tree_insert: inserting split node");
            if ((retval=change_tree_insert(seg_dict,split_node))
                != RVM_SUCCESS) return retval;
            }
    /* test if new values follow existing node */
    shadow_skew = BYTE_SKEW(RVM_OFFSET_TO_LENGTH(node->offset));
    if (cmpval <= 0)
        {
        /* yes, reset starting offset */
        shadow_vmaddr = node->vmaddr;
        shadow_length = RVM_OFFSET_TO_LENGTH(
                   RVM_SUB_OFFSETS(x_node->end_offset,node->offset));
        shadow_ptr = node->nv_ptr;
        shadow_offset = node->log_offset;
        node->offset = x_node->end_offset;
        set_node_length(node);
        if (node->nv_ptr != NULL)       /* adjust buffer pointer */
            node->nv_ptr = (char *)CHOP_TO_LENGTH(
                                RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,
                                           shadow_length+shadow_skew));
        else                            /* adjust log offset */
            node->log_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
                RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
                                         shadow_length+shadow_skew));
        node->vmaddr = RVM_ADD_LENGTH_TO_ADDR(node->vmaddr,
                                              shadow_length);
        }
    else
        /* new values preceed existing node, but don't span it */
        {                               /* reset end offset */
        node->end_offset = x_node->offset;
        shadow_length = node->length;   /* save old length */
        set_node_length(node);
        shadow_length -= node->length;  /* correct for new length */
        shadow_vmaddr = RVM_ADD_LENGTH_TO_ADDR(node->vmaddr,
                                               node->length);
        if (node->nv_ptr != NULL)
            shadow_ptr = (char *)CHOP_TO_LENGTH(
                            RVM_ADD_LENGTH_TO_ADDR(node->nv_ptr,
                                         shadow_length+shadow_skew));
        shadow_offset = CHOP_OFFSET_TO_LENGTH_SIZE(
                            RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
                                          shadow_length+shadow_skew));
        }
    /* insert modified node */
    if (rvm_chk_len != 0)               /* do monitoring */
        {
        if (shadow_length != 0)
            monitor_vmaddr(shadow_vmaddr,shadow_length,shadow_ptr,
                           &shadow_offset,NULL,
                           "change_tree_insert: values shadowed");
        monitor_vmaddr(node->vmaddr,node->length,NULL,NULL,NULL,
                       "change_tree_insert: inserting non-shadowed values");
        }
    /* recurse: trimmed node may still overlap other tree nodes */
    return change_tree_insert(seg_dict,node);

free_node:
    free_dev_region(node);
    return RVM_SUCCESS;
    }
/* prepare new value record for seg_dict's mod_tree
   if new values are <= nv_local_max, they must be in buffer

   Builds a dev_region_t for the nv range and inserts it in the segment's
   change tree.  Ranges <= NV_LOCAL_MAX are copied out of the recovery
   buffer into a reference-counted vm buffer (nv_buf); larger ranges are
   recorded by their log offset only and read back later.  When
   rvm_chk_sum is set, the range checksum is also verified. */
static rvm_return_t do_nv(log,nv)
    log_t           *log;               /* log descriptor */
    nv_range_t      *nv;                /* nv range record in recovery buffer */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    seg_dict_t      *seg_dict;          /* seg_dict for this nv */
    dev_region_t    *node;              /* change tree node for this nv */
    rvm_length_t    aligned_len;        /* allocation temp */
    rvm_offset_t    offset;             /* monitoring temp */
    rvm_bool_t      chk_val;            /* checksum result */
    rvm_return_t    retval;             /* return value */

    /* must run in the truncation thread, in the tree-build phase */
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    assert(nv->rec_hdr.struct_id == nv_range_id);  /* not a nv range header */
    assert(TIME_EQL(log_buf->timestamp,nv->rec_hdr.timestamp));

    if (rvm_chk_len != 0)               /* do monitoring */
        {
        /* log offset of the range's data, just past its header */
        offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                    log_buf->ptr+sizeof(nv_range_t));
        monitor_vmaddr(nv->vmaddr, nv->length, NULL, &offset,
                       &nv->rec_hdr, "do_nv: data from log");
        }

    if (nv->length == 0) return RVM_SUCCESS; /* ignore null changes */

    /* be sure in segment dictionary */
    if ((retval=enter_seg_dict(log,nv->seg_code)) != RVM_SUCCESS)
        return retval;
    seg_dict = &log->seg_dict_vec[SEG_DICT_INDEX(nv->seg_code)];

    /* make a tree node for changes */
    if ((node = make_dev_region()) == NULL) return RVM_ENO_MEMORY;
    node->offset = nv->offset;
    node->end_offset = RVM_ADD_LENGTH_TO_OFFSET(nv->offset,nv->length);
    node->length = nv->length;
    node->vmaddr = nv->vmaddr;
    /* see if mods small enough to keep in vm */
    if (nv->length <= NV_LOCAL_MAX)
        {                               /* yes, get some space for nv */
        aligned_len = ALIGNED_LEN(RVM_OFFSET_TO_LENGTH(nv->offset),
                                  nv->length);
        if ((node->nv_buf=(nv_buf_t *)malloc(NV_BUF_SIZE(aligned_len)))
            == NULL) return RVM_ENO_MEMORY;
        node->nv_buf->struct_id = nv_buf_id;
        node->nv_buf->alloc_len = NV_BUF_SIZE(aligned_len);
        node->nv_buf->ref_cnt = 1;      /* shared by reference if node split */
        node->nv_buf->chk_sum = nv->chk_sum;
        node->nv_buf->data_len = nv->length;
        node->nv_ptr = (char *)&node->nv_buf->buf;
        /* copy source must lie entirely within the recovery buffer */
        assert(((rvm_length_t)nv+sizeof(nv_range_t))
               >= (rvm_length_t)default_log->log_buf.buf);
        assert(((rvm_length_t)nv+sizeof(nv_range_t))
               < ((rvm_length_t)default_log->log_buf.buf
                  +default_log->log_buf.r_length));

        /* basic BCOPY will not change alignment since buffer padded */
        (void)BCOPY(RVM_ADD_LENGTH_TO_ADDR(nv,sizeof(nv_range_t)),
                    node->nv_ptr,aligned_len);
        }
    else
        /* no, set offset in log for nv's */
        node->log_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                 (rvm_length_t)nv-(rvm_length_t)log_buf->buf
                                 +sizeof(nv_range_t));

    /* put in change tree */
    if ((retval=change_tree_insert(seg_dict,node)) != RVM_SUCCESS)
        return retval;

    /* see if complete check sum test wanted */
    if (rvm_chk_sum)
        {
        if ((retval=range_chk_sum(log,nv,&chk_val,SYNCH))
            != RVM_SUCCESS) return retval;
        assert(chk_val == rvm_true);        /* check sum failure */
        /* NOTE(review): this scan_nv_reverse appears to re-establish the
           reverse-scan buffer position after range_chk_sum -- confirm */
        if ((retval=scan_nv_reverse(log,SYNCH)) != RVM_SUCCESS)
            return retval;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
        }

    return RVM_SUCCESS;
    }
/* scan modifications of transaction in reverse order & build tree

   On entry log_buf->ptr addresses the transaction's end record; each
   nv range is scanned backward until the transaction header is reached.
   Ranges are handed to do_nv() unless skip_trans is true, in which case
   the records are scanned but otherwise ignored.  Range numbers must
   descend by one to 1, and the total must match the header's
   num_ranges. */
static rvm_return_t do_trans(log,skip_trans)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      skip_trans;         /* scan, but ignore if true */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */

    rec_hdr_t       *rec_hdr;           /* last record header scanned */
    rec_end_t       *rec_end;           /* end marker for transaction */
    trans_hdr_t     *trans_hdr;         /* transaction header ptr */
    long            num_ranges = 0;     /* ranges processed */
    long            prev_range = 0;     /* previous range number */
    rvm_return_t    retval;             /* return value */

    /* must run in the truncation thread, in the tree-build phase */
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);

    /* remember the transaction's timestamp and scan ranges */
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    assert(rec_end->rec_hdr.struct_id == rec_end_id);
    log_buf->timestamp = rec_end->rec_hdr.timestamp;
    DO_FOREVER
        {
        if ((retval=scan_nv_reverse(log,SYNCH)) != RVM_SUCCESS)
            return retval;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
        rec_hdr = (rec_hdr_t *)&log_buf->buf[log_buf->ptr];

        /* test for end */
        if (rec_hdr->struct_id == trans_hdr_id)
            break;                      /* done */

        /* check order and process the range */
        assert(rec_hdr->struct_id == nv_range_id);
        if (prev_range != 0)            /* numbers must descend by one */
            assert(((nv_range_t *)rec_hdr)->range_num == (prev_range-1));
        if (!skip_trans)
            if ((retval=do_nv(log,(nv_range_t *)rec_hdr))
                != RVM_SUCCESS) return retval;

        /* tally ranges processed */
        num_ranges++;
        prev_range = ((nv_range_t *)rec_hdr)->range_num;
        }

    /* sanity checks at the end... */
    trans_hdr = (trans_hdr_t *)rec_hdr;
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    assert(TIME_EQL(trans_hdr->rec_hdr.timestamp,log_buf->timestamp));
    assert(trans_hdr->num_ranges == num_ranges);
    if (num_ranges != 0) assert(prev_range == 1); /* last range seen is #1 */

    return RVM_SUCCESS;
    }
/* log wrap-around validation

   Called with log_buf->ptr at a transaction's end marker.  If the
   transaction's header lies at the start of the log data area, or
   force_wrap_chk is set, the header is located (possibly via the aux
   buffer) and checked for a split across the log's wrap point.  For a
   split transaction, both pieces are validated: the wrap marker and
   the first piece's header must carry consecutive record numbers and
   the same uname.  On a bad split under force_wrap_chk, *skip_trans is
   set true so the caller skips the record.  When a back-scan was
   needed, the buffer is re-initialized at the transaction's end marker
   before returning and split_ok is set for the next pass. */
static rvm_return_t chk_wrap(log,force_wrap_chk,skip_trans)
    log_t           *log;               /* log descriptor */
    rvm_bool_t      force_wrap_chk;     /* wrap check required if true */
    rvm_bool_t      *skip_trans;        /* set true if bad split */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_offset_t    offset;             /* offset temp */
    rvm_offset_t    end_offset;         /* offset of last trans end marker */
    rec_end_t       *rec_end;           /* last record scanned in buffer */
    trans_hdr_t     last_trans_hdr;     /* last transaction record header */
    trans_hdr_t     *trans_hdr;         /* header temporary */
    log_wrap_t      *log_wrap;          /* wrap-around marker */
    long            tmp_ptr;            /* buffer index temp */
    long            data_len;           /* length temporary */
    rvm_return_t    retval;             /* return value */

    *skip_trans = rvm_false;
    /* compute log offset of the transaction's header from its end marker */
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,rec_end->rec_hdr.rec_length);

    /* check if transaction header is at start of log data area */
    if (!RVM_OFFSET_EQL(offset,status->log_start) && (!force_wrap_chk))
        return RVM_SUCCESS;             /* no, nothing more needed */

    /* get header */
    if (force_wrap_chk)
        {
        /* header can be anywhere */
        if (RVM_OFFSET_LSS(offset,log_buf->offset))
            {
            /* header precedes the recovery buffer: read via aux buffer */
            retval = load_aux_buf(log,&offset,sizeof(trans_hdr_t),
                                  &tmp_ptr,&data_len,SYNCH,rvm_false);
            if (retval != RVM_SUCCESS) return retval;
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_BUILD_TREE);
            assert(data_len >= sizeof(trans_hdr_t));
            trans_hdr = (trans_hdr_t *)&log_buf->aux_buf[tmp_ptr];
            }
        else
            trans_hdr = (trans_hdr_t *)&log_buf->buf[log_buf->ptr
                                                -rec_end->rec_hdr.rec_length];
        }
    else
        /* header is at start of aux_buf or recovery buffer */
        if (RVM_OFFSET_LSS(offset,log_buf->offset))
            trans_hdr = (trans_hdr_t *)log_buf->aux_buf;
        else
            trans_hdr = (trans_hdr_t *)log_buf->buf;

    /* check for split transaction */
    /* NOTE(review): TRANS_HDR(flag) presumably tests flag bits of
       trans_hdr -- macro defined elsewhere, confirm semantics */
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    if (TRANS_HDR(FIRST_ENTRY_FLAG)
        && TRANS_HDR(LAST_ENTRY_FLAG))
        return RVM_SUCCESS;             /* not split, nothing more needed */

    /* split, see if must check further or skip record */
    assert(TRANS_HDR(FIRST_ENTRY_FLAG) || TRANS_HDR(LAST_ENTRY_FLAG));
    if (!TRANS_HDR(LAST_ENTRY_FLAG))
        {
        if (log_buf->split_ok)
            {                           /* split previously checked */
            log_buf->split_ok = rvm_false;
            return RVM_SUCCESS;
            }
        if (force_wrap_chk)             /* if not last entry, trans not good */
            {
            *skip_trans = rvm_true;
            return RVM_SUCCESS;
            }
        }

    /* must make local copy and scan for first record of transaction */
    end_offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,
                                (log_buf->ptr+sizeof(rec_end_t)));
    (void)BCOPY(trans_hdr,&last_trans_hdr,sizeof(trans_hdr_t));
    if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
        return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);

    /* wrap-around had better be next... */
    assert((long)log_buf->ptr >= 0);
    log_wrap = (log_wrap_t *)&log_buf->buf[log_buf->ptr];
    assert(log_wrap->rec_hdr.struct_id == log_wrap_id);
    assert(log_wrap->rec_hdr.rec_num == (last_trans_hdr.rec_hdr.rec_num-1));

    /* now scan for first record of transaction */
    if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
        return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    assert((long)log_buf->ptr >= 0);
    rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
    assert(rec_end->rec_hdr.struct_id == rec_end_id);
    /* check if the header is the first record of last transaction */
    offset = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
    offset = RVM_SUB_LENGTH_FROM_OFFSET(offset,rec_end->rec_hdr.rec_length);
    if (RVM_OFFSET_LSS(offset,log_buf->offset))
        {
        /* header is in aux_buf */
        tmp_ptr = OFFSET_TO_SECTOR_INDEX(offset);
        trans_hdr = (trans_hdr_t *)&log_buf->aux_buf[tmp_ptr];
        }
    else
        {
        /* header is in recovery buffer */
        tmp_ptr = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(offset,
                                           log_buf->offset));
        assert(tmp_ptr >= 0);
        trans_hdr = (trans_hdr_t *)&log_buf->buf[tmp_ptr];
        }

    /* sanity checks... */
    assert(trans_hdr->rec_hdr.struct_id == trans_hdr_id);
    assert(TRANS_HDR(FIRST_ENTRY_FLAG));
    assert(TIME_EQL(trans_hdr->uname,last_trans_hdr.uname));
    assert(trans_hdr->rec_hdr.rec_num == (last_trans_hdr.rec_hdr.rec_num-2));

    /* all is well, restore last transaction record */
    log_buf->prev_rec_num = 0;
    ZERO_TIME(log_buf->prev_timestamp);
    if ((retval=init_buffer(log,&end_offset,REVERSE,SYNCH))
        != RVM_SUCCESS) return retval;
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    /* reposition at the end marker; mark split as already validated */
    log_buf->ptr -= sizeof(rec_end_t);
    log_buf->split_ok = rvm_true;

    return RVM_SUCCESS;
    }
/* Recovery: phase 2 -- build modification trees, and
   construct dictionary of segment short names

   Scans the log in reverse from the previous epoch's tail to its head.
   Transaction records are validated for wrap/split (chk_wrap) and their
   nv ranges entered in per-segment change trees (do_trans); log_seg
   records populate the segment dictionary (def_seg_dict).  Returns the
   first error from any scan/processing step, else RVM_SUCCESS. */
#define X(a)                            /* trace hook: expands to nothing */
static rvm_return_t build_tree(log)
    log_t           *log;               /* log descriptor */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_return_t    retval;             /* return value */
    rvm_offset_t    tail;               /* tail offset temp */
    rec_end_t       *rec_end;           /* last record scanned in buffer */
    rvm_length_t    trans_cnt = 0;      /* transactions processed */
    rvm_bool_t      force_wrap_chk = rvm_false; /* true if suspect bad wrap */
    rvm_bool_t      skip_trans;         /* true if bad wrap trans to be skipped */

    /* enter the tree-build phase of truncation */
    assert(log->trunc_thread == cthread_self());
    assert(((status->trunc_state & RVM_TRUNC_PHASES) == RVM_TRUNC_FIND_TAIL)
            || ((status->trunc_state & RVM_TRUNC_PHASES) == ZERO));
    status->trunc_state = (status->trunc_state & (~RVM_TRUNC_FIND_TAIL))
                           | RVM_TRUNC_BUILD_TREE;

    /* reset sequence checks and init scan buffers */
X(reset_hdr)
    reset_hdr_chks(log);
X(clear_aux)
    clear_aux_buf(log);
X(init_buf)
    /* empty epoch: scan forward from start; otherwise reverse from tail */
    if (RVM_OFFSET_EQL(status->prev_log_tail, status->log_start))
        retval = init_buffer(log,&status->log_start, FORWARD,SYNCH);
    else
        retval = init_buffer(log,&status->prev_log_tail, REVERSE,SYNCH);
    assert(retval == RVM_SUCCESS);
    assert(log->trunc_thread == cthread_self());
X(done_init_buf)
    /* scan in reverse from tail to find records for uncommitted changes */
    num_nodes = NODES_PER_YIELD;
    log_buf->split_ok = rvm_false;      /* split records not checked yet */
    tail = status->prev_log_tail;       /* use previous epoch tail */
    while (!RVM_OFFSET_EQL(tail,status->prev_log_head))
        {
X(start loop)
        if ((retval=scan_reverse(log,SYNCH)) != RVM_SUCCESS)
            return retval;
X(done scan_reverse)
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_BUILD_TREE);
        if (rvm_chk_sigint != NULL)     /* test for interrupt */
            if ((*rvm_chk_sigint)(NULL)) return RVM_SUCCESS;
        assert((long)log_buf->ptr >= 0); /* log damage, invalid record */

        /* check type of end marker, do type-dependent processing */
        rec_end = (rec_end_t *)&log_buf->buf[log_buf->ptr];
        if (rec_end->rec_hdr.struct_id == log_wrap_id)
            {
X(log_wrap)
            /* wrap marker without a validated split: force full check
               of the next transaction's wrap */
            if (!log_buf->split_ok)
                force_wrap_chk = rvm_true;
            }
        else
            {
X(else)
            assert(rec_end->rec_hdr.struct_id == rec_end_id);
            switch (rec_end->rec_type)
                {
              case trans_hdr_id:        /* process transaction */
X( trans_hdr_id: chk_wrap)
                if ((retval=chk_wrap(log,force_wrap_chk,&skip_trans))
                    != RVM_SUCCESS) return retval;
                force_wrap_chk = rvm_false;
X( trans_hdr_id: do_trans)
                if ((retval=do_trans(log,skip_trans)) != RVM_SUCCESS)
                    return retval;
X( trans_hdr_id: end)
                trans_cnt++;
                break;
              case log_seg_id:          /* enter seg short id in dictionary */
X( log_seg_id: def_seg_dict)
                if ((retval=def_seg_dict(log,(rec_hdr_t *)
                    RVM_SUB_LENGTH_FROM_ADDR(rec_end,
                                        rec_end->rec_hdr.rec_length)))
                    != RVM_SUCCESS) return retval;
X( log_seg_id: done)
                log_buf->ptr -= rec_end->rec_hdr.rec_length;
                break;
              default:  assert(rvm_false); /* trouble, log damage? */
                }
            }

        /* update local tail ptr */
        tail = RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,log_buf->ptr);
        }

    /* leave buffer unprotected for later phases */
/* MACH_RVM_PROTECT
 *
 * protect(log_buf->buf, log_buf->length, FALSE, VM_PROT_WRITE | VM_PROT_READ);
 */

    return RVM_SUCCESS;
    }
2047 /* pre-scan change tree to see how much to read to read into buffer */
pre_scan(log,tree)2048 static dev_region_t *pre_scan(log,tree)
2049     log_t           *log;               /* log descriptor */
2050     tree_root_t     *tree;              /* current tree root */
2051     {
2052     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
2053     dev_region_t    *last_node = NULL;
2054     dev_region_t    *node;              /* current change tree node */
2055     rvm_offset_t    temp;
2056 
2057     /* find node with least offset */
2058     node = (dev_region_t *)tree->root;
2059     /* XXX - Can node ever be NULL?  If so, last_node can be random */
2060     /* I currently believe it must be NON-null */
2061     assert(node != NULL);
2062     while (node != NULL)
2063         {
2064         assert(node->links.node.struct_id == dev_region_id);
2065         last_node = node;
2066         node = (dev_region_t *)node->links.node.lss;
2067         }
2068     log_buf->offset = CHOP_OFFSET_TO_SECTOR_SIZE(last_node->offset);
2069 
2070     /* scan for maximum offset node that will fit in buffer */
2071     node = (dev_region_t *)tree->root;
2072     while (node != NULL)
2073         {
2074         assert(node->links.node.struct_id == dev_region_id);
2075 
2076         /* compute buffer extension for this node */
2077         temp = RVM_SUB_OFFSETS(node->end_offset,log_buf->offset);
2078         temp = ROUND_OFFSET_TO_SECTOR_SIZE(temp);
2079 
2080         /* see if will fit in log buffer */
2081         if (RVM_OFFSET_GTR(temp,log_buf->buf_len))
2082             node = (dev_region_t *)node->links.node.lss; /* try smaller */
2083         else
2084             {
2085             /* see if there's another that will also fit */
2086             last_node = node;
2087             node = (dev_region_t *)node->links.node.gtr;
2088             }
2089         }
2090 
2091     return last_node;
2092     }
2093 /* merge large node disk-resident new values with segment data */
disk_merge(log,node,preload)2094 static rvm_return_t disk_merge(log,node,preload)
2095     log_t           *log;               /* log descriptor */
2096     dev_region_t    *node;              /* node to merge */
2097     rvm_bool_t      preload;            /* end sector preload done if true */
2098     {
2099     log_status_t    *status = &log->status; /* status descriptor */
2100     log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
2101     rvm_length_t    data_len=0;         /* actual nv data length read */
2102     rvm_length_t    buf_ptr;            /* log buffer ptr */
2103     rvm_length_t    aux_ptr;            /* aux buffer ptr
2104                                            (compensates for sector alignment) */
2105     rvm_length_t    tmp_ptr;            /* temporary buffer ptr */
2106     long            rw_length;          /* actual i/o transfer length */
2107     rvm_offset_t    end_offset;         /* end offset temporary */
2108     rvm_return_t    retval;             /* return value */
2109     rvm_bool_t      was_preloaded = preload; /* save preload state */
2110 
2111     assert(log->trunc_thread == cthread_self());
2112     assert((status->trunc_state & RVM_TRUNC_PHASES)
2113            == RVM_TRUNC_APPLY);
2114     assert(node->links.node.struct_id == dev_region_id);
2115 
2116     /* set log buffer pointer and end offset */
2117     end_offset = CHOP_OFFSET_TO_SECTOR_SIZE(node->end_offset);
2118     buf_ptr = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(node->offset,
2119                                                    log_buf->offset));
2120     node->log_offset = RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,
2121                                                 BYTE_SKEW(buf_ptr));
2122     DO_FOREVER
2123         {                               /* fill log buffer from aux buf */
2124         while ((buf_ptr < log_buf->length)
2125                && (node->length != 0))
2126             {
2127             /* see how much to get in this pass & load aux_buf */
2128             if ((log_buf->length-buf_ptr) < node->length)
2129                 rw_length = log_buf->length-buf_ptr; /* fill log_buf */
2130             else
2131                 rw_length = node->length; /* get all remaining */
2132             if ((retval=load_aux_buf(log,&node->log_offset,rw_length,
2133                                      &aux_ptr,&data_len,SYNCH,rvm_true))
2134                 != RVM_SUCCESS) return retval;
2135             /* sanity checks and monitoring */
2136             assert((aux_ptr+data_len) <= log_buf->aux_rlength);
2137             assert((buf_ptr+data_len) <= log_buf->length);
2138             assert(BYTE_SKEW(aux_ptr) == BYTE_SKEW(node->vmaddr));
2139             assert((long)(node->length-data_len) >= 0);
2140             if (rvm_chk_len != 0)
2141                 monitor_vmaddr(node->vmaddr,data_len,
2142                                &log_buf->aux_buf[aux_ptr],NULL,NULL,
2143                                "disk_merge: data read from log:");
2144 
2145             /* preload of last modified segment sector */
2146             if (RVM_OFFSET_GTR(RVM_ADD_LENGTH_TO_OFFSET(
2147                                node->offset,data_len),end_offset)
2148                 && (!preload))
2149                 {
2150                 /* must load last sector of mods from segment */
2151                 tmp_ptr = CHOP_TO_SECTOR_SIZE(buf_ptr+data_len);
2152                 if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
2153                     {
2154                     cthread_yield();    /* allow reschedule */
2155                     assert(log->trunc_thread == cthread_self());
2156                     }
2157                 if ((rw_length=read_dev(log->cur_seg_dev,&end_offset,
2158                              &log_buf->buf[tmp_ptr],SECTOR_SIZE)) < 0)
2159                     return RVM_EIO;
2160                 assert(log->trunc_thread == cthread_self());
2161                 assert((status->trunc_state & RVM_TRUNC_PHASES)
2162                        == RVM_TRUNC_APPLY);
2163                 assert(rw_length == SECTOR_SIZE);
2164                 preload = rvm_true;
2165 
2166                 /* monitor data from last sector */
2167                 if (rvm_chk_len != 0)
2168                     monitor_vmaddr(node->vmaddr,data_len,
2169                                    &log_buf->buf[buf_ptr],NULL,NULL,
2170                                    "disk_merge: data read from segment:");
2171                 }
2172 
2173             /* copy to segment (in log buffer) */
2174             (void)BCOPY(&log_buf->aux_buf[aux_ptr],
2175                         &log_buf->buf[buf_ptr],data_len);
2176 
2177             /* tally bytes merged & do monitoring */
2178             if (rvm_chk_len != 0)
2179                 {
2180                 monitor_vmaddr(node->vmaddr,data_len,
2181                                &log_buf->buf[buf_ptr],NULL,NULL,
2182                                "disk_merge: data merged to segment:");
2183                 }
2184             node->length -= data_len;
2185             node->vmaddr += data_len;
2186             node->log_offset =
2187                 RVM_ADD_LENGTH_TO_OFFSET(node->log_offset,data_len);
2188             node->offset =
2189                 RVM_ADD_LENGTH_TO_OFFSET(node->offset,data_len);
2190             buf_ptr += data_len;
2191             /* if done, set final write length */
2192             if (node->length == 0)
2193                 {
2194                 assert(RVM_OFFSET_EQL(node->offset,
2195                                       node->end_offset));
2196                 end_offset =
2197                     RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,buf_ptr);
2198                 assert(RVM_OFFSET_EQL(end_offset,node->end_offset));
2199                 if (!was_preloaded)
2200                     log_buf->r_length = ROUND_TO_SECTOR_SIZE(buf_ptr);
2201                 return RVM_SUCCESS;
2202                 }
2203             }
2204 
2205         /* write buffer to segment & monitor */
2206         assert(buf_ptr == log_buf->length);
2207         if ((rw_length=write_dev(log->cur_seg_dev,&log_buf->offset,
2208                                  log_buf->buf,log_buf->length,
2209                                  SYNCH))
2210             < 0) return RVM_EIO;
2211         assert(log->trunc_thread == cthread_self());
2212         assert((status->trunc_state & RVM_TRUNC_PHASES)
2213                == RVM_TRUNC_APPLY);
2214         assert(rw_length == log_buf->length);
2215         if (rvm_chk_len != 0)
2216             monitor_vmaddr(node->vmaddr-data_len,data_len,
2217                            &log_buf->buf[buf_ptr-data_len],NULL,NULL,
2218                            "disk_merge: data written to segment:");
2219         if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
2220             {
2221             cthread_yield();            /* allow reschedule */
2222             assert(log->trunc_thread == cthread_self());
2223             assert((status->trunc_state & RVM_TRUNC_PHASES)
2224                    == RVM_TRUNC_APPLY);
2225             }
2226         log_buf->offset =
2227             RVM_ADD_LENGTH_TO_OFFSET(log_buf->offset,buf_ptr);
2228         buf_ptr = 0;
2229         assert(OFFSET_TO_SECTOR_INDEX(log_buf->offset) == 0);
2230         }
2231     }
/* merge node's new values with segment data in buffer */
/* merge_node: apply one change-tree node's new-value data to the
 * segment image currently staged in the log buffer.
 *
 * The node's data is either held inline in the node (log_offset == 0)
 * or still resides in the log on disk; the on-disk case is delegated
 * to disk_merge, which stages it through the auxiliary buffer.  The
 * node is freed afterwards, and the truncation thread periodically
 * yields (every NODES_PER_YIELD-ish nodes) unless running in
 * recovery, rvmutl, or no-yield mode.
 *
 * returns RVM_SUCCESS, or an error code propagated from disk_merge.
 */
static rvm_return_t merge_node(log,node,preload)
    log_t           *log;               /* log descriptor */
    dev_region_t    *node;              /* current change tree node */
    rvm_bool_t      preload;            /* end sector preload done if true */
    {
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    rvm_length_t    temp;               /* node's byte offset within buffer */
    rvm_return_t    retval;             /* return value */

    /* do monitoring and merge node data into segment */
    if (RVM_OFFSET_EQL_ZERO(node->log_offset))
        {                               /* data in node */
        if (rvm_chk_len != ZERO)
            monitor_vmaddr(node->vmaddr,node->length,
                           node->nv_ptr,NULL,NULL,
                           "merge_node: data copied from node:");
        /* copy new values straight from the node into the buffered
           segment image at the node's position within the buffer */
        temp = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(node->offset,
                                                    log_buf->offset));
        assert((temp+node->length) <= log_buf->r_length);
        dest_aligned_bcopy(node->nv_ptr,&log_buf->buf[temp],
                           node->length);
        }
    else                                /* data on disk -- use aux_buf */
        if ((retval=disk_merge(log,node,preload)) != RVM_SUCCESS)
            return retval;

    /* free node and check for yield */
    (void) free_dev_region(node);
    if (num_nodes-- == 0)
        {
        num_nodes = NODES_PER_YIELD;    /* reset per-yield node budget */
        if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
            {
            cthread_yield();            /* allow reschedule */
            assert(log->trunc_thread == cthread_self());
            }
        }

    return RVM_SUCCESS;
    }
2273 
/* update_seg: apply a segment's entire change tree to its backing
 * device (worker for recovery phase 3, one call per modified segment).
 *
 * Loop structure: pre-scan the tree to select a run of nodes that
 * fits the log buffer, read the matching segment region into the
 * buffer, merge the selected nodes' new values into it, and write
 * the buffer back to the segment device.  Outside recovery/rvmutl
 * the segment's dev_lock is held around the whole update.
 *
 * returns RVM_SUCCESS, RVM_EIO on device errors, or an error code
 * propagated from merge_node; an rvmutl SIGINT also aborts early.
 */
static rvm_return_t update_seg(log,seg_dict,seg_dev)
    log_t           *log;               /* log descriptor */
    seg_dict_t      *seg_dict;          /* segment dictionary entry */
    device_t        *seg_dev;           /* segment device descriptor */
{
    log_status_t    *status = &log->status; /* status descriptor */
    log_buf_t       *log_buf = &log->log_buf; /* log buffer descriptor */
    long            r_length;           /* length of data transfered */
    rvm_bool_t      preload;            /* end sector preload done if true */
    char            *addr=NULL;         /* monitoring address */
    rvm_offset_t    temp;               /* offset temporary */
    dev_region_t    *node;              /* current node */
    dev_region_t    *last_node;         /* last node before buffer write */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */
    long            nodes_done = 0;     /* nodes processed, for tree check */

    /* sanity checks and initializations */
    assert(&log->dev != seg_dev);       /* never write the log device here */
    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_APPLY);
    rvm_num_nodes = seg_dict->mod_tree.n_nodes;
    rvm_max_depth = seg_dict->mod_tree.max_depth;
    clear_aux_buf(log);

    /* process the change tree */
    if (!(log->in_recovery || rvm_utlsw)) /* begin segment dev_lock crit sec
                                             */
        {
        mutex_lock(&seg_dict->seg->dev_lock);
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        }
    while (seg_dict->mod_tree.root != NULL)
        {
        /* pre-scan tree to determine how to fill buffer */
        last_node = pre_scan(log,&seg_dict->mod_tree);

        /* initialize buffer with segment data */
        temp = RVM_SUB_OFFSETS(last_node->end_offset,
                               log_buf->offset);
        temp = ROUND_OFFSET_TO_SECTOR_SIZE(temp);
        if (RVM_OFFSET_LEQ(temp,log_buf->buf_len))
            {
            /* node(s) fit in log buffer */
            log_buf->r_length = RVM_OFFSET_TO_LENGTH(RVM_SUB_OFFSETS(
                                    last_node->end_offset,
                                    log_buf->offset));
            log_buf->r_length =
                ROUND_TO_SECTOR_SIZE(log_buf->r_length);
            assert(log_buf->r_length <= log_buf->length);
            preload = rvm_true;         /* whole span read, incl. end sector */
            }
        else
            {
            log_buf->r_length = SECTOR_SIZE; /* very large node!! */
            preload = rvm_false;        /* disk_merge must preload end sector */
            }
        /* allow reschedule & do the read */
        if (!(log->in_recovery || rvm_utlsw || rvm_no_yield))
            {
            cthread_yield();
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            }
        if ((r_length=read_dev(seg_dev,&log_buf->offset,
                           log_buf->buf,log_buf->r_length)) < 0)
            {
            retval = RVM_EIO;
            goto err_exit;
            }
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        assert(r_length == log_buf->r_length);

        /* merge selected nodes into buffer */
        num_nodes = NODES_PER_YIELD;
        UNLINK_NODES_OF(seg_dict->mod_tree,dev_region_t,node)
            {
            assert(node->links.node.struct_id == dev_region_id);
            nodes_done++;

            /* do monitoring */
            if (rvm_chk_len != 0)
                {
                temp = log_buf->offset; /* remember offset before merging */
                addr = (char *)CHOP_TO_SECTOR_SIZE(node->vmaddr);
                monitor_vmaddr(addr,log_buf->r_length,log_buf->buf,
                               &log_buf->offset,NULL,
                               "update_seg: data read from segment:");
                }

            /* merge data */
            if ((retval=merge_node(log,node,preload))
                != RVM_SUCCESS) goto err_exit;
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;
            if (node == last_node) break; /* end of pre-scanned run */
            }

        /* update the segment on disk */
        if ((r_length=write_dev(seg_dev,&log_buf->offset,log_buf->buf,
                                log_buf->r_length,rvm_true)) < 0)
            {
            retval = RVM_EIO;
            goto err_exit;
            }
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        assert(r_length == log_buf->r_length);
        /* do monitoring */
        if (rvm_chk_len != 0)
            {
            /* if disk_merge advanced the buffer offset, advance the
               monitoring address by the same amount */
            if (!RVM_OFFSET_EQL(temp,log_buf->offset))
                addr=RVM_ADD_LENGTH_TO_ADDR(addr,RVM_OFFSET_TO_LENGTH(
                         RVM_SUB_OFFSETS(log_buf->offset,temp)));
            monitor_vmaddr(addr,log_buf->r_length,log_buf->buf,
                           &log_buf->offset,NULL,
                           "update_seg: data written to segment:");
            }
        }

    /* tree checks and cleanup after unlinking */
    assert(nodes_done == rvm_num_nodes);
    assert(seg_dict->mod_tree.n_nodes == 0);

err_exit:
    if (!(log->in_recovery || rvm_utlsw)) /* end segment dev_lock crit sec */
        {
        mutex_unlock(&seg_dict->seg->dev_lock);
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);
        }
    return retval;
    }
/* Recovery: phase 3 -- apply modifications to segments */
/* apply_mods: iterate over the segment dictionary and, for every
 * segment with a non-empty change tree, apply its modifications via
 * update_seg.  Transitions trunc_state BUILD_TREE -> APPLY.
 *
 * During recovery the segment devices are not yet open, so each one
 * is opened (read-only when rvm_no_update is set), characterized,
 * and closed again here; otherwise the already-open device attached
 * to the segment descriptor is used.
 *
 * returns RVM_SUCCESS, RVM_EIO on device errors, or an error code
 * propagated from update_seg.
 */
static rvm_return_t apply_mods(log)
    log_t           *log;               /* log descriptor */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    seg_dict_t      *seg_dict;          /* current segment dictionary entry */
    device_t        *seg_dev;           /* segment device descriptor */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */
    long            i;                  /* loop counter */
    rvm_length_t    flags = O_RDWR;     /* open mode for segment devices */

    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_BUILD_TREE);
    status->trunc_state = (status->trunc_state & ~RVM_TRUNC_BUILD_TREE)
                           | RVM_TRUNC_APPLY;

    /* iterate through segment dictionary */
    for (i=0;i<log->seg_dict_len;i++)
        {
        seg_dict = &log->seg_dict_vec[i];
        assert(seg_dict->struct_id == seg_dict_id);

        if (seg_dict->mod_tree.root == NULL)
            continue;                   /* no changes to this seg */

        /* open device and get characteristics if necessary */
        if (log->in_recovery)
            {
            seg_dev = &seg_dict->dev;
            if (rvm_no_update) flags = O_RDONLY;
            if (open_dev(seg_dev,flags,0) < 0)
                return RVM_EIO;
            assert(log->trunc_thread == cthread_self());
            if (set_dev_char(seg_dev,&seg_dev->num_bytes) < 0)
                {
                close_dev(seg_dev);     /* don't leak the open device */
                return RVM_EIO;
                }
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            }
        else
            {
            assert(seg_dict->seg->links.struct_id == seg_id);
            seg_dev = &(seg_dict->seg->dev); /* already open */
            }
        log->cur_seg_dev = seg_dev;     /* used by disk_merge's reads */

        /* read segment data and merge new values */
        if ((retval=update_seg(log,seg_dict,seg_dev))
            != RVM_SUCCESS) return retval;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_APPLY);

        /* close segment device if in recovery */
        if (log->in_recovery)
            if (close_dev(seg_dev) < 0)
                return RVM_EIO;
        }

    /* re-protect buffer */
/* MACH_RVM_PROTECT
 *
 * protect(log->log_buf.buf, log->log_buf.length, FALSE, VM_PROT_READ);
 */

    return retval;
    }
/* Recovery: phase 4 -- update head/tail of log */
/* status_update: commit the truncation by rewriting the log status
 * block.  Transitions trunc_state APPLY -> UPDATE.
 *
 * Under the log device lock: record the epoch timestamp, reset the
 * previous-epoch head/tail (or clear the whole status if the log is
 * now empty), fold the truncation timings into the statistics
 * histograms, and write the status block to the log device.  After
 * a successful write during recovery, the segment dictionary is
 * freed and log->in_recovery is cleared.
 *
 * returns RVM_SUCCESS, RVM_EIO if gettimeofday fails, or an error
 * code from write_log_status.
 */
static rvm_return_t status_update(log, new_1st_rec_num)
    log_t           *log;               /* log descriptor */
    rvm_length_t    new_1st_rec_num;    /* first record number of new epoch */
    {
    log_status_t    *status = &log->status; /* status descriptor */
    struct timeval  end_time;           /* end of action time temp */
    int             kretval;            /* kernel call return value */
    rvm_return_t    retval = RVM_SUCCESS; /* return value */

    assert(log->trunc_thread == cthread_self());
    assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_APPLY);
    status->trunc_state = (status->trunc_state & ~RVM_TRUNC_APPLY)
                           | RVM_TRUNC_UPDATE;

    /* update the status block on disk */
    CRITICAL(log->dev_lock,             /* begin log device lock crit sec */
        {
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_UPDATE);
        status->prev_trunc = status->last_trunc;

        if (RVM_OFFSET_EQL(status->log_head,status->log_tail))
            clear_log_status(log);      /* log empty */
        else
            {
            /* previous epoch fully applied: forget it and advance
               the first record number to the new epoch's start */
            RVM_ZERO_OFFSET(status->prev_log_head);
            RVM_ZERO_OFFSET(status->prev_log_tail);
            status->first_rec_num = new_1st_rec_num;
            }

        /* end timings */
        kretval= gettimeofday(&end_time,(struct timezone *)NULL);
        if (kretval != 0) goto err_exit; /* skip stats & write; EIO below */
        end_time = sub_times(&end_time,&trunc_start_time);
        status->tot_truncation_time =
            add_times(&status->tot_truncation_time,&end_time);
        status->last_truncation_time = round_time(&end_time);
        enter_histogram(status->last_truncation_time,
                            log->status.tot_truncation_times,
                            truncation_times_vec,truncation_times_len);
        status->last_tree_build_time = last_tree_build_time;
        enter_histogram(last_tree_build_time,
                        log->status.tot_tree_build_times,
                        truncation_times_vec,truncation_times_len);
        status->last_tree_apply_time = last_tree_apply_time;
        enter_histogram(last_tree_apply_time,
                        log->status.tot_tree_apply_times,
                        truncation_times_vec,truncation_times_len);

        retval = write_log_status(log,NULL);
err_exit:;
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
           == RVM_TRUNC_UPDATE);
        });                             /* end log device lock crit sec */
    if (kretval != 0) return RVM_EIO;   /* gettimeofday failed above */
    if (retval != RVM_SUCCESS) return retval;

    if (log->in_recovery && (!rvm_utlsw)) /* do recovery-only processing */
        {
        /* kill segment dictionary */
        free_seg_dict_vec(log);

        log->in_recovery = rvm_false;
        }

    return retval;
    }
2556 /* switch truncation epochs */
new_epoch(log,count)2557 static rvm_return_t new_epoch(log,count)
2558     log_t           *log;               /* log descriptor */
2559     rvm_length_t    *count;             /* ptr to statistics counter */
2560     {
2561     log_status_t    *status = &log->status; /* log status descriptor */
2562     rvm_return_t    retval = RVM_SUCCESS;
2563 
2564     /* be sure last records in truncation are in log */
2565     assert(log->trunc_thread == cthread_self());
2566     if (sync_dev(&log->dev) < 0)
2567         return RVM_EIO;
2568     assert(log->trunc_thread == cthread_self());
2569 
2570     /* count truncations & accumulate statistics */
2571     (*count)++;
2572     copy_log_stats(log);
2573 
2574     /* set up head/tail pointers for truncation */
2575     status->prev_log_head = status->log_head;
2576     status->log_head = status->log_tail;
2577     status->prev_log_tail = status->log_tail;
2578     status->last_rec_num = status->next_rec_num-1;
2579 
2580     /* set epoch time stamp and write status block */
2581     make_uname(&status->last_trunc);
2582     if ((retval=write_log_status(log,NULL)) != RVM_SUCCESS)
2583         return retval;
2584     assert(log->trunc_thread == cthread_self());
2585 
2586     /* restore log segment definitions */
2587     retval = define_all_segs(log);
2588     assert(log->trunc_thread == cthread_self());
2589     return retval;
2590     }
2591 
/* recover committed state from log */
/* log_recover: top-level truncation/recovery driver, run under
 * log->truncation_lock by exactly one thread at a time.  Phases:
 *   phase 1 (dev_lock held): locate_tail (only when in recovery)
 *            and, if the log is non-empty, new_epoch
 *   phase 2: build_tree  -- build per-segment change trees
 *   phase 3: apply_mods  -- merge new values into segments
 *   phase 4: status_update -- always run; commits head/tail update
 *
 * `flag' tags trunc_state with the truncation type (sync/async/...).
 * When called by the daemon (is_daemon), initiate_truncation is
 * signalled via daemon->flush_flag once phase 1 completes, and any
 * error is reported by setting daemon->state = error.
 *
 * The X(...) markers are trace points (macro #undef'd after this
 * function); phase-2/3 timing failures deliberately assert rather
 * than return -- see the commented-out returns.
 */
rvm_return_t log_recover(log,count,is_daemon,flag)
    log_t           *log;               /* log descriptor */
    rvm_length_t    *count;             /* ptr to statistics counter */
    rvm_bool_t      is_daemon;          /* true if called by daemon */
    rvm_length_t    flag;               /* truncation type flag */
{
    log_status_t    *status = &log->status; /* log status descriptor */
    log_daemon_t    *daemon = &log->daemon; /* log daemon descriptor */
    struct timeval  end_time;           /* end of action time temp */
    struct timeval  tmp_time;           /* local timing temp */
    int             kretval;            /* kernel call return value */
    rvm_bool_t      do_truncation = rvm_false; /* true if log non-empty */
    rvm_return_t    retval = RVM_SUCCESS;
    rvm_length_t    new_1st_rec_num=0;  /* 1st record number of new epoch */
X(start)
    CRITICAL(log->truncation_lock,      /* begin truncation lock crit sec */
        {
        /* capture truncation thread & flag for checking */
        assert(log->trunc_thread == (cthread_t)NULL);
        assert(status->trunc_state == ZERO);
        log->trunc_thread = cthread_self();
        status->trunc_state = flag;
X(dev_lock)
        CRITICAL(log->dev_lock,         /* begin dev_lock crit sec */
            {
            /* process statistics */
            assert(log->trunc_thread == cthread_self());
            kretval= gettimeofday(&trunc_start_time,
                                  (struct timezone *)NULL);
            if (kretval != 0)
                {
                retval = RVM_EIO;
                goto err_exit1;
                }
            last_tree_build_time = 0;
            last_tree_apply_time = 0;
X(in_recovery)
            /* phase 1: locate tail & start new epoch */
            if (log->in_recovery)
                {
                if ((retval=locate_tail(log)) != RVM_SUCCESS)
                    goto err_exit1;
                assert((status->trunc_state & RVM_TRUNC_PHASES)
                       == RVM_TRUNC_FIND_TAIL);
                }
            assert(log->trunc_thread == cthread_self());
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit1;
            /* see if truncation actually needed */
            if (RVM_OFFSET_EQL(status->log_tail,status->log_head))
                status->log_empty = rvm_true;
            else
                {
                status->log_empty = rvm_false;
                do_truncation = rvm_true;
                new_1st_rec_num = status->next_rec_num;

                /* switch epochs */
                if ((retval=new_epoch(log,count)) != RVM_SUCCESS)
                    goto err_exit1;
                assert(log->trunc_thread == cthread_self());
                }

X(err_exit1)
err_exit1:;
	    /* signal `initiate_truncation' that the first part is done */
	    if (is_daemon)
		{
		mutex_lock(&daemon->lock);
		assert(log->daemon.thread == cthread_self());
		assert(daemon->state == truncating);
		assert((status->trunc_state & RVM_ASYNC_TRUNCATE) != 0);
		condition_signal(&daemon->flush_flag);
		mutex_unlock(&daemon->lock);
		}
            });                         /* end dev_lock crit sec */

        if (retval != RVM_SUCCESS) goto err_exit;
        if (rvm_chk_sigint != NULL)     /* test for interrupt */
            if ((*rvm_chk_sigint)(NULL)) goto err_exit;
        /* do log scan if truncation actually needed */
        if (do_truncation)
            {
X(do_trunc)
            /* build tree and time */
            kretval= gettimeofday(&tmp_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
X(build_tree)
            if ((retval=build_tree(log)) != RVM_SUCCESS) /* phase 2 */
                assert(0); /* return retval; */
X(build_tree done)
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_BUILD_TREE);

            kretval= gettimeofday(&end_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            end_time = sub_times(&end_time,&tmp_time);
            last_tree_build_time = round_time(&end_time);
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;

            /* apply tree and time */
            kretval= gettimeofday(&tmp_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
X(apply_mods)
            if ((retval=apply_mods(log)) != RVM_SUCCESS) /* phase 3 */
                goto err_exit;
X(apply_mods end)
            assert(log->trunc_thread == cthread_self());
            assert((status->trunc_state & RVM_TRUNC_PHASES)
                   == RVM_TRUNC_APPLY);
            kretval= gettimeofday(&end_time,(struct timezone *)NULL);
            if (kretval != 0) assert(0); /* return RVM_EIO; */
            end_time = sub_times(&end_time,&tmp_time);
            last_tree_apply_time = round_time(&end_time);
            if (rvm_chk_sigint != NULL) /* test for interrupt */
                if ((*rvm_chk_sigint)(NULL)) goto err_exit;
            }
        else
            /* nothing to apply: advance phase state directly */
            status->trunc_state =
                (status->trunc_state & ~RVM_TRUNC_PHASES)
                    | RVM_TRUNC_APPLY;
X(status_upd)
        /* always update the status */
        retval = status_update(log, new_1st_rec_num);    /* phase 4 */
        assert(log->trunc_thread == cthread_self());
        assert((status->trunc_state & RVM_TRUNC_PHASES)
               == RVM_TRUNC_UPDATE);
        /* wake up any threads waiting on a truncation */
err_exit:
        assert(log->trunc_thread == cthread_self());
        CRITICAL(daemon->lock,          /* begin daemon->lock crit sec */
            {
            assert(log->trunc_thread == cthread_self());
            if (is_daemon)
                {
                assert(log->daemon.thread == cthread_self());
                assert((status->trunc_state & RVM_ASYNC_TRUNCATE) != 0);
                assert(daemon->state == truncating);
                if (retval != RVM_SUCCESS)
                    daemon->state = error; /* report failure to daemon */
                }
            assert(log->trunc_thread == cthread_self());
            });                         /* end daemon->lock crit sec */

        /* release ownership of the truncation machinery */
        log->trunc_thread = (cthread_t)NULL;
        status->trunc_state = ZERO;
        });                             /* end truncation lock crit sec */

    return retval;
}
2745 #undef X
2746 
2747 
2748 /* rvm_truncate */
rvm_truncate()2749 rvm_return_t rvm_truncate()
2750 {
2751 	rvm_return_t    retval;
2752 
2753 	/* initial checks */
2754 	if (bad_init())
2755 		return RVM_EINIT;
2756 	if (default_log == NULL)
2757 		return RVM_ELOG;
2758 
2759     /* flush any queued records */
2760 	if ((retval=flush_log(default_log,
2761 			      &default_log->status.n_flush))
2762 	    != RVM_SUCCESS) return retval;
2763 
2764 	/* do truncation */
2765 	retval = log_recover(default_log,
2766 			     &default_log->status.tot_rvm_truncate,
2767 			     rvm_false,RVM_TRUNCATE_CALL);
2768 	return retval;
2769 }
2770 
2771 
2772 /* map & flush <--> truncation synchronization functions */
2773 
2774 /* initiate asynchronous truncation */
initiate_truncation(log,threshold)2775 rvm_bool_t initiate_truncation(log,threshold)
2776     log_t           *log;               /* log descriptor */
2777     rvm_length_t    threshold;          /* log % truncation threshold */
2778 {
2779     log_daemon_t    *daemon = &log->daemon; /* daemon control descriptor */
2780     rvm_bool_t      did_init = rvm_false; /* true if initiated truncation */
2781 
2782     /* test threshold for asynch truncation */
2783     if (!daemon->truncate || threshold < daemon->truncate)
2784 	return rvm_false;
2785 
2786     /* trigger a truncation if log at threshold */
2787     CRITICAL(daemon->lock,              /* begin daemon->lock crit sec */
2788         {
2789             /* wake up daemon if idle */
2790             if (daemon->state == rvm_idle)
2791 	    {
2792                 did_init = rvm_true;
2793                 daemon->state = truncating;
2794                 condition_signal(&daemon->code);
2795                 condition_wait(&daemon->flush_flag,&daemon->lock);
2796 	    }
2797         });                             /* end daemon->lock crit sec */
2798 
2799     return did_init;
2800 }
/* wait until truncation has processed all records up to time_stamp */
/* wait_for_truncation: block the caller until a truncation covering
 * all records with timestamps <= *time_stamp has completed.
 *
 * If the daemon is disabled or idle, fall back to a synchronous
 * truncation done by this thread.  Otherwise wait for the running
 * truncation to finish; if its epoch stamp is still older than
 * time_stamp, kick the daemon again and wait for the next round.
 * A NULL time_stamp is satisfied by any completed truncation.
 *
 * returns RVM_SUCCESS, RVM_EINTERNAL if the daemon reported an
 * error, or the result of the synchronous log_recover call.
 */
rvm_return_t wait_for_truncation(log,time_stamp)
    log_t           *log;               /* log descriptor */
    struct timeval  *time_stamp;        /* time threshold */
    {
    log_daemon_t    *daemon = &log->daemon; /* daemon control descriptor */
    log_status_t    *status = &log->status; /* log status descriptor */
    rvm_bool_t      force_trunc = rvm_false; /* do synchronous truncation */
    rvm_bool_t      exit_sw = rvm_false; /* true when wait loop should end */
    rvm_return_t    retval = RVM_SUCCESS;

    while (!exit_sw)
        {
        CRITICAL(daemon->lock,          /* begin daemon lock crit sec */
            {
            /* synchronous truncation if daemon not in use */
            if ((daemon->truncate == 0) || (daemon->state == rvm_idle))
                {
                force_trunc = rvm_true;
                goto exit_wait;
                }

            /* wait for concurrent truncation completion */
            while (daemon->state == truncating)
                {
                condition_wait(&daemon->wake_up,&daemon->lock);
                }
            if (daemon->state == error)
                {
                retval = RVM_EINTERNAL; /* quit if daemon had error */
                goto exit_wait;
                }

            /* see if records up to time threshold have been processed */
            if ((time_stamp == NULL) ||
                (TIME_GEQ(status->last_trunc,*time_stamp)))
                goto exit_wait;         /* yes, exit */

            /* no, must trigger another truncation */
            daemon->state = truncating;
            condition_signal(&daemon->code);
            goto exit_crit_sec;         /* loop again to wait for it */

exit_wait:  exit_sw = rvm_true;
exit_crit_sec:;
            });                         /* end daemon lock crit sec */
        }

    /* do synchronous truncation */
    if (force_trunc)
        retval = log_recover(log,&log->status.tot_sync_truncation,
                             rvm_false,RVM_SYNC_TRUNCATE);

    return retval;
    }
/* truncation daemon */
/* log_daemon: thread body for the asynchronous truncation daemon.
 *
 * Loops forever: mark itself idle, broadcast wake_up so waiters in
 * wait_for_truncation can re-check, then sleep on daemon->code until
 * a request changes daemon->state.  A `truncating' request runs
 * log_recover; a `terminate' request (or an error) exits the thread
 * via cthread_exit.  `arg' is the log descriptor (log_t *).
 */
void log_daemon(void *arg)
    {
    log_t           *log = arg;               /* log descriptor */
    log_daemon_t    *daemon = &log->daemon; /* daemon control descriptor */
    daemon_state_t  state;              /* daemon state code */
    rvm_return_t    retval __attribute__((unused));

#ifdef RVM_USELWP
    PRE_Concurrent(1);                  /* enable LWP concurrency */
#endif

    DO_FOREVER
        {
        /* wait to be awakened by request */
        CRITICAL(daemon->lock,          /* begin daemon lock crit sec */
            {
		daemon->state = rvm_idle;
		condition_broadcast(&daemon->wake_up);
		while (daemon->state == rvm_idle) {
		    condition_wait(&daemon->code, &daemon->lock);
		}
		state = daemon->state;      /* end daemon lock crit sec */
            });

        /* process request */
        switch (state)
            {
          case truncating:                /* do a truncation */
            retval = log_recover(log,&log->status.tot_async_truncation,
                                 rvm_true,RVM_ASYNC_TRUNCATE);

            /* re-read state: log_recover sets it to `error' on failure,
               and a termination request may have arrived meanwhile */
            CRITICAL(daemon->lock, state = daemon->state);
            if (state == error)
                cthread_exit(retval);   /* error -- return code */
            if (state != terminate) break;
            /* fall through to terminate */

          case terminate:
#ifdef RVM_USELWP
	    daemon->thread = NULL;
#endif
            cthread_exit(RVM_SUCCESS);  /* normal exit */

          default:    assert(rvm_false);    /* error */
            }
        }
    }
2903