1 /* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
2 // vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
3 #ident "$Id$"
4 /*======
5 This file is part of PerconaFT.
6 
7 
8 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
9 
10     PerconaFT is free software: you can redistribute it and/or modify
11     it under the terms of the GNU General Public License, version 2,
12     as published by the Free Software Foundation.
13 
14     PerconaFT is distributed in the hope that it will be useful,
15     but WITHOUT ANY WARRANTY; without even the implied warranty of
16     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17     GNU General Public License for more details.
18 
19     You should have received a copy of the GNU General Public License
20     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
21 
22 ----------------------------------------
23 
24     PerconaFT is free software: you can redistribute it and/or modify
25     it under the terms of the GNU Affero General Public License, version 3,
26     as published by the Free Software Foundation.
27 
28     PerconaFT is distributed in the hope that it will be useful,
29     but WITHOUT ANY WARRANTY; without even the implied warranty of
30     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
31     GNU Affero General Public License for more details.
32 
33     You should have received a copy of the GNU Affero General Public License
34     along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
35 ======= */
36 
37 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
38 
39 #include "ft/ft.h"
40 #include "ft/ft-internal.h"
41 #include "ft/msg.h"
42 #include "ft/serialize/block_allocator.h"
43 #include "ft/serialize/block_table.h"
44 #include "ft/serialize/compress.h"
45 #include "ft/serialize/ft-serialize.h"
46 
47 // not version-sensitive because we only serialize a descriptor using the current layout_version
48 uint32_t
toku_serialize_descriptor_size(DESCRIPTOR desc)49 toku_serialize_descriptor_size(DESCRIPTOR desc) {
50     //Checksum NOT included in this.  Checksum only exists in header's version.
51     uint32_t size = 4; // four bytes for size of descriptor
52     size += desc->dbt.size;
53     return size;
54 }
55 
56 static uint32_t
deserialize_descriptor_size(DESCRIPTOR desc,int layout_version)57 deserialize_descriptor_size(DESCRIPTOR desc, int layout_version) {
58     //Checksum NOT included in this.  Checksum only exists in header's version.
59     uint32_t size = 4; // four bytes for size of descriptor
60     if (layout_version == FT_LAYOUT_VERSION_13)
61         size += 4;   // for version 13, include four bytes of "version"
62     size += desc->dbt.size;
63     return size;
64 }
65 
toku_serialize_descriptor_contents_to_wbuf(struct wbuf * wb,DESCRIPTOR desc)66 void toku_serialize_descriptor_contents_to_wbuf(struct wbuf *wb, DESCRIPTOR desc) {
67     wbuf_bytes(wb, desc->dbt.data, desc->dbt.size);
68 }
69 
70 //Descriptor is written to disk during toku_ft_handle_open iff we have a new (or changed)
71 //descriptor.
72 //Descriptors are NOT written during the header checkpoint process.
73 void
toku_serialize_descriptor_contents_to_fd(int fd,DESCRIPTOR desc,DISKOFF offset)74 toku_serialize_descriptor_contents_to_fd(int fd, DESCRIPTOR desc, DISKOFF offset) {
75     // make the checksum
76     int64_t size = toku_serialize_descriptor_size(desc)+4; //4 for checksum
77     int64_t size_aligned = roundup_to_multiple(512, size);
78     struct wbuf w;
79     char *XMALLOC_N_ALIGNED(512, size_aligned, aligned_buf);
80     for (int64_t i=size; i<size_aligned; i++) aligned_buf[i] = 0;
81     wbuf_init(&w, aligned_buf, size);
82     toku_serialize_descriptor_contents_to_wbuf(&w, desc);
83     {
84         //Add checksum
85         uint32_t checksum = toku_x1764_finish(&w.checksum);
86         wbuf_int(&w, checksum);
87     }
88     lazy_assert(w.ndone==w.size);
89     {
90         //Actual Write translation table
91         toku_os_full_pwrite(fd, w.buf, size_aligned, offset);
92     }
93     toku_free(w.buf);
94 }
95 
96 static void
deserialize_descriptor_from_rbuf(struct rbuf * rb,DESCRIPTOR desc,int layout_version)97 deserialize_descriptor_from_rbuf(struct rbuf *rb, DESCRIPTOR desc, int layout_version) {
98     if (layout_version <= FT_LAYOUT_VERSION_13) {
99         // in older versions of tokuft, the descriptor had a 4 byte
100         // version, which we skip over
101         (void) rbuf_int(rb);
102     }
103 
104     uint32_t size;
105     const void *data;
106     rbuf_bytes(rb, &data, &size);
107     toku_memdup_dbt(&desc->dbt, data, size);
108 }
109 
110 static int
deserialize_descriptor_from(int fd,block_table * bt,DESCRIPTOR desc,int layout_version)111 deserialize_descriptor_from(int fd, block_table *bt, DESCRIPTOR desc, int layout_version) {
112     int r = 0;
113     DISKOFF offset;
114     DISKOFF size;
115     unsigned char *dbuf = nullptr;
116     bt->get_descriptor_offset_size(&offset, &size);
117     memset(desc, 0, sizeof(*desc));
118     if (size > 0) {
119         lazy_assert(size>=4); //4 for checksum
120         {
121             ssize_t size_to_malloc = roundup_to_multiple(512, size);
122             XMALLOC_N_ALIGNED(512, size_to_malloc, dbuf);
123             {
124 
125                 ssize_t sz_read = toku_os_pread(fd, dbuf, size_to_malloc, offset);
126                 lazy_assert(sz_read==size_to_malloc);
127             }
128             {
129                 // check the checksum
130                 uint32_t x1764 = toku_x1764_memory(dbuf, size-4);
131                 //printf("%s:%d read from %ld (x1764 offset=%ld) size=%ld\n", __FILE__, __LINE__, block_translation_address_on_disk, offset, block_translation_size_on_disk);
132                 uint32_t stored_x1764 = toku_dtoh32(*(int*)(dbuf + size-4));
133                 if (x1764 != stored_x1764) {
134                     fprintf(stderr, "Descriptor checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
135                     r = TOKUDB_BAD_CHECKSUM;
136                     toku_free(dbuf);
137                     goto exit;
138                 }
139             }
140 
141             struct rbuf rb = { .buf = dbuf, .size = (unsigned int) size, .ndone = 0 };
142             deserialize_descriptor_from_rbuf(&rb, desc, layout_version);
143             lazy_assert(deserialize_descriptor_size(desc, layout_version) + 4 == size);
144             toku_free(dbuf);
145         }
146     }
147 exit:
148     return r;
149 }
150 
deserialize_ft_versioned(int fd,struct rbuf * rb,FT * ftp,uint32_t version)151 int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
152 // Effect: Deserialize the ft header.
153 //   We deserialize ft_header only once and then share everything with all the FTs.
154 {
155     int r;
156     FT ft = NULL;
157     paranoid_invariant(version >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
158     paranoid_invariant(version <= FT_LAYOUT_VERSION);
159     // We already know:
160     //  we have an rbuf representing the header.
161     //  The checksum has been validated
162 
163     //Verification of initial elements.
164     //Check magic number
165     const void *magic;
166     rbuf_literal_bytes(rb, &magic, 8);
167     lazy_assert(memcmp(magic,"tokudata",8)==0);
168 
169     XCALLOC(ft);
170     ft->checkpoint_header = NULL;
171     toku_list_init(&ft->live_ft_handles);
172 
173     //version MUST be in network order on disk regardless of disk order
174     ft->layout_version_read_from_disk = rbuf_network_int(rb);
175     invariant(ft->layout_version_read_from_disk >= FT_LAYOUT_MIN_SUPPORTED_VERSION);
176     invariant(ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION);
177 
178     //build_id MUST be in network order on disk regardless of disk order
179     uint32_t build_id;
180     build_id = rbuf_network_int(rb);
181 
182     //Size MUST be in network order regardless of disk order.
183     uint32_t size;
184     size = rbuf_network_int(rb);
185     lazy_assert(size == rb->size);
186 
187     const void *tmp_byte_order_check;
188     lazy_assert((sizeof tmp_byte_order_check) >= 8);
189     rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
190     int64_t byte_order_stored;
191     byte_order_stored = *(int64_t*)tmp_byte_order_check;
192     lazy_assert(byte_order_stored == toku_byte_order_host);
193 
194     uint64_t checkpoint_count;
195     checkpoint_count = rbuf_ulonglong(rb);
196     LSN checkpoint_lsn;
197     checkpoint_lsn = rbuf_LSN(rb);
198     unsigned nodesize;
199     nodesize = rbuf_int(rb);
200     DISKOFF translation_address_on_disk;
201     translation_address_on_disk = rbuf_DISKOFF(rb);
202     DISKOFF translation_size_on_disk;
203     translation_size_on_disk = rbuf_DISKOFF(rb);
204     lazy_assert(translation_address_on_disk > 0);
205     lazy_assert(translation_size_on_disk > 0);
206 
207     // initialize the tree lock
208     toku_ft_init_reflock(ft);
209 
210     //Load translation table
211     {
212         size_t size_to_read = roundup_to_multiple(512, translation_size_on_disk);
213         unsigned char *XMALLOC_N_ALIGNED(512, size_to_read, tbuf);
214         {
215             // This cast is messed up in 32-bits if the block translation
216             // table is ever more than 4GB.  But in that case, the
217             // translation table itself won't fit in main memory.
218             ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
219                                            translation_address_on_disk);
220             invariant(readsz >= translation_size_on_disk);
221             invariant(readsz <= (ssize_t)size_to_read);
222         }
223         // Create table and read in data.
224         r = ft->blocktable.create_from_buffer(fd,
225                                               translation_address_on_disk,
226                                               translation_size_on_disk,
227                                               tbuf);
228         toku_free(tbuf);
229         if (r != 0) {
230             goto exit;
231         }
232     }
233 
234     BLOCKNUM root_blocknum;
235     root_blocknum = rbuf_blocknum(rb);
236     unsigned flags;
237     flags = rbuf_int(rb);
238     if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_13) {
239         // deprecate 'TOKU_DB_VALCMP_BUILTIN'. just remove the flag
240         flags &= ~TOKU_DB_VALCMP_BUILTIN_13;
241     }
242     int layout_version_original;
243     layout_version_original = rbuf_int(rb);
244     uint32_t build_id_original;
245     build_id_original = rbuf_int(rb);
246     uint64_t time_of_creation;
247     time_of_creation = rbuf_ulonglong(rb);
248     uint64_t time_of_last_modification;
249     time_of_last_modification = rbuf_ulonglong(rb);
250 
251     if (ft->layout_version_read_from_disk <= FT_LAYOUT_VERSION_18) {
252         // 17 was the last version with these fields, we no longer store
253         // them, so read and discard them
254         (void) rbuf_ulonglong(rb);  // num_blocks_to_upgrade_13
255         if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
256             (void) rbuf_ulonglong(rb);  // num_blocks_to_upgrade_14
257         }
258     }
259 
260     // fake creation during the last checkpoint
261     TXNID root_xid_that_created;
262     root_xid_that_created = checkpoint_lsn.lsn;
263     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_14) {
264         rbuf_TXNID(rb, &root_xid_that_created);
265     }
266 
267     // TODO(leif): get this to default to what's specified, not the
268     // hard-coded default
269     unsigned basementnodesize;
270     basementnodesize = FT_DEFAULT_BASEMENT_NODE_SIZE;
271     uint64_t time_of_last_verification;
272     time_of_last_verification = 0;
273     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_15) {
274         basementnodesize = rbuf_int(rb);
275         time_of_last_verification = rbuf_ulonglong(rb);
276     }
277 
278     STAT64INFO_S on_disk_stats;
279     on_disk_stats = ZEROSTATS;
280     uint64_t time_of_last_optimize_begin;
281     time_of_last_optimize_begin = 0;
282     uint64_t time_of_last_optimize_end;
283     time_of_last_optimize_end = 0;
284     uint32_t count_of_optimize_in_progress;
285     count_of_optimize_in_progress = 0;
286     MSN msn_at_start_of_last_completed_optimize;
287     msn_at_start_of_last_completed_optimize = ZERO_MSN;
288     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_18) {
289         on_disk_stats.numrows = rbuf_ulonglong(rb);
290         on_disk_stats.numbytes = rbuf_ulonglong(rb);
291         ft->in_memory_stats = on_disk_stats;
292         time_of_last_optimize_begin = rbuf_ulonglong(rb);
293         time_of_last_optimize_end = rbuf_ulonglong(rb);
294         count_of_optimize_in_progress = rbuf_int(rb);
295         msn_at_start_of_last_completed_optimize = rbuf_MSN(rb);
296     }
297 
298     enum toku_compression_method compression_method;
299     MSN highest_unused_msn_for_upgrade;
300     highest_unused_msn_for_upgrade.msn = (MIN_MSN.msn - 1);
301     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_19) {
302         unsigned char method = rbuf_char(rb);
303         compression_method = (enum toku_compression_method) method;
304         highest_unused_msn_for_upgrade = rbuf_MSN(rb);
305     } else {
306         // we hard coded zlib until 5.2, then quicklz in 5.2
307         if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
308             compression_method = TOKU_ZLIB_METHOD;
309         } else {
310             compression_method = TOKU_QUICKLZ_METHOD;
311         }
312     }
313 
314     MSN max_msn_in_ft;
315     max_msn_in_ft = ZERO_MSN;  // We'll upgrade it from the root node later if necessary
316     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_21) {
317         max_msn_in_ft = rbuf_MSN(rb);
318     }
319 
320     unsigned fanout;
321     fanout = FT_DEFAULT_FANOUT;
322     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_28) {
323         fanout = rbuf_int(rb);
324     }
325 
326     uint64_t on_disk_logical_rows;
327     on_disk_logical_rows = (uint64_t)-1;
328     if (ft->layout_version_read_from_disk >= FT_LAYOUT_VERSION_29) {
329         on_disk_logical_rows = rbuf_ulonglong(rb);
330     }
331     ft->in_memory_logical_rows = on_disk_logical_rows;
332 
333     (void) rbuf_int(rb); //Read in checksum and ignore (already verified).
334     if (rb->ndone != rb->size) {
335         fprintf(stderr, "Header size did not match contents.\n");
336         r = EINVAL;
337         goto exit;
338     }
339 
340     {
341         struct ft_header h = {
342             .type = FT_CURRENT,
343             .dirty_ = 0,
344             .checkpoint_count = checkpoint_count,
345             .checkpoint_lsn = checkpoint_lsn,
346             .layout_version = FT_LAYOUT_VERSION,
347             .layout_version_original = layout_version_original,
348             .build_id = build_id,
349             .build_id_original = build_id_original,
350             .time_of_creation = time_of_creation,
351             .root_xid_that_created = root_xid_that_created,
352             .time_of_last_modification = time_of_last_modification,
353             .time_of_last_verification = time_of_last_verification,
354             .root_blocknum = root_blocknum,
355             .flags = flags,
356             .nodesize = nodesize,
357             .basementnodesize = basementnodesize,
358             .compression_method = compression_method,
359             .fanout = fanout,
360             .highest_unused_msn_for_upgrade = highest_unused_msn_for_upgrade,
361             .max_msn_in_ft = max_msn_in_ft,
362             .time_of_last_optimize_begin = time_of_last_optimize_begin,
363             .time_of_last_optimize_end = time_of_last_optimize_end,
364             .count_of_optimize_in_progress = count_of_optimize_in_progress,
365             .count_of_optimize_in_progress_read_from_disk = count_of_optimize_in_progress,
366             .msn_at_start_of_last_completed_optimize = msn_at_start_of_last_completed_optimize,
367             .on_disk_stats = on_disk_stats,
368             .on_disk_logical_rows = on_disk_logical_rows
369         };
370         XMEMDUP(ft->h, &h);
371     }
372 
373     if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_18) {
374         // This needs ft->h to be non-null, so we have to do it after we
375         // read everything else.
376         r = toku_upgrade_subtree_estimates_to_stat64info(fd, ft);
377         if (r != 0) {
378             goto exit;
379         }
380     }
381     if (ft->layout_version_read_from_disk < FT_LAYOUT_VERSION_21) {
382         r = toku_upgrade_msn_from_root_to_header(fd, ft);
383         if (r != 0) {
384             goto exit;
385         }
386     }
387 
388     invariant((uint32_t) ft->layout_version_read_from_disk == version);
389     r = deserialize_descriptor_from(fd, &ft->blocktable, &ft->descriptor, version);
390     if (r != 0) {
391         goto exit;
392     }
393 
394     // initialize for svn #4541
395     toku_clone_dbt(&ft->cmp_descriptor.dbt, ft->descriptor.dbt);
396 
397     // Version 13 descriptors had an extra 4 bytes that we don't read
398     // anymore.  Since the header is going to think it's the current
399     // version if it gets written out, we need to write the descriptor in
400     // the new format (without those bytes) before that happens.
401     if (version <= FT_LAYOUT_VERSION_13) {
402         toku_ft_update_descriptor_with_fd(ft, &ft->cmp_descriptor, fd);
403     }
404     r = 0;
405 exit:
406     if (r != 0 && ft != NULL) {
407         toku_free(ft);
408         ft = NULL;
409     }
410     *ftp = ft;
411     return r;
412 }
413 
serialize_ft_min_size(uint32_t version)414 static size_t serialize_ft_min_size(uint32_t version) {
415     size_t size = 0;
416 
417     switch (version) {
418         case FT_LAYOUT_VERSION_29:
419             size += sizeof(uint64_t);  // logrows in ft
420             // fallthrough
421         case FT_LAYOUT_VERSION_28:
422             size += sizeof(uint32_t);  // fanout in ft
423             // fallthrough
424         case FT_LAYOUT_VERSION_27:
425         case FT_LAYOUT_VERSION_26:
426         case FT_LAYOUT_VERSION_25:
427         case FT_LAYOUT_VERSION_24:
428         case FT_LAYOUT_VERSION_23:
429         case FT_LAYOUT_VERSION_22:
430         case FT_LAYOUT_VERSION_21:
431             size += sizeof(MSN);  // max_msn_in_ft
432             // fallthrough
433         case FT_LAYOUT_VERSION_20:
434         case FT_LAYOUT_VERSION_19:
435             size += 1;            // compression method
436             size += sizeof(MSN);  // highest_unused_msn_for_upgrade
437             // fallthrough
438         case FT_LAYOUT_VERSION_18:
439             size += sizeof(uint64_t);  // time_of_last_optimize_begin
440             size += sizeof(uint64_t);  // time_of_last_optimize_end
441             size += sizeof(uint32_t);  // count_of_optimize_in_progress
442             size += sizeof(MSN);  // msn_at_start_of_last_completed_optimize
443             size -= 8;            // removed num_blocks_to_upgrade_14
444             size -= 8;            // removed num_blocks_to_upgrade_13
445             // fallthrough
446         case FT_LAYOUT_VERSION_17:
447             size += 16;
448             invariant(sizeof(STAT64INFO_S) == 16);
449             // fallthrough
450         case FT_LAYOUT_VERSION_16:
451         case FT_LAYOUT_VERSION_15:
452             size += 4;  // basement node size
453             size += 8;  // num_blocks_to_upgrade_14 (previously
454                         // num_blocks_to_upgrade, now one int each for upgrade
455                         // from 13, 14
456             size += 8;  // time of last verification
457             // fallthrough
458         case FT_LAYOUT_VERSION_14:
459             size += 8;  // TXNID that created
460             // fallthrough
461         case FT_LAYOUT_VERSION_13:
462             size += (4  // build_id
463                      +
464                      4  // build_id_original
465                      +
466                      8  // time_of_creation
467                      +
468                      8  // time_of_last_modification
469                      );
470             // fallthrough
471         case FT_LAYOUT_VERSION_12:
472             size += (+8  // "tokudata"
473                      +
474                      4  // version
475                      +
476                      4  // original_version
477                      +
478                      4  // size
479                      +
480                      8  // byte order verification
481                      +
482                      8  // checkpoint_count
483                      +
484                      8  // checkpoint_lsn
485                      +
486                      4  // tree's nodesize
487                      +
488                      8  // translation_size_on_disk
489                      +
490                      8  // translation_address_on_disk
491                      +
492                      4  // checksum
493                      +
494                      8  // Number of blocks in old version.
495                      +
496                      8  // diskoff
497                      +
498                      4  // flags
499                      );
500             break;
501         default:
502             abort();
503     }
504 
505     lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
506     return size;
507 }
508 
deserialize_ft_from_fd_into_rbuf(int fd,toku_off_t offset_of_header,struct rbuf * rb,uint64_t * checkpoint_count,LSN * checkpoint_lsn,uint32_t * version_p)509 int deserialize_ft_from_fd_into_rbuf(int fd,
510                                      toku_off_t offset_of_header,
511                                      struct rbuf *rb,
512                                      uint64_t *checkpoint_count,
513                                      LSN *checkpoint_lsn,
514                                      uint32_t *version_p)
515 // Effect: Read and parse the header of a fractalal tree
516 //
517 //  Simply reading the raw bytes of the header into an rbuf is insensitive
518 //  to disk format version.  If that ever changes, then modify this.
519 //
520 //  TOKUDB_DICTIONARY_NO_HEADER means we can overwrite everything in the
521 //  file AND the header is useless
522 {
523     int r = 0;
524     const int64_t prefix_size = 8 +  // magic ("tokudata")
525                                 4 +  // version
526                                 4 +  // build_id
527                                 4;   // size
528     const int64_t read_size = roundup_to_multiple(512, prefix_size);
529     unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
530     rb->buf = NULL;
531     int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
532     if (n != read_size) {
533         if (n == 0) {
534             r = TOKUDB_DICTIONARY_NO_HEADER;
535         } else if (n < 0) {
536             r = get_error_errno();
537         } else {
538             r = EINVAL;
539         }
540         toku_free(prefix);
541         goto exit;
542     }
543 
544     rbuf_init(rb, prefix, prefix_size);
545 
546     // Check magic number
547     const void *magic;
548     rbuf_literal_bytes(rb, &magic, 8);
549     if (memcmp(magic, "tokudata", 8) != 0) {
550         if ((*(uint64_t *)magic) == 0) {
551             r = TOKUDB_DICTIONARY_NO_HEADER;
552         } else {
553             r = EINVAL;  // Not a tokudb file! Do not use.
554         }
555         goto exit;
556     }
557 
558     // Version MUST be in network order regardless of disk order.
559     uint32_t version;
560     version = rbuf_network_int(rb);
561     *version_p = version;
562     if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
563         r = TOKUDB_DICTIONARY_TOO_OLD;  // Cannot use
564         goto exit;
565     } else if (version > FT_LAYOUT_VERSION) {
566         r = TOKUDB_DICTIONARY_TOO_NEW;  // Cannot use
567         goto exit;
568     }
569 
570     // build_id MUST be in network order regardless of disk order.
571     uint32_t build_id __attribute__((__unused__));
572     build_id = rbuf_network_int(rb);
573     int64_t min_header_size;
574     min_header_size = serialize_ft_min_size(version);
575 
576     // Size MUST be in network order regardless of disk order.
577     uint32_t size;
578     size = rbuf_network_int(rb);
579     // If too big, it is corrupt.  We would probably notice during checksum
580     // but may have to do a multi-gigabyte malloc+read to find out.
581     // If its too small reading rbuf would crash, so verify.
582     if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE ||
583         size < min_header_size) {
584         r = TOKUDB_DICTIONARY_NO_HEADER;
585         goto exit;
586     }
587 
588     lazy_assert(rb->ndone == prefix_size);
589     rb->size = size;
590     {
591         toku_free(rb->buf);
592         uint32_t size_to_read = roundup_to_multiple(512, size);
593         XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
594 
595         invariant(offset_of_header % 512 == 0);
596         n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
597         if (n != size_to_read) {
598             if (n < 0) {
599                 r = get_error_errno();
600             } else {
601                 r = EINVAL;  // Header might be useless (wrong size) or could be
602                              // a disk read error.
603             }
604             goto exit;
605         }
606     }
607     // It's version 14 or later.  Magic looks OK.
608     // We have an rbuf that represents the header.
609     // Size is within acceptable bounds.
610 
611     // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function
612     // changed)
613     uint32_t calculated_x1764;
614     calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4);
615     uint32_t stored_x1764;
616     stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4));
617     if (calculated_x1764 != stored_x1764) {
618         r = TOKUDB_BAD_CHECKSUM;  // Header useless
619         fprintf(stderr,
620                 "Header checksum failure: calc=0x%08x read=0x%08x\n",
621                 calculated_x1764,
622                 stored_x1764);
623         goto exit;
624     }
625 
626     // Verify byte order
627     const void *tmp_byte_order_check;
628     lazy_assert((sizeof toku_byte_order_host) == 8);
629     rbuf_literal_bytes(
630         rb, &tmp_byte_order_check, 8);  // Must not translate byte order
631     int64_t byte_order_stored;
632     byte_order_stored = *(int64_t *)tmp_byte_order_check;
633     if (byte_order_stored != toku_byte_order_host) {
634         r = TOKUDB_DICTIONARY_NO_HEADER;  // Cannot use dictionary
635         goto exit;
636     }
637 
638     // Load checkpoint count
639     *checkpoint_count = rbuf_ulonglong(rb);
640     *checkpoint_lsn = rbuf_LSN(rb);
641     // Restart at beginning during regular deserialization
642     rb->ndone = 0;
643 
644 exit:
645     if (r != 0 && rb->buf != NULL) {
646         toku_free(rb->buf);
647         rb->buf = NULL;
648     }
649     return r;
650 }
651 
652 // Read ft from file into struct.  Read both headers and use one.
653 // We want the latest acceptable header whose checkpoint_lsn is no later
654 // than max_acceptable_lsn.
// Diagnostic dump, to stderr, of the header-selection state of
// toku_deserialize_ft_from().  The macro expands in the caller's scope and
// reads these names from it: fn, r, r0, r1, max_acceptable_lsn,
// checkpoint_lsn_0, checkpoint_lsn_1, checkpoint_count_0, checkpoint_count_1.
//
// The 64-bit values are cast to unsigned long long and printed with %llu so
// that the conversion specifiers match their arguments on every ABI, not
// only where uint64_t happens to be unsigned long.
// The expansion is a single expression with no trailing semicolon; invoke
// it as an ordinary statement: dump_state_of_toku_deserialize_ft_from();
#define dump_state_of_toku_deserialize_ft_from() \
    fprintf(stderr, \
            "%s:%d toku_deserialize_ft_from: " \
            "filename[%s] " \
            "r[%d] max_acceptable_lsn[%llu] " \
            "r0[%d] checkpoint_lsn_0[%llu] checkpoint_count_0[%llu] " \
            "r1[%d] checkpoint_lsn_1[%llu] checkpoint_count_1[%llu]\n", \
            __FILE__, \
            __LINE__, \
            fn, \
            r, \
            (unsigned long long) max_acceptable_lsn.lsn, \
            r0, \
            (unsigned long long) checkpoint_lsn_0.lsn, \
            (unsigned long long) checkpoint_count_0, \
            r1, \
            (unsigned long long) checkpoint_lsn_1.lsn, \
            (unsigned long long) checkpoint_count_1)
673 
toku_deserialize_ft_from(int fd,const char * fn,LSN max_acceptable_lsn,FT * ft)674 int toku_deserialize_ft_from(int fd,
675                              const char *fn,
676                              LSN max_acceptable_lsn,
677                              FT *ft) {
678     struct rbuf rb_0;
679     struct rbuf rb_1;
680     uint64_t checkpoint_count_0 = 0;
681     uint64_t checkpoint_count_1 = 0;
682     LSN checkpoint_lsn_0;
683     LSN checkpoint_lsn_1;
684     uint32_t version_0 = 0, version_1 = 0, version = 0;
685     bool h0_acceptable = false;
686     bool h1_acceptable = false;
687     struct rbuf *rb = NULL;
688     int r0, r1, r = 0;
689 
690     toku_off_t header_0_off = 0;
691     r0 = deserialize_ft_from_fd_into_rbuf(fd,
692                                           header_0_off,
693                                           &rb_0,
694                                           &checkpoint_count_0,
695                                           &checkpoint_lsn_0,
696                                           &version_0);
697     if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
698         h0_acceptable = true;
699     }
700 
701     toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
702     r1 = deserialize_ft_from_fd_into_rbuf(fd,
703                                           header_1_off,
704                                           &rb_1,
705                                           &checkpoint_count_1,
706                                           &checkpoint_lsn_1,
707                                           &version_1);
708     if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
709         h1_acceptable = true;
710     }
711 
712     // if either header is too new, the dictionary is unreadable
713     if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW ||
714         !(h0_acceptable || h1_acceptable)) {
715         // We were unable to read either header or at least one is too
716         // new.  Certain errors are higher priority than others. Order of
717         // these if/else if is important.
718         if (r0 == TOKUDB_DICTIONARY_TOO_NEW ||
719             r1 == TOKUDB_DICTIONARY_TOO_NEW) {
720             r = TOKUDB_DICTIONARY_TOO_NEW;
721         } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD ||
722                    r1 == TOKUDB_DICTIONARY_TOO_OLD) {
723             r = TOKUDB_DICTIONARY_TOO_OLD;
724         } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
725             fprintf(stderr, "Both header checksums failed.\n");
726             r = TOKUDB_BAD_CHECKSUM;
727         } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER ||
728                    r1 == TOKUDB_DICTIONARY_NO_HEADER) {
729             r = TOKUDB_DICTIONARY_NO_HEADER;
730         } else {
731             r = r0 ? r0 : r1;  // Arbitrarily report the error from the
732             // first header, unless it's readable
733         }
734 
735         if (r != TOKUDB_DICTIONARY_NO_HEADER) {
736             dump_state_of_toku_deserialize_ft_from();
737         }
738 
739         // it should not be possible for both headers to be later than the
740         // max_acceptable_lsn
741         invariant(
742             !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
743               (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
744         invariant(r != 0);
745         goto exit;
746     }
747 
748     if (h0_acceptable && h1_acceptable) {
749         if (checkpoint_count_0 > checkpoint_count_1) {
750             if (!(checkpoint_count_0 == checkpoint_count_1 + 1) ||
751                 !(version_0 >= version_1)) {
752                 dump_state_of_toku_deserialize_ft_from();
753             }
754             invariant(checkpoint_count_0 == checkpoint_count_1 + 1);
755             invariant(version_0 >= version_1);
756             rb = &rb_0;
757             version = version_0;
758         } else {
759             if (!(checkpoint_count_1 == checkpoint_count_0 + 1) ||
760                 !(version_1 >= version_0)) {
761                 dump_state_of_toku_deserialize_ft_from();
762             }
763             invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
764             invariant(version_1 >= version_0);
765             rb = &rb_1;
766             version = version_1;
767         }
768     } else if (h0_acceptable) {
769         if (r1 == TOKUDB_BAD_CHECKSUM) {
770             // print something reassuring
771             fprintf(
772                 stderr,
773                 "Header 2 checksum failed, but header 1 ok.  Proceeding.\n");
774             dump_state_of_toku_deserialize_ft_from();
775         }
776         rb = &rb_0;
777         version = version_0;
778     } else if (h1_acceptable) {
779         if (r0 == TOKUDB_BAD_CHECKSUM) {
780             // print something reassuring
781             fprintf(
782                 stderr,
783                 "Header 1 checksum failed, but header 2 ok.  Proceeding.\n");
784             dump_state_of_toku_deserialize_ft_from();
785         }
786         rb = &rb_1;
787         version = version_1;
788     }
789 
790     if (!rb) {
791         dump_state_of_toku_deserialize_ft_from();
792     }
793     paranoid_invariant(rb);
794     r = deserialize_ft_versioned(fd, rb, ft, version);
795 
796 exit:
797     if (rb_0.buf) {
798         toku_free(rb_0.buf);
799     }
800     if (rb_1.buf) {
801         toku_free(rb_1.buf);
802     }
803     return r;
804 }
805 
toku_serialize_ft_size(FT_HEADER h)806 size_t toku_serialize_ft_size(FT_HEADER h) {
807     size_t size = serialize_ft_min_size(h->layout_version);
808     // There is no dynamic data.
809     lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
810     return size;
811 }
812 
toku_serialize_ft_to_wbuf(struct wbuf * wbuf,FT_HEADER h,DISKOFF translation_location_on_disk,DISKOFF translation_size_on_disk)813 void toku_serialize_ft_to_wbuf (
814     struct wbuf *wbuf,
815     FT_HEADER h,
816     DISKOFF translation_location_on_disk,
817     DISKOFF translation_size_on_disk
818     )
819 {
820     wbuf_literal_bytes(wbuf, "tokudata", 8);
821     wbuf_network_int  (wbuf, h->layout_version); //MUST be in network order regardless of disk order
822     wbuf_network_int  (wbuf, BUILD_ID); //MUST be in network order regardless of disk order
823     wbuf_network_int  (wbuf, wbuf->size); //MUST be in network order regardless of disk order
824     wbuf_literal_bytes(wbuf, &toku_byte_order_host, 8); //Must not translate byte order
825     wbuf_ulonglong(wbuf, h->checkpoint_count);
826     wbuf_LSN    (wbuf, h->checkpoint_lsn);
827     wbuf_int    (wbuf, h->nodesize);
828 
829     wbuf_DISKOFF(wbuf, translation_location_on_disk);
830     wbuf_DISKOFF(wbuf, translation_size_on_disk);
831     wbuf_BLOCKNUM(wbuf, h->root_blocknum);
832     wbuf_int(wbuf, h->flags);
833     wbuf_int(wbuf, h->layout_version_original);
834     wbuf_int(wbuf, h->build_id_original);
835     wbuf_ulonglong(wbuf, h->time_of_creation);
836     wbuf_ulonglong(wbuf, h->time_of_last_modification);
837     wbuf_TXNID(wbuf, h->root_xid_that_created);
838     wbuf_int(wbuf, h->basementnodesize);
839     wbuf_ulonglong(wbuf, h->time_of_last_verification);
840     wbuf_ulonglong(wbuf, h->on_disk_stats.numrows);
841     wbuf_ulonglong(wbuf, h->on_disk_stats.numbytes);
842     wbuf_ulonglong(wbuf, h->time_of_last_optimize_begin);
843     wbuf_ulonglong(wbuf, h->time_of_last_optimize_end);
844     wbuf_int(wbuf, h->count_of_optimize_in_progress);
845     wbuf_MSN(wbuf, h->msn_at_start_of_last_completed_optimize);
846     wbuf_char(wbuf, (unsigned char) h->compression_method);
847     wbuf_MSN(wbuf, h->highest_unused_msn_for_upgrade);
848     wbuf_MSN(wbuf, h->max_msn_in_ft);
849     wbuf_int(wbuf, h->fanout);
850     wbuf_ulonglong(wbuf, h->on_disk_logical_rows);
851     uint32_t checksum = toku_x1764_finish(&wbuf->checksum);
852     wbuf_int(wbuf, checksum);
853     lazy_assert(wbuf->ndone == wbuf->size);
854 }
855 
toku_serialize_ft_to(int fd,FT_HEADER h,block_table * bt,CACHEFILE cf)856 void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
857     lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS);
858     struct wbuf w_translation;
859     int64_t size_translation;
860     int64_t address_translation;
861 
862     // Must serialize translation first, to get address,size for header.
863     bt->serialize_translation_to_wbuf(
864         fd, &w_translation, &address_translation, &size_translation);
865     invariant(size_translation == w_translation.ndone);
866 
867     // the number of bytes available in the buffer is 0 mod 512, and those last
868     // bytes are all initialized.
869     invariant(w_translation.size % 512 == 0);
870 
871     struct wbuf w_main;
872     size_t size_main = toku_serialize_ft_size(h);
873     size_t size_main_aligned = roundup_to_multiple(512, size_main);
874     invariant(size_main_aligned <
875               BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
876     char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
877     for (size_t i = size_main; i < size_main_aligned; i++)
878         mainbuf[i] = 0;  // initialize the end of the buffer with zeros
879     wbuf_init(&w_main, mainbuf, size_main);
880     toku_serialize_ft_to_wbuf(
881         &w_main, h, address_translation, size_translation);
882     lazy_assert(w_main.ndone == size_main);
883 
884     // Actually write translation table
885     // This write is guaranteed to read good data at the end of the buffer,
886     // since the
887     // w_translation.buf is padded with zeros to a 512-byte boundary.
888     toku_os_full_pwrite(fd,
889                         w_translation.buf,
890                         roundup_to_multiple(512, size_translation),
891                         address_translation);
892 
893     // Everything but the header MUST be on disk before header starts.
894     // Otherwise we will think the header is good and some blocks might not
895     // yet be on disk.
896     // If the header has a cachefile we need to do cachefile fsync (to
897     // prevent crash if we redirected to dev null)
898     // If there is no cachefile we still need to do an fsync.
899     if (cf) {
900         toku_cachefile_fsync(cf);
901     } else {
902         toku_file_fsync(fd);
903     }
904 
905     // Alternate writing header to two locations:
906     //   Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
907     toku_off_t main_offset;
908     main_offset = (h->checkpoint_count & 0x1)
909                       ? 0
910                       : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
911     toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
912     toku_free(w_main.buf);
913     toku_free(w_translation.buf);
914 }
915