1 /*****************************************************************************
2
3 Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2012, Facebook Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6
7 This program is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free Software
9 Foundation; version 2 of the License.
10
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License along with
16 this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
18
19 *****************************************************************************/
20
21 /**************************************************//**
22 @file page/page0zip.cc
23 Compressed page interface
24
25 Created June 2005 by Marko Makela
26 *******************************************************/
27
28 #include "page0zip.h"
29 #include "fsp0types.h"
30 #include "page0page.h"
31 #include "buf0checksum.h"
32 #include "ut0crc32.h"
33 #include "zlib.h"
34 #include "span.h"
35
36 using st_::span;
37
38 #ifndef UNIV_INNOCHECKSUM
39 #include "mtr0log.h"
40 #include "dict0dict.h"
41 #include "btr0cur.h"
42 #include "log0recv.h"
43 #include "row0row.h"
44 #include "btr0sea.h"
45 #include "dict0boot.h"
46 #include "lock0lock.h"
47 #include "srv0srv.h"
48 #include "buf0lru.h"
49 #include "srv0mon.h"
50
51 #include <map>
52 #include <algorithm>
53
/** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
page_zip_stat_t		page_zip_stat[PAGE_ZIP_SSIZE_MAX];
/** Statistics on compression, indexed by index->id */
page_zip_stat_per_index_t	page_zip_stat_per_index;

/** Compression level to be used by zlib. Settable by user. */
uint	page_zip_level;

/* Please refer to ../include/page0zip.ic for a description of the
compressed page format. */

/* The infimum and supremum records are omitted from the compressed page.
On compress, we compare that the records are there, and on uncompress we
restore the records. */
/** Extra bytes of an infimum record */
static const byte infimum_extra[] = {
	0x01,			/* info_bits=0, n_owned=1 */
	0x00, 0x02		/* heap_no=0, status=2 */
	/* ?, ? */		/* next=(first user rec, or supremum) */
};
/** Data bytes of an infimum record */
static const byte infimum_data[] = {
	0x69, 0x6e, 0x66, 0x69,
	0x6d, 0x75, 0x6d, 0x00	/* "infimum\0" */
};
/** Extra bytes and data bytes of a supremum record */
static const byte supremum_extra_data alignas(4) [] = {
	/* 0x0?, */		/* info_bits=0, n_owned=1..8 */
	0x00, 0x0b,		/* heap_no=1, status=3 */
	0x00, 0x00,		/* next=0 */
	0x73, 0x75, 0x70, 0x72,
	0x65, 0x6d, 0x75, 0x6d	/* "supremum" */
};

/** Assert that a block of memory is filled with zero bytes.
@param b in: memory block
@param s in: size of the memory block, in bytes */
#define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
/** Assert that a BLOB pointer is filled with zero bytes.
@param b in: BLOB pointer */
#define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
95
/* Enable some extra debugging output. This code can be enabled
independently of any UNIV_ debugging conditions. */
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
# include <stdarg.h>
MY_ATTRIBUTE((format (printf, 1, 2)))
/**********************************************************************//**
Report a failure to decompress or compress.
Prints a timestamped diagnostic line to stderr.
@return number of characters printed */
static
int
page_zip_fail_func(
/*===============*/
	const char*	fmt,	/*!< in: printf(3) format string */
	...)			/*!< in: arguments corresponding to fmt */
{
	int		res;
	va_list		ap;

	ut_print_timestamp(stderr);
	fputs("  InnoDB: ", stderr);
	va_start(ap, fmt);
	res = vfprintf(stderr, fmt, ap);
	va_end(ap);

	return(res);
}
/** Wrapper for page_zip_fail_func()
@param fmt_args in: printf(3) format string and arguments */
# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
/** Dummy wrapper for page_zip_fail_func()
@param fmt_args ignored: printf(3) format string and arguments */
# define page_zip_fail(fmt_args) /* empty */
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
130
131 /**********************************************************************//**
132 Determine the guaranteed free space on an empty page.
133 @return minimum payload size on the page */
134 ulint
page_zip_empty_size(ulint n_fields,ulint zip_size)135 page_zip_empty_size(
136 /*================*/
137 ulint n_fields, /*!< in: number of columns in the index */
138 ulint zip_size) /*!< in: compressed page size in bytes */
139 {
140 ulint size = zip_size
141 /* subtract the page header and the longest
142 uncompressed data needed for one record */
143 - (PAGE_DATA
144 + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE
145 + 1/* encoded heap_no==2 in page_zip_write_rec() */
146 + 1/* end of modification log */
147 - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
148 /* subtract the space for page_zip_fields_encode() */
149 - compressBound(static_cast<uLong>(2 * (n_fields + 1)));
150 return(lint(size) > 0 ? size : 0);
151 }
152
153 /** Check whether a tuple is too big for compressed table
154 @param[in] index dict index object
155 @param[in] entry entry for the index
156 @return true if it's too big, otherwise false */
157 bool
page_zip_is_too_big(const dict_index_t * index,const dtuple_t * entry)158 page_zip_is_too_big(
159 const dict_index_t* index,
160 const dtuple_t* entry)
161 {
162 const ulint zip_size = index->table->space->zip_size();
163
164 /* Estimate the free space of an empty compressed page.
165 Subtract one byte for the encoded heap_no in the
166 modification log. */
167 ulint free_space_zip = page_zip_empty_size(
168 index->n_fields, zip_size);
169 ulint n_uniq = dict_index_get_n_unique_in_tree(index);
170
171 ut_ad(dict_table_is_comp(index->table));
172 ut_ad(zip_size);
173
174 if (free_space_zip == 0) {
175 return(true);
176 }
177
178 /* Subtract one byte for the encoded heap_no in the
179 modification log. */
180 free_space_zip--;
181
182 /* There should be enough room for two node pointer
183 records on an empty non-leaf page. This prevents
184 infinite page splits. */
185
186 if (entry->n_fields >= n_uniq
187 && (REC_NODE_PTR_SIZE
188 + rec_get_converted_size_comp_prefix(
189 index, entry->fields, n_uniq, NULL)
190 /* On a compressed page, there is
191 a two-byte entry in the dense
192 page directory for every record.
193 But there is no record header. */
194 - (REC_N_NEW_EXTRA_BYTES - 2)
195 > free_space_zip / 2)) {
196 return(true);
197 }
198
199 return(false);
200 }
201
202 /*************************************************************//**
203 Gets the number of elements in the dense page directory,
204 including deleted records (the free list).
205 @return number of elements in the dense page directory */
206 UNIV_INLINE
207 ulint
page_zip_dir_elems(const page_zip_des_t * page_zip)208 page_zip_dir_elems(
209 /*===============*/
210 const page_zip_des_t* page_zip) /*!< in: compressed page */
211 {
212 /* Exclude the page infimum and supremum from the record count. */
213 return ulint(page_dir_get_n_heap(page_zip->data))
214 - PAGE_HEAP_NO_USER_LOW;
215 }
216
217 /*************************************************************//**
218 Gets the size of the compressed page trailer (the dense page directory),
219 including deleted records (the free list).
220 @return length of dense page directory, in bytes */
221 UNIV_INLINE
222 ulint
page_zip_dir_size(const page_zip_des_t * page_zip)223 page_zip_dir_size(
224 /*==============*/
225 const page_zip_des_t* page_zip) /*!< in: compressed page */
226 {
227 return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
228 }
229
230 /*************************************************************//**
231 Gets an offset to the compressed page trailer (the dense page directory),
232 including deleted records (the free list).
233 @return offset of the dense page directory */
234 UNIV_INLINE
235 ulint
page_zip_dir_start_offs(const page_zip_des_t * page_zip,ulint n_dense)236 page_zip_dir_start_offs(
237 /*====================*/
238 const page_zip_des_t* page_zip, /*!< in: compressed page */
239 ulint n_dense) /*!< in: directory size */
240 {
241 ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
242
243 return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
244 }
245
/*************************************************************//**
Gets a pointer to the compressed page trailer (the dense page directory),
including deleted records (the free list).
@param[in] page_zip compressed page
@param[in] n_dense number of entries in the directory
@return pointer to the dense page directory */
#define page_zip_dir_start_low(page_zip, n_dense) \
	((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
/*************************************************************//**
Gets a pointer to the compressed page trailer (the dense page directory),
including deleted records (the free list).
@param[in] page_zip compressed page
@return pointer to the dense page directory */
#define page_zip_dir_start(page_zip) \
	page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
261
262 /*************************************************************//**
263 Gets the size of the compressed page trailer (the dense page directory),
264 only including user records (excluding the free list).
265 @return length of dense page directory comprising existing records, in bytes */
266 UNIV_INLINE
267 ulint
page_zip_dir_user_size(const page_zip_des_t * page_zip)268 page_zip_dir_user_size(
269 /*===================*/
270 const page_zip_des_t* page_zip) /*!< in: compressed page */
271 {
272 ulint size = PAGE_ZIP_DIR_SLOT_SIZE
273 * ulint(page_get_n_recs(page_zip->data));
274 ut_ad(size <= page_zip_dir_size(page_zip));
275 return(size);
276 }
277
278 /*************************************************************//**
279 Find the slot of the given record in the dense page directory.
280 @return dense directory slot, or NULL if record not found */
281 UNIV_INLINE
282 byte*
page_zip_dir_find_low(byte * slot,byte * end,ulint offset)283 page_zip_dir_find_low(
284 /*==================*/
285 byte* slot, /*!< in: start of records */
286 byte* end, /*!< in: end of records */
287 ulint offset) /*!< in: offset of user record */
288 {
289 ut_ad(slot <= end);
290
291 for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
292 if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
293 == offset) {
294 return(slot);
295 }
296 }
297
298 return(NULL);
299 }
300
301 /*************************************************************//**
302 Find the slot of the given non-free record in the dense page directory.
303 @return dense directory slot, or NULL if record not found */
304 UNIV_INLINE
305 byte*
page_zip_dir_find(page_zip_des_t * page_zip,ulint offset)306 page_zip_dir_find(
307 /*==============*/
308 page_zip_des_t* page_zip, /*!< in: compressed page */
309 ulint offset) /*!< in: offset of user record */
310 {
311 byte* end = page_zip->data + page_zip_get_size(page_zip);
312
313 ut_ad(page_zip_simple_validate(page_zip));
314
315 return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
316 end,
317 offset));
318 }
319
320 /*************************************************************//**
321 Find the slot of the given free record in the dense page directory.
322 @return dense directory slot, or NULL if record not found */
323 UNIV_INLINE
324 byte*
page_zip_dir_find_free(page_zip_des_t * page_zip,ulint offset)325 page_zip_dir_find_free(
326 /*===================*/
327 page_zip_des_t* page_zip, /*!< in: compressed page */
328 ulint offset) /*!< in: offset of user record */
329 {
330 byte* end = page_zip->data + page_zip_get_size(page_zip);
331
332 ut_ad(page_zip_simple_validate(page_zip));
333
334 return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
335 end - page_zip_dir_user_size(page_zip),
336 offset));
337 }
338
339 /*************************************************************//**
340 Read a given slot in the dense page directory.
341 @return record offset on the uncompressed page, possibly ORed with
342 PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
343 UNIV_INLINE
344 ulint
page_zip_dir_get(const page_zip_des_t * page_zip,ulint slot)345 page_zip_dir_get(
346 /*=============*/
347 const page_zip_des_t* page_zip, /*!< in: compressed page */
348 ulint slot) /*!< in: slot
349 (0=first user record) */
350 {
351 ut_ad(page_zip_simple_validate(page_zip));
352 ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
353 return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
354 - PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
355 }
356
/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in]	b	ROW_FORMAT=COMPRESSED index page
@param[in]	offset	byte offset from b.zip.data
@param[in]	len	length of the data to write */
inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
{
  /* Only index pages are stored in ROW_FORMAT=COMPRESSED format. */
  ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
        fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
  ut_ad(page_zip_simple_validate(&b.page.zip));
  ut_ad(offset + len <= page_zip_get_size(&b.page.zip));

  /* Log the bytes that were already written to the compressed frame
  at [offset, offset + len), and remember where the write ended so
  that a subsequent record may be encoded relative to it. */
  memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
  m_last_offset= static_cast<uint16_t>(offset + len);
}
371
/** Write a byte string to a ROW_FORMAT=COMPRESSED page.
@param[in]	b	ROW_FORMAT=COMPRESSED index page
@param[in]	dest	destination within b.zip.data
@param[in]	str	the data to write
@param[in]	len	length of the data to write
@tparam	w	write request type */
template<mtr_t::write_type w>
inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
                           ulint len)
{
  byte *d= static_cast<byte*>(dest);
  const byte *s= static_cast<const byte*>(str);
  /* The destination must be within the page payload, past the header. */
  ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
  if (w != FORCED)
  {
    /* Trim the leading bytes that already match, so that only the
    bytes that actually change are copied and logged. */
    ut_ad(len);
    const byte *const end= d + len;
    while (*d++ == *s++)
    {
      if (d == end)
      {
        /* Everything matched already; only a MAYBE_NOP request may
        be dropped entirely. */
        ut_ad(w == MAYBE_NOP);
        return;
      }
    }
    /* Step back to the first mismatching byte. */
    s--;
    d--;
    len= static_cast<ulint>(end - d);
  }
  ::memcpy(d, s, len);
  /* Log the modified range via the offset-based overload. */
  zmemcpy(b, d - b.page.zip.data, len);
}
404
/** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
@param[in,out]	block	ROW_FORMAT=COMPRESSED index page
@param[in]	index	the index that the block belongs to
@param[in,out]	mtr	mini-transaction */
static void page_zip_compress_write_log(buf_block_t *block,
                                        dict_index_t *index, mtr_t *mtr)
{
  ut_ad(!index->is_ibuf());

  if (mtr->get_log_mode() != MTR_LOG_ALL)
  {
    /* Nothing to write unless full redo logging is enabled. */
    ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
          mtr->get_log_mode() == MTR_LOG_NO_REDO);
    return;
  }

  const page_t *page= block->frame;
  const page_zip_des_t *page_zip= &block->page.zip;
  /* Read the number of user records. */
  ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
    PAGE_HEAP_NO_USER_LOW;
  /* Multiply by the number of uncompressed bytes stored per record
  in the page trailer: a directory slot for every record, plus any
  data that is kept outside the compressed stream. */
  if (!page_is_leaf(page))
    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
  else if (index->is_clust())
    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
      DATA_ROLL_PTR_LEN;
  else
    trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
  /* Add the space occupied by BLOB pointers. */
  trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
  ut_a(page_zip->m_end > PAGE_DATA);
  compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
  /* The compressed stream and the trailer must not overlap. */
  ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));

  mtr->init(block);
  /* Log the page header and the compressed stream,
  [FIL_PAGE_PREV, m_end). */
  mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);

  if (trailer_size)
    /* Log the uncompressed trailer at the end of the page. */
    mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
                 trailer_size);
  block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
}
448
449 /******************************************************//**
450 Determine how many externally stored columns are contained
451 in existing records with smaller heap_no than rec. */
452 static
453 ulint
page_zip_get_n_prev_extern(const page_zip_des_t * page_zip,const rec_t * rec,const dict_index_t * index)454 page_zip_get_n_prev_extern(
455 /*=======================*/
456 const page_zip_des_t* page_zip,/*!< in: dense page directory on
457 compressed page */
458 const rec_t* rec, /*!< in: compact physical record
459 on a B-tree leaf page */
460 const dict_index_t* index) /*!< in: record descriptor */
461 {
462 const page_t* page = page_align(rec);
463 ulint n_ext = 0;
464 ulint i;
465 ulint left;
466 ulint heap_no;
467 ulint n_recs = page_get_n_recs(page_zip->data);
468
469 ut_ad(page_is_leaf(page));
470 ut_ad(page_is_comp(page));
471 ut_ad(dict_table_is_comp(index->table));
472 ut_ad(dict_index_is_clust(index));
473 ut_ad(!dict_index_is_ibuf(index));
474
475 heap_no = rec_get_heap_no_new(rec);
476 ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
477 left = heap_no - PAGE_HEAP_NO_USER_LOW;
478 if (UNIV_UNLIKELY(!left)) {
479 return(0);
480 }
481
482 for (i = 0; i < n_recs; i++) {
483 const rec_t* r = page + (page_zip_dir_get(page_zip, i)
484 & PAGE_ZIP_DIR_SLOT_MASK);
485
486 if (rec_get_heap_no_new(r) < heap_no) {
487 n_ext += rec_get_n_extern_new(r, index,
488 ULINT_UNDEFINED);
489 if (!--left) {
490 break;
491 }
492 }
493 }
494
495 return(n_ext);
496 }
497
498 /**********************************************************************//**
499 Encode the length of a fixed-length column.
500 @return buf + length of encoded val */
501 static
502 byte*
page_zip_fixed_field_encode(byte * buf,ulint val)503 page_zip_fixed_field_encode(
504 /*========================*/
505 byte* buf, /*!< in: pointer to buffer where to write */
506 ulint val) /*!< in: value to write */
507 {
508 ut_ad(val >= 2);
509
510 if (UNIV_LIKELY(val < 126)) {
511 /*
512 0 = nullable variable field of at most 255 bytes length;
513 1 = not null variable field of at most 255 bytes length;
514 126 = nullable variable field with maximum length >255;
515 127 = not null variable field with maximum length >255
516 */
517 *buf++ = (byte) val;
518 } else {
519 *buf++ = (byte) (0x80 | val >> 8);
520 *buf++ = (byte) val;
521 }
522
523 return(buf);
524 }
525
/**********************************************************************//**
Write the index information for the compressed page.
Emits one encoded length per column (with runs of adjacent
non-nullable fixed-length columns merged into a single entry),
followed by either the position of the trx_id column (leaf pages
of a clustered index) or the number of nullable fields.
@return used size of buf */
ulint
page_zip_fields_encode(
/*===================*/
	ulint			n,	/*!< in: number of fields
					to compress */
	const dict_index_t*	index,	/*!< in: index comprising
					at least n fields */
	ulint			trx_id_pos,
					/*!< in: position of the trx_id column
					in the index, or ULINT_UNDEFINED if
					this is a non-leaf page */
	byte*			buf)	/*!< out: buffer of (n + 1) * 2 bytes */
{
	const byte*	buf_start	= buf;
	ulint		i;
	ulint		col;
	ulint		trx_id_col	= 0;
	/* sum of lengths of preceding non-nullable fixed fields, or 0 */
	ulint		fixed_sum	= 0;

	ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);

	for (i = col = 0; i < n; i++) {
		dict_field_t*	field = dict_index_get_nth_field(index, i);
		ulint		val;

		if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
			val = 1;	/* set the "not nullable" flag */
		} else {
			val = 0;	/* nullable field */
		}

		if (!field->fixed_len) {
			/* variable-length field */
			const dict_col_t*	column
				= dict_field_get_col(field);

			if (DATA_BIG_COL(column)) {
				val |= 0x7e;	/* max > 255 bytes */
			}

			if (fixed_sum) {
				/* write out the length of any
				preceding non-nullable fields */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				fixed_sum = 0;
				col++;
			}

			*buf++ = (byte) val;
			col++;
		} else if (val) {
			/* fixed-length non-nullable field */

			if (fixed_sum && UNIV_UNLIKELY
			    (fixed_sum + field->fixed_len
			     > DICT_MAX_FIXED_COL_LEN)) {
				/* Write out the length of the
				preceding non-nullable fields,
				to avoid exceeding the maximum
				length of a fixed-length column. */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				fixed_sum = 0;
				col++;
			}

			if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
				if (fixed_sum) {
					/* Write out the length of any
					preceding non-nullable fields,
					and start a new trx_id column. */
					buf = page_zip_fixed_field_encode(
						buf, fixed_sum << 1 | 1);
					col++;
				}

				/* Remember the encoded position of
				DB_TRX_ID for the final entry below. */
				trx_id_col = col;
				fixed_sum = field->fixed_len;
			} else {
				/* add to the sum */
				fixed_sum += field->fixed_len;
			}
		} else {
			/* fixed-length nullable field */

			if (fixed_sum) {
				/* write out the length of any
				preceding non-nullable fields */
				buf = page_zip_fixed_field_encode(
					buf, fixed_sum << 1 | 1);
				fixed_sum = 0;
				col++;
			}

			/* Low bit 0 marks a nullable fixed-length field. */
			buf = page_zip_fixed_field_encode(
				buf, ulint(field->fixed_len) << 1);
			col++;
		}
	}

	if (fixed_sum) {
		/* Write out the lengths of last fixed-length columns. */
		buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
	}

	if (trx_id_pos != ULINT_UNDEFINED) {
		/* Write out the position of the trx_id column */
		i = trx_id_col;
	} else {
		/* Write out the number of nullable fields */
		i = index->n_nullable;
	}

	if (i < 128) {
		*buf++ = (byte) i;
	} else {
		/* Two-byte encoding with the high bit set. */
		*buf++ = (byte) (0x80 | i >> 8);
		*buf++ = (byte) i;
	}

	ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
	return((ulint) (buf - buf_start));
}
654
/**********************************************************************//**
Populate the dense page directory from the sparse directory.
Walks the records first in collation order and then along the free
list, storing each record's offset (with the "owned" and "deleted"
flags) in the dense directory that grows downwards from buf. */
static
void
page_zip_dir_encode(
/*================*/
	const page_t*	page,	/*!< in: compact page */
	byte*		buf,	/*!< in: pointer to dense page directory[-1];
				out: dense directory on compressed page */
	const rec_t**	recs)	/*!< in: pointer to an array of 0, or NULL;
				out: dense page directory sorted by ascending
				address (and heap_no) */
{
	const byte*	rec;
	ulint		status;
	ulint		min_mark;
	ulint		heap_no;
	ulint		i;
	ulint		n_heap;
	ulint		offs;

	min_mark = 0;

	if (page_is_leaf(page)) {
		status = REC_STATUS_ORDINARY;
	} else {
		status = REC_STATUS_NODE_PTR;
		if (UNIV_UNLIKELY(!page_has_prev(page))) {
			/* The leftmost node pointer page carries the
			minimum record flag on its first user record. */
			min_mark = REC_INFO_MIN_REC_FLAG;
		}
	}

	n_heap = page_dir_get_n_heap(page);

	/* Traverse the list of stored records in the collation order,
	starting from the first user record. */

	rec = page + PAGE_NEW_INFIMUM;

	i = 0;

	for (;;) {
		ulint	info_bits;
		offs = rec_get_next_offs(rec, TRUE);
		if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
			break;
		}
		rec = page + offs;
		heap_no = rec_get_heap_no_new(rec);
		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
		ut_a(heap_no < n_heap);
		ut_a(offs < srv_page_size - PAGE_DIR);
		ut_a(offs >= PAGE_ZIP_START);
		/* The flag bits must not overlap the offset bits. */
		compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK
				      & (PAGE_ZIP_DIR_SLOT_MASK + 1)));
		compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK
				    >= UNIV_ZIP_SIZE_MAX - 1);

		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
			/* This record owns a sparse directory slot. */
			offs |= PAGE_ZIP_DIR_SLOT_OWNED;
		}

		info_bits = rec_get_info_bits(rec, TRUE);
		if (info_bits & REC_INFO_DELETED_FLAG) {
			info_bits &= ~REC_INFO_DELETED_FLAG;
			offs |= PAGE_ZIP_DIR_SLOT_DEL;
		}
		ut_a(info_bits == min_mark);
		/* Only the smallest user record can have
		REC_INFO_MIN_REC_FLAG set. */
		min_mark = 0;

		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);

		if (UNIV_LIKELY_NULL(recs)) {
			/* Ensure that each heap_no occurs at most once. */
			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
			/* exclude infimum and supremum */
			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
		}

		ut_a(ulint(rec_get_status(rec)) == status);
	}

	offs = page_header_get_field(page, PAGE_FREE);

	/* Traverse the free list (of deleted records). */
	while (offs) {
		ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
		rec = page + offs;

		heap_no = rec_get_heap_no_new(rec);
		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
		ut_a(heap_no < n_heap);

		ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
		ut_a(ulint(rec_get_status(rec)) == status);

		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);

		if (UNIV_LIKELY_NULL(recs)) {
			/* Ensure that each heap_no occurs at most once. */
			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
			/* exclude infimum and supremum */
			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
		}

		offs = rec_get_next_offs(rec, TRUE);
	}

	/* Ensure that each heap no occurs at least once. */
	ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
}
768
769 extern "C" {
770
771 /**********************************************************************//**
772 Allocate memory for zlib. */
773 static
774 void*
page_zip_zalloc(void * opaque,uInt items,uInt size)775 page_zip_zalloc(
776 /*============*/
777 void* opaque, /*!< in/out: memory heap */
778 uInt items, /*!< in: number of items to allocate */
779 uInt size) /*!< in: size of an item in bytes */
780 {
781 return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
782 }
783
784 /**********************************************************************//**
785 Deallocate memory for zlib. */
786 static
787 void
page_zip_free(void * opaque MY_ATTRIBUTE ((unused)),void * address MY_ATTRIBUTE ((unused)))788 page_zip_free(
789 /*==========*/
790 void* opaque MY_ATTRIBUTE((unused)), /*!< in: memory heap */
791 void* address MY_ATTRIBUTE((unused)))/*!< in: object to free */
792 {
793 }
794
795 } /* extern "C" */
796
797 /**********************************************************************//**
798 Configure the zlib allocator to use the given memory heap. */
799 void
page_zip_set_alloc(void * stream,mem_heap_t * heap)800 page_zip_set_alloc(
801 /*===============*/
802 void* stream, /*!< in/out: zlib stream */
803 mem_heap_t* heap) /*!< in: memory heap to use */
804 {
805 z_stream* strm = static_cast<z_stream*>(stream);
806
807 strm->zalloc = page_zip_zalloc;
808 strm->zfree = page_zip_free;
809 strm->opaque = heap;
810 }
811
#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
/** Symbol for enabling compression and decompression diagnostics */
# define PAGE_ZIP_COMPRESS_DBG
#endif

#ifdef PAGE_ZIP_COMPRESS_DBG
/** Set this variable in a debugger to enable
excessive logging in page_zip_compress(). */
static bool	page_zip_compress_dbg;
/** Set this variable in a debugger to enable
binary logging of the data passed to deflate().
When this variable is nonzero, it will act
as a log file name generator. */
static unsigned	page_zip_compress_log;
826
827 /**********************************************************************//**
828 Wrapper for deflate(). Log the operation if page_zip_compress_dbg is set.
829 @return deflate() status: Z_OK, Z_BUF_ERROR, ... */
830 static
831 int
page_zip_compress_deflate(FILE * logfile,z_streamp strm,int flush)832 page_zip_compress_deflate(
833 /*======================*/
834 FILE* logfile,/*!< in: log file, or NULL */
835 z_streamp strm, /*!< in/out: compressed stream for deflate() */
836 int flush) /*!< in: deflate() flushing method */
837 {
838 int status;
839 if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
840 ut_print_buf(stderr, strm->next_in, strm->avail_in);
841 }
842 if (UNIV_LIKELY_NULL(logfile)) {
843 if (fwrite(strm->next_in, 1, strm->avail_in, logfile)
844 != strm->avail_in) {
845 perror("fwrite");
846 }
847 }
848 status = deflate(strm, flush);
849 if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
850 fprintf(stderr, " -> %d\n", status);
851 }
852 return(status);
853 }
854
/* Redefine deflate(). */
# undef deflate
/** Debug wrapper for the zlib compression routine deflate().
Log the operation if page_zip_compress_dbg is set.
@param strm in/out: compressed stream
@param flush in: flushing method
@return deflate() status: Z_OK, Z_BUF_ERROR, ... */
# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
/** Declaration of the logfile parameter */
# define FILE_LOGFILE FILE* logfile,
/** The logfile parameter */
# define LOGFILE logfile,
#else /* PAGE_ZIP_COMPRESS_DBG */
/** Empty declaration of the logfile parameter */
# define FILE_LOGFILE
/** Missing logfile parameter */
# define LOGFILE
#endif /* PAGE_ZIP_COMPRESS_DBG */
873
/**********************************************************************//**
Compress the records of a node pointer page.
The node pointer of each record is kept out of the compressed stream
and stored uncompressed in the page trailer instead.
@return Z_OK, or a zlib error code */
static
int
page_zip_compress_node_ptrs(
/*========================*/
	FILE_LOGFILE
	z_stream*	c_stream,	/*!< in/out: compressed page stream */
	const rec_t**	recs,		/*!< in: dense page directory
					sorted by address */
	ulint		n_dense,	/*!< in: size of recs[] */
	dict_index_t*	index,		/*!< in: the index of the page */
	byte*		storage,	/*!< in: end of dense page directory */
	mem_heap_t*	heap)		/*!< in: temporary memory heap */
{
	int	err	= Z_OK;
	rec_offs* offsets = NULL;

	do {
		const rec_t*	rec = *recs++;

		offsets = rec_get_offsets(rec, index, offsets, 0,
					  ULINT_UNDEFINED, &heap);
		/* Only leaf nodes may contain externally stored columns. */
		ut_ad(!rec_offs_any_extern(offsets));

		MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
		MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
				  rec_offs_extra_size(offsets));

		/* Compress the extra bytes.  next_in still points past
		the previous record, so this spans everything up to the
		start of this record's header. */
		c_stream->avail_in = static_cast<uInt>(
			rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);

		if (c_stream->avail_in) {
			err = deflate(c_stream, Z_NO_FLUSH);
			if (UNIV_UNLIKELY(err != Z_OK)) {
				break;
			}
		}
		ut_ad(!c_stream->avail_in);

		/* Compress the data bytes, except node_ptr. */
		c_stream->next_in = (byte*) rec;
		c_stream->avail_in = static_cast<uInt>(
			rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);

		if (c_stream->avail_in) {
			err = deflate(c_stream, Z_NO_FLUSH);
			if (UNIV_UNLIKELY(err != Z_OK)) {
				break;
			}
		}

		ut_ad(!c_stream->avail_in);

		/* Copy the node pointer, uncompressed, to the trailer
		area before the dense directory, indexed by heap_no. */
		memcpy(storage - REC_NODE_PTR_SIZE
		       * (rec_get_heap_no_new(rec) - 1),
		       c_stream->next_in, REC_NODE_PTR_SIZE);
		c_stream->next_in += REC_NODE_PTR_SIZE;
	} while (--n_dense);

	return(err);
}
939
/**********************************************************************//**
Compress the records of a leaf node of a secondary index.
Each record's extra bytes (REC_N_NEW_EXTRA_BYTES) are skipped;
everything else is fed through the zlib stream.
@return Z_OK, or a zlib error code */
static
int
page_zip_compress_sec(
/*==================*/
	FILE_LOGFILE
	z_stream*	c_stream,	/*!< in/out: compressed page stream */
	const rec_t**	recs,		/*!< in: dense page directory
					sorted by address */
	ulint		n_dense)	/*!< in: size of recs[] */
{
	int		err	= Z_OK;

	ut_ad(n_dense > 0);

	do {
		const rec_t*	rec = *recs++;

		/* Compress everything up to this record, i.e. the data
		of the previous record plus the gap before this one,
		stopping short of this record's header. */
		c_stream->avail_in = static_cast<uInt>(
			rec - REC_N_NEW_EXTRA_BYTES
			- c_stream->next_in);

		if (UNIV_LIKELY(c_stream->avail_in != 0)) {
			MEM_CHECK_DEFINED(c_stream->next_in,
					  c_stream->avail_in);
			err = deflate(c_stream, Z_NO_FLUSH);
			if (UNIV_UNLIKELY(err != Z_OK)) {
				break;
			}
		}

		ut_ad(!c_stream->avail_in);
		ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);

		/* Skip the REC_N_NEW_EXTRA_BYTES. */

		c_stream->next_in = (byte*) rec;
	} while (--n_dense);

	return(err);
}
984
985 /**********************************************************************//**
986 Compress a record of a leaf node of a clustered index that contains
987 externally stored columns.
988 @return Z_OK, or a zlib error code */
static
int
page_zip_compress_clust_ext(
/*========================*/
	FILE_LOGFILE
	z_stream*	c_stream,	/*!< in/out: compressed page stream */
	const rec_t*	rec,		/*!< in: record */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
	byte*		deleted,	/*!< in: dense directory entry pointing
					to the head of the free list */
	byte*		storage,	/*!< in: end of dense page directory */
	byte**		externs,	/*!< in/out: pointer to the next
					available BLOB pointer */
	ulint*		n_blobs)	/*!< in/out: number of
					externally stored columns */
{
	int	err;
	ulint	i;

	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
			  rec_offs_extra_size(offsets));

	/* Walk the fields in order.  DB_TRX_ID + DB_ROLL_PTR and the
	BTR_EXTERN_FIELD_REF part of each externally stored column are
	kept out of the zlib stream and stored in uncompressed form. */
	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		ulint		len;
		const byte*	src;

		if (UNIV_UNLIKELY(i == trx_id_col)) {
			ut_ad(!rec_offs_nth_extern(offsets, i));
			/* Store trx_id and roll_ptr
			in uncompressed form. */
			src = rec_get_nth_field(rec, offsets, i, &len);
			ut_ad(src + DATA_TRX_ID_LEN
			      == rec_get_nth_field(rec, offsets,
						   i + 1, &len));
			ut_ad(len == DATA_ROLL_PTR_LEN);

			/* Compress any preceding bytes. */
			c_stream->avail_in = static_cast<uInt>(
				src - c_stream->next_in);

			if (c_stream->avail_in) {
				err = deflate(c_stream, Z_NO_FLUSH);
				if (UNIV_UNLIKELY(err != Z_OK)) {

					return(err);
				}
			}

			ut_ad(!c_stream->avail_in);
			ut_ad(c_stream->next_in == src);

			/* Copy DB_TRX_ID and DB_ROLL_PTR to the slot
			reserved below the dense directory, indexed by
			(heap_no - 1). */
			memcpy(storage
			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
			       * (rec_get_heap_no_new(rec) - 1),
			       c_stream->next_in,
			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);

			c_stream->next_in
				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;

			/* Skip also roll_ptr */
			i++;
		} else if (rec_offs_nth_extern(offsets, i)) {
			src = rec_get_nth_field(rec, offsets, i, &len);
			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
			/* The BLOB pointer occupies the last
			BTR_EXTERN_FIELD_REF_SIZE bytes of the field. */
			src += len - BTR_EXTERN_FIELD_REF_SIZE;

			c_stream->avail_in = static_cast<uInt>(
				src - c_stream->next_in);
			if (UNIV_LIKELY(c_stream->avail_in != 0)) {
				err = deflate(c_stream, Z_NO_FLUSH);
				if (UNIV_UNLIKELY(err != Z_OK)) {

					return(err);
				}
			}

			ut_ad(!c_stream->avail_in);
			ut_ad(c_stream->next_in == src);

			/* Reserve space for the data at
			the end of the space reserved for
			the compressed data and the page
			modification log. */

			if (UNIV_UNLIKELY
			    (c_stream->avail_out
			     <= BTR_EXTERN_FIELD_REF_SIZE)) {
				/* out of space */
				return(Z_BUF_ERROR);
			}

			ut_ad(*externs == c_stream->next_out
			      + c_stream->avail_out
			      + 1/* end of modif. log */);

			c_stream->next_in
				+= BTR_EXTERN_FIELD_REF_SIZE;

			/* Skip deleted records: their BLOB pointers
			are not copied, and they do not count in
			*n_blobs. */
			if (UNIV_LIKELY_NULL
			    (page_zip_dir_find_low(
				    storage, deleted,
				    page_offset(rec)))) {
				continue;
			}

			(*n_blobs)++;
			/* Shrink the output window and grow the BLOB
			pointer area downwards by one pointer. */
			c_stream->avail_out
				-= BTR_EXTERN_FIELD_REF_SIZE;
			*externs -= BTR_EXTERN_FIELD_REF_SIZE;

			/* Copy the BLOB pointer */
			memcpy(*externs, c_stream->next_in
			       - BTR_EXTERN_FIELD_REF_SIZE,
			       BTR_EXTERN_FIELD_REF_SIZE);
		}
	}

	return(Z_OK);
}
1112
1113 /**********************************************************************//**
1114 Compress the records of a leaf node of a clustered index.
1115 @return Z_OK, or a zlib error code */
1116 static
1117 int
page_zip_compress_clust(FILE_LOGFILE z_stream * c_stream,const rec_t ** recs,ulint n_dense,dict_index_t * index,ulint * n_blobs,ulint trx_id_col,byte * deleted,byte * storage,mem_heap_t * heap)1118 page_zip_compress_clust(
1119 /*====================*/
1120 FILE_LOGFILE
1121 z_stream* c_stream, /*!< in/out: compressed page stream */
1122 const rec_t** recs, /*!< in: dense page directory
1123 sorted by address */
1124 ulint n_dense, /*!< in: size of recs[] */
1125 dict_index_t* index, /*!< in: the index of the page */
1126 ulint* n_blobs, /*!< in: 0; out: number of
1127 externally stored columns */
1128 ulint trx_id_col, /*!< index of the trx_id column */
1129 byte* deleted, /*!< in: dense directory entry pointing
1130 to the head of the free list */
1131 byte* storage, /*!< in: end of dense page directory */
1132 mem_heap_t* heap) /*!< in: temporary memory heap */
1133 {
1134 int err = Z_OK;
1135 rec_offs* offsets = NULL;
1136 /* BTR_EXTERN_FIELD_REF storage */
1137 byte* externs = storage - n_dense
1138 * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
1139
1140 ut_ad(*n_blobs == 0);
1141
1142 do {
1143 const rec_t* rec = *recs++;
1144
1145 offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
1146 ULINT_UNDEFINED, &heap);
1147 ut_ad(rec_offs_n_fields(offsets)
1148 == dict_index_get_n_fields(index));
1149 MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
1150 MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
1151 rec_offs_extra_size(offsets));
1152
1153 /* Compress the extra bytes. */
1154 c_stream->avail_in = static_cast<uInt>(
1155 rec - REC_N_NEW_EXTRA_BYTES
1156 - c_stream->next_in);
1157
1158 if (c_stream->avail_in) {
1159 err = deflate(c_stream, Z_NO_FLUSH);
1160 if (UNIV_UNLIKELY(err != Z_OK)) {
1161
1162 goto func_exit;
1163 }
1164 }
1165 ut_ad(!c_stream->avail_in);
1166 ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
1167
1168 /* Compress the data bytes. */
1169
1170 c_stream->next_in = (byte*) rec;
1171
1172 /* Check if there are any externally stored columns.
1173 For each externally stored column, store the
1174 BTR_EXTERN_FIELD_REF separately. */
1175 if (rec_offs_any_extern(offsets)) {
1176 ut_ad(dict_index_is_clust(index));
1177
1178 err = page_zip_compress_clust_ext(
1179 LOGFILE
1180 c_stream, rec, offsets, trx_id_col,
1181 deleted, storage, &externs, n_blobs);
1182
1183 if (UNIV_UNLIKELY(err != Z_OK)) {
1184
1185 goto func_exit;
1186 }
1187 } else {
1188 ulint len;
1189 const byte* src;
1190
1191 /* Store trx_id and roll_ptr in uncompressed form. */
1192 src = rec_get_nth_field(rec, offsets,
1193 trx_id_col, &len);
1194 ut_ad(src + DATA_TRX_ID_LEN
1195 == rec_get_nth_field(rec, offsets,
1196 trx_id_col + 1, &len));
1197 ut_ad(len == DATA_ROLL_PTR_LEN);
1198 MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
1199 MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
1200 rec_offs_extra_size(offsets));
1201
1202 /* Compress any preceding bytes. */
1203 c_stream->avail_in = static_cast<uInt>(
1204 src - c_stream->next_in);
1205
1206 if (c_stream->avail_in) {
1207 err = deflate(c_stream, Z_NO_FLUSH);
1208 if (UNIV_UNLIKELY(err != Z_OK)) {
1209
1210 return(err);
1211 }
1212 }
1213
1214 ut_ad(!c_stream->avail_in);
1215 ut_ad(c_stream->next_in == src);
1216
1217 memcpy(storage
1218 - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
1219 * (rec_get_heap_no_new(rec) - 1),
1220 c_stream->next_in,
1221 DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
1222
1223 c_stream->next_in
1224 += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
1225
1226 /* Skip also roll_ptr */
1227 ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
1228 }
1229
1230 /* Compress the last bytes of the record. */
1231 c_stream->avail_in = static_cast<uInt>(
1232 rec + rec_offs_data_size(offsets) - c_stream->next_in);
1233
1234 if (c_stream->avail_in) {
1235 err = deflate(c_stream, Z_NO_FLUSH);
1236 if (UNIV_UNLIKELY(err != Z_OK)) {
1237
1238 goto func_exit;
1239 }
1240 }
1241 ut_ad(!c_stream->avail_in);
1242 } while (--n_dense);
1243
1244 func_exit:
1245 return(err);}
1246
1247 /** Attempt to compress a ROW_FORMAT=COMPRESSED page.
1248 @retval true on success
1249 @retval false on failure; block->page.zip will be left intact. */
bool
page_zip_compress(
	buf_block_t*	block,	/*!< in/out: buffer block */
	dict_index_t*	index,	/*!< in: index of the B-tree node */
	ulint		level,	/*!< in: compression level */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	z_stream	c_stream;
	int		err;
	byte*		fields;		/*!< index field information */
	byte*		buf;		/*!< compressed payload of the
					page */
	byte*		buf_end;	/* end of buf */
	ulint		n_dense;
	ulint		slot_size;	/* amount of uncompressed bytes
					per record */
	const rec_t**	recs;		/*!< dense page directory,
					sorted by address */
	mem_heap_t*	heap;
	ulint		trx_id_col = ULINT_UNDEFINED;
	ulint		n_blobs	= 0;
	byte*		storage;	/* storage of uncompressed
					columns */
	const ulonglong	ns = my_interval_timer();
#ifdef PAGE_ZIP_COMPRESS_DBG
	FILE*		logfile	= NULL;
#endif
	/* A local copy of srv_cmp_per_index_enabled to avoid reading that
	variable multiple times in this function since it can be changed at
	anytime. */
	my_bool		cmp_per_index_enabled;
	cmp_per_index_enabled	= srv_cmp_per_index_enabled;

	page_t* page = block->frame;
	page_zip_des_t* page_zip = &block->page.zip;

	ut_a(page_is_comp(page));
	ut_a(fil_page_index_page_check(page));
	ut_ad(page_simple_validate_new((page_t*) page));
	ut_ad(page_zip_simple_validate(page_zip));
	ut_ad(dict_table_is_comp(index->table));
	ut_ad(!dict_index_is_ibuf(index));

	MEM_CHECK_DEFINED(page, srv_page_size);

	/* Check the data that will be omitted.  The infimum and supremum
	records are never stored in the compressed stream; they must have
	their canonical contents. */
	ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
		     infimum_extra, sizeof infimum_extra));
	ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
		     infimum_data, sizeof infimum_data));
	ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
	     /* info_bits == 0, n_owned <= max */
	     <= PAGE_DIR_SLOT_MAX_N_OWNED);
	ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
		     supremum_extra_data, sizeof supremum_extra_data));

	if (page_is_empty(page)) {
		ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
		     == PAGE_NEW_SUPREMUM);
	}

	/* Non-leaf pages are compressed with the unique-in-tree prefix
	of the fields only. */
	const ulint n_fields = page_is_leaf(page)
		? dict_index_get_n_fields(index)
		: dict_index_get_n_unique_in_tree_nonleaf(index);
	index_id_t ind_id = index->id;

	/* The dense directory excludes the infimum and supremum records. */
	n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
#ifdef PAGE_ZIP_COMPRESS_DBG
	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
		ib::info() << "compress "
			<< static_cast<void*>(page_zip) << " "
			<< static_cast<const void*>(page) << " "
			<< page_is_leaf(page) << " "
			<< n_fields << " " << n_dense;
	}

	if (UNIV_UNLIKELY(page_zip_compress_log)) {
		/* Create a log file for every compression attempt. */
		char	logfilename[9];
		snprintf(logfilename, sizeof logfilename,
			 "%08x", page_zip_compress_log++);
		logfile = fopen(logfilename, "wb");

		if (logfile) {
			/* Write the uncompressed page to the log. */
			if (fwrite(page, 1, srv_page_size, logfile)
			    != srv_page_size) {
				perror("fwrite");
			}
			/* Record the compressed size as zero.
			This will be overwritten at successful exit. */
			putc(0, logfile);
			putc(0, logfile);
			putc(0, logfile);
			putc(0, logfile);
		}
	}
#endif /* PAGE_ZIP_COMPRESS_DBG */
	page_zip_stat[page_zip->ssize - 1].compressed++;
	if (cmp_per_index_enabled) {
		mutex_enter(&page_zip_stat_per_index_mutex);
		page_zip_stat_per_index[ind_id].compressed++;
		mutex_exit(&page_zip_stat_per_index_mutex);
	}

	/* If the dense directory alone cannot fit, give up early. */
	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
			  >= page_zip_get_size(page_zip))) {

		goto err_exit;
	}

	MONITOR_INC(MONITOR_PAGE_COMPRESS);

	/* One heap allocation covers recs[], fields, buf and the
	memory zlib will request via page_zip_set_alloc(). */
	heap = mem_heap_create(page_zip_get_size(page_zip)
			       + n_fields * (2 + sizeof(ulint))
			       + REC_OFFS_HEADER_SIZE
			       + n_dense * ((sizeof *recs)
					    - PAGE_ZIP_DIR_SLOT_SIZE)
			       + srv_page_size * 4
			       + (512 << MAX_MEM_LEVEL));

	recs = static_cast<const rec_t**>(
		mem_heap_zalloc(heap, n_dense * sizeof *recs));

	fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));

	buf = static_cast<byte*>(
		mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));

	buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;

	/* Compress the data payload. */
	page_zip_set_alloc(&c_stream, heap);

	/* windowBits is set to the page-size shift. */
	err = deflateInit2(&c_stream, static_cast<int>(level),
			   Z_DEFLATED, static_cast<int>(srv_page_size_shift),
			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
	ut_a(err == Z_OK);

	c_stream.next_out = buf;

	/* Subtract the space reserved for uncompressed data. */
	/* Page header and the end marker of the modification log */
	c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);

	/* Dense page directory and uncompressed columns, if any */
	if (page_is_leaf(page)) {
		if (dict_index_is_clust(index)) {
			trx_id_col = index->db_trx_id();

			slot_size = PAGE_ZIP_DIR_SLOT_SIZE
				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;

		} else {
			/* Signal the absence of trx_id
			in page_zip_fields_encode() */
			trx_id_col = 0;
			slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
		}
	} else {
		slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
		trx_id_col = ULINT_UNDEFINED;
	}

	if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
			  + 6/* sizeof(zlib header and footer) */)) {
		goto zlib_error;
	}

	c_stream.avail_out -= uInt(n_dense * slot_size);
	/* The field-type information is compressed first, flushed with
	Z_FULL_FLUSH so it can be decoded independently. */
	c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index,
							trx_id_col, fields));
	c_stream.next_in = fields;

	/* Restore the sentinel: 0 was only a signal to
	page_zip_fields_encode(). */
	if (UNIV_LIKELY(!trx_id_col)) {
		trx_id_col = ULINT_UNDEFINED;
	}

	MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
	err = deflate(&c_stream, Z_FULL_FLUSH);
	if (err != Z_OK) {
		goto zlib_error;
	}

	ut_ad(!c_stream.avail_in);

	page_zip_dir_encode(page, buf_end, recs);

	c_stream.next_in = (byte*) page + PAGE_ZIP_START;

	storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;

	/* Compress the records in heap_no order. */
	if (UNIV_UNLIKELY(!n_dense)) {
	} else if (!page_is_leaf(page)) {
		/* This is a node pointer page. */
		err = page_zip_compress_node_ptrs(LOGFILE
						  &c_stream, recs, n_dense,
						  index, storage, heap);
		if (UNIV_UNLIKELY(err != Z_OK)) {
			goto zlib_error;
		}
	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
		/* This is a leaf page in a secondary index. */
		err = page_zip_compress_sec(LOGFILE
					    &c_stream, recs, n_dense);
		if (UNIV_UNLIKELY(err != Z_OK)) {
			goto zlib_error;
		}
	} else {
		/* This is a leaf page in a clustered index. */
		err = page_zip_compress_clust(LOGFILE
					      &c_stream, recs, n_dense,
					      index, &n_blobs, trx_id_col,
					      buf_end - PAGE_ZIP_DIR_SLOT_SIZE
					      * page_get_n_recs(page),
					      storage, heap);
		if (UNIV_UNLIKELY(err != Z_OK)) {
			goto zlib_error;
		}
	}

	/* Finish the compression. */
	ut_ad(!c_stream.avail_in);
	/* Compress any trailing garbage, in case the last record was
	allocated from an originally longer space on the free list,
	or the data of the last record from page_zip_compress_sec(). */
	c_stream.avail_in = static_cast<uInt>(
		page_header_get_field(page, PAGE_HEAP_TOP)
		- (c_stream.next_in - page));
	ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR);

	MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
	err = deflate(&c_stream, Z_FINISH);

	if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
		/* The page did not fit in the compressed frame
		(or zlib failed); record the failure and bail out. */
zlib_error:
		deflateEnd(&c_stream);
		mem_heap_free(heap);
err_exit:
#ifdef PAGE_ZIP_COMPRESS_DBG
		if (logfile) {
			fclose(logfile);
		}
#endif /* PAGE_ZIP_COMPRESS_DBG */
		if (page_is_leaf(page)) {
			dict_index_zip_failure(index);
		}

		const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
		page_zip_stat[page_zip->ssize - 1].compressed_usec
			+= time_diff;
		if (cmp_per_index_enabled) {
			mutex_enter(&page_zip_stat_per_index_mutex);
			page_zip_stat_per_index[ind_id].compressed_usec
				+= time_diff;
			mutex_exit(&page_zip_stat_per_index_mutex);
		}
		return false;
	}

	err = deflateEnd(&c_stream);
	ut_a(err == Z_OK);

	ut_ad(buf + c_stream.total_out == c_stream.next_out);
	ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);

#if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
	/* Valgrind believes that zlib does not initialize some bits
	in the last 7 or 8 bytes of the stream. Make Valgrind happy. */
	MEM_MAKE_DEFINED(buf, c_stream.total_out);
#endif /* HAVE_valgrind && !memory_sanitizer */

	/* Zero out the area reserved for the modification log.
	Space for the end marker of the modification log is not
	included in avail_out. */
	memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);

#ifdef UNIV_DEBUG
	page_zip->m_start =
#endif /* UNIV_DEBUG */
		page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
	page_zip->m_nonempty = FALSE;
	page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
	/* Copy those header fields that will not be written
	in buf_flush_init_for_writing() */
	memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
			  FIL_PAGE_LSN - FIL_PAGE_PREV);
	memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
			  2);
	memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
			  PAGE_DATA - FIL_PAGE_DATA);
	/* Copy the rest of the compressed page */
	memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
			  page_zip_get_size(page_zip) - PAGE_DATA);
	mem_heap_free(heap);
#ifdef UNIV_ZIP_DEBUG
	ut_a(page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	page_zip_compress_write_log(block, index, mtr);

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));

#ifdef PAGE_ZIP_COMPRESS_DBG
	if (logfile) {
		/* Record the compressed size of the block. */
		byte sz[4];
		mach_write_to_4(sz, c_stream.total_out);
		fseek(logfile, srv_page_size, SEEK_SET);
		if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) {
			perror("fwrite");
		}
		fclose(logfile);
	}
#endif /* PAGE_ZIP_COMPRESS_DBG */
	const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
	page_zip_stat[page_zip->ssize - 1].compressed_ok++;
	page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
	if (cmp_per_index_enabled) {
		mutex_enter(&page_zip_stat_per_index_mutex);
		page_zip_stat_per_index[ind_id].compressed_ok++;
		page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
		mutex_exit(&page_zip_stat_per_index_mutex);
	}

	if (page_is_leaf(page)) {
		dict_index_zip_success(index);
	}

	return true;
}
1583
1584 /**********************************************************************//**
1585 Deallocate the index information initialized by page_zip_fields_decode(). */
1586 static
1587 void
page_zip_fields_free(dict_index_t * index)1588 page_zip_fields_free(
1589 /*=================*/
1590 dict_index_t* index) /*!< in: dummy index to be freed */
1591 {
1592 if (index) {
1593 dict_table_t* table = index->table;
1594 index->zip_pad.mutex.~mutex();
1595 mem_heap_free(index->heap);
1596
1597 dict_mem_table_free(table);
1598 }
1599 }
1600
1601 /**********************************************************************//**
1602 Read the index information for the compressed page.
1603 @return own: dummy index describing the page, or NULL on error */
static
dict_index_t*
page_zip_fields_decode(
/*===================*/
	const byte*	buf,	/*!< in: index information */
	const byte*	end,	/*!< in: end of buf */
	ulint*		trx_id_col,/*!< in: NULL for non-leaf pages;
				for leaf pages, pointer to where to store
				the position of the trx_id column */
	bool		is_spatial)/*!< in: is spatial index or not */
{
	const byte*	b;
	ulint		n;
	ulint		i;
	ulint		val;
	dict_table_t*	table;
	dict_index_t*	index;

	/* Determine the number of fields.  Each field is encoded in
	one byte, or two bytes when the first byte has the 0x80 bit set. */
	for (b = buf, n = 0; b < end; n++) {
		if (*b++ & 0x80) {
			b++; /* skip the second byte */
		}
	}

	n--; /* n_nullable or trx_id */

	if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {

		page_zip_fail(("page_zip_fields_decode: n = %lu\n",
			       (ulong) n));
		return(NULL);
	}

	/* The two-byte encoding may have made b overshoot end. */
	if (UNIV_UNLIKELY(b > end)) {

		page_zip_fail(("page_zip_fields_decode: %p > %p\n",
			       (const void*) b, (const void*) end));
		return(NULL);
	}

	/* Build a dummy table and index describing the encoded fields. */
	table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
				      DICT_TF_COMPACT, 0);
	index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
	index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
	index->cached = TRUE;

	/* Initialize the fields.  The per-field encoding:
	0x80 bit set: fixed length > 62 bytes (15-bit value);
	val >= 126: variable length with max > 255 bytes;
	val <= 1: variable length with max <= 255 bytes;
	otherwise: fixed length < 62 bytes (val >> 1).
	The low bit of val is the NOT NULL flag. */
	for (b = buf, i = 0; i < n; i++) {
		ulint	mtype;
		ulint	len;

		val = *b++;

		if (UNIV_UNLIKELY(val & 0x80)) {
			/* fixed length > 62 bytes */
			val = (val & 0x7f) << 8 | *b++;
			len = val >> 1;
			mtype = DATA_FIXBINARY;
		} else if (UNIV_UNLIKELY(val >= 126)) {
			/* variable length with max > 255 bytes */
			len = 0x7fff;
			mtype = DATA_BINARY;
		} else if (val <= 1) {
			/* variable length with max <= 255 bytes */
			len = 0;
			mtype = DATA_BINARY;
		} else {
			/* fixed length < 62 bytes */
			len = val >> 1;
			mtype = DATA_FIXBINARY;
		}

		dict_mem_table_add_col(table, NULL, NULL, mtype,
				       val & 1 ? DATA_NOT_NULL : 0, len);
		dict_index_add_col(index, table,
				   dict_table_get_nth_col(table, i), 0);
	}

	/* The trailing value is the trx_id column position (leaf pages)
	or the number of nullable fields (non-leaf pages). */
	val = *b++;
	if (UNIV_UNLIKELY(val & 0x80)) {
		val = (val & 0x7f) << 8 | *b++;
	}

	/* Decode the position of the trx_id column. */
	if (trx_id_col) {
		if (!val) {
			/* 0 encodes "no trx_id column" (secondary index). */
			val = ULINT_UNDEFINED;
		} else if (UNIV_UNLIKELY(val >= n)) {
fail:
			page_zip_fields_free(index);
			return NULL;
		} else {
			/* A valid trx_id position implies a clustered
			index. */
			index->type = DICT_CLUSTERED;
		}

		*trx_id_col = val;
	} else {
		/* Decode the number of nullable fields. */
		if (UNIV_UNLIKELY(index->n_nullable > val)) {
			goto fail;
		} else {
			index->n_nullable = static_cast<unsigned>(val)
				& dict_index_t::MAX_N_FIELDS;
		}
	}

	/* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
	index->n_core_fields = index->n_fields;
	index->n_core_null_bytes = static_cast<uint8_t>(
		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));

	ut_ad(b == end);

	if (is_spatial) {
		index->type |= DICT_SPATIAL;
	}

	return(index);
}
1725
1726 /**********************************************************************//**
1727 Populate the sparse page directory from the dense directory.
1728 @return TRUE on success, FALSE on failure */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
ibool
page_zip_dir_decode(
/*================*/
	const page_zip_des_t*	page_zip,/*!< in: dense page directory on
					compressed page */
	page_t*			page,	/*!< in: compact page with valid header;
					out: trailer and sparse page directory
					filled in */
	rec_t**			recs,	/*!< out: dense page directory sorted by
					ascending address (and heap_no) */
	ulint			n_dense)/*!< in: number of user records, and
					size of recs[] */
{
	ulint	i;
	ulint	n_recs;
	byte*	slot;

	n_recs = page_get_n_recs(page);

	/* Every record in the sparse directory must also exist in the
	dense directory. */
	if (UNIV_UNLIKELY(n_recs > n_dense)) {
		page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
			       (ulong) n_recs, (ulong) n_dense));
		return(FALSE);
	}

	/* Traverse the list of stored records in the sorting order,
	starting from the first user record. */

	slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
	UNIV_PREFETCH_RW(slot);

	/* Zero out the page trailer. */
	memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);

	/* The first sparse slot always points to the infimum. */
	mach_write_to_2(slot, PAGE_NEW_INFIMUM);
	slot -= PAGE_DIR_SLOT_SIZE;
	UNIV_PREFETCH_RW(slot);

	/* Initialize the sparse directory and copy the dense directory.
	A sparse slot is created for every dense entry that carries the
	"owned" flag. */
	for (i = 0; i < n_recs; i++) {
		ulint	offs = page_zip_dir_get(page_zip, i);

		if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
			mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
			slot -= PAGE_DIR_SLOT_SIZE;
			UNIV_PREFETCH_RW(slot);
		}

		/* Record offsets must point past the page header area. */
		if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
				  < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
			page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
				       (unsigned) i, (unsigned) n_recs,
				       (ulong) offs));
			return(FALSE);
		}

		recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
	}

	/* The last sparse slot points to the supremum, and must coincide
	with the slot count stored in the page header. */
	mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
	{
		const page_dir_slot_t*	last_slot = page_dir_get_nth_slot(
			page, page_dir_get_n_slots(page) - 1U);

		if (UNIV_UNLIKELY(slot != last_slot)) {
			page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
				       (const void*) slot,
				       (const void*) last_slot));
			return(FALSE);
		}
	}

	/* Copy the rest of the dense directory (the free list records,
	which have no flag bits set). */
	for (; i < n_dense; i++) {
		ulint	offs = page_zip_dir_get(page_zip, i);

		if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
			page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
				       (unsigned) i, (unsigned) n_dense,
				       (ulong) offs));
			return(FALSE);
		}

		recs[i] = page + offs;
	}

	/* Sort recs[] into ascending address order, which on a compact
	page is also heap_no order. */
	std::sort(recs, recs + n_dense);
	return(TRUE);
}
1819
1820 /**********************************************************************//**
1821 Initialize the REC_N_NEW_EXTRA_BYTES of each record.
1822 @return TRUE on success, FALSE on failure */
static
ibool
page_zip_set_extra_bytes(
/*=====================*/
	const page_zip_des_t*	page_zip,/*!< in: compressed page */
	page_t*			page,	/*!< in/out: uncompressed page */
	ulint			info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
{
	ulint	n;
	ulint	i;
	ulint	n_owned = 1;
	ulint	offs;
	rec_t*	rec;

	n = page_get_n_recs(page);
	rec = page + PAGE_NEW_INFIMUM;

	/* First pass: the records on the record list, in dense
	directory order.  Reconstruct each record's extra-bytes header
	(info bits, n_owned) and the next-record pointers from the
	flag bits in the dense directory. */
	for (i = 0; i < n; i++) {
		offs = page_zip_dir_get(page_zip, i);

		if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
			info_bits |= REC_INFO_DELETED_FLAG;
		}
		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
			info_bits |= n_owned;
			n_owned = 1;
		} else {
			n_owned++;
		}
		offs &= PAGE_ZIP_DIR_SLOT_MASK;
		if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
				  + REC_N_NEW_EXTRA_BYTES)) {
			page_zip_fail(("page_zip_set_extra_bytes 1:"
				       " %u %u %lx\n",
				       (unsigned) i, (unsigned) n,
				       (ulong) offs));
			return(FALSE);
		}

		rec_set_next_offs_new(rec, offs);
		rec = page + offs;
		rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
		info_bits = 0;
	}

	/* Set the next pointer of the last user record. */
	rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);

	/* Set n_owned of the supremum record. */
	page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;

	/* The dense directory excludes the infimum and supremum records. */
	n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;

	if (i >= n) {
		if (UNIV_LIKELY(i == n)) {
			/* No deleted records: done. */
			return(TRUE);
		}

		page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
			       (unsigned) i, (unsigned) n));
		return(FALSE);
	}

	offs = page_zip_dir_get(page_zip, i);

	/* Second pass: set the extra bytes of deleted records on the
	free list and chain them together with next-record pointers. */
	for (;;) {
		if (UNIV_UNLIKELY(!offs)
		    || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {

			page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
				       (ulong) offs));
			return(FALSE);
		}

		rec = page + offs;
		rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */

		if (++i == n) {
			break;
		}

		offs = page_zip_dir_get(page_zip, i);
		rec_set_next_offs_new(rec, offs);
	}

	/* Terminate the free list. */
	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
	rec_set_next_offs_new(rec, 0);

	return(TRUE);
}
1916
1917 /**********************************************************************//**
1918 Apply the modification log to a record containing externally stored
1919 columns. Do not copy the fields that are stored separately.
1920 @return pointer to modification log, or NULL on failure */
static
const byte*
page_zip_apply_log_ext(
/*===================*/
	rec_t*		rec,		/*!< in/out: record */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
	const byte*	data,		/*!< in: modification log */
	const byte*	end)		/*!< in: end of modification log */
{
	ulint	i;
	ulint	len;
	byte*	next_out = rec;

	/* Check if there are any externally stored columns.
	For each externally stored column, skip the
	BTR_EXTERN_FIELD_REF. */

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		byte*	dst;

		if (UNIV_UNLIKELY(i == trx_id_col)) {
			/* Skip trx_id and roll_ptr: they are not part of
			the modification log and remain in place in rec. */
			dst = rec_get_nth_field(rec, offsets,
						i, &len);
			/* Validate before copying: the log must hold
			enough bytes, the field must be wide enough for
			both system columns, and it must not be external. */
			if (UNIV_UNLIKELY(dst - next_out >= end - data)
			    || UNIV_UNLIKELY
			    (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
			    || rec_offs_nth_extern(offsets, i)) {
				page_zip_fail(("page_zip_apply_log_ext:"
					       " trx_id len %lu,"
					       " %p - %p >= %p - %p\n",
					       (ulong) len,
					       (const void*) dst,
					       (const void*) next_out,
					       (const void*) end,
					       (const void*) data));
				return(NULL);
			}

			/* Copy the bytes preceding DB_TRX_ID, then resume
			after DB_ROLL_PTR. */
			memcpy(next_out, data, ulint(dst - next_out));
			data += ulint(dst - next_out);
			next_out = dst + (DATA_TRX_ID_LEN
					  + DATA_ROLL_PTR_LEN);
		} else if (rec_offs_nth_extern(offsets, i)) {
			dst = rec_get_nth_field(rec, offsets,
						i, &len);
			ut_ad(len
			      >= BTR_EXTERN_FIELD_REF_SIZE);

			/* Copy up to, but excluding, the externally
			stored part (the trailing field reference). */
			len += ulint(dst - next_out)
				- BTR_EXTERN_FIELD_REF_SIZE;

			if (UNIV_UNLIKELY(data + len >= end)) {
				page_zip_fail(("page_zip_apply_log_ext:"
					       " ext %p+%lu >= %p\n",
					       (const void*) data,
					       (ulong) len,
					       (const void*) end));
				return(NULL);
			}

			memcpy(next_out, data, len);
			data += len;
			/* Leave the BLOB pointer untouched in rec. */
			next_out += len
				+ BTR_EXTERN_FIELD_REF_SIZE;
		}
	}

	/* Copy the last bytes of the record. */
	len = ulint(rec_get_end(rec, offsets) - next_out);
	if (UNIV_UNLIKELY(data + len >= end)) {
		page_zip_fail(("page_zip_apply_log_ext:"
			       " last %p+%lu >= %p\n",
			       (const void*) data,
			       (ulong) len,
			       (const void*) end));
		return(NULL);
	}
	memcpy(next_out, data, len);
	data += len;

	return(data);
}
2005
2006 /**********************************************************************//**
2007 Apply the modification log to an uncompressed page.
2008 Do not copy the fields that are stored separately.
2009 @return pointer to end of modification log, or NULL on failure */
2010 static
2011 const byte*
page_zip_apply_log(const byte * data,ulint size,rec_t ** recs,ulint n_dense,ulint n_core,ulint trx_id_col,ulint heap_status,dict_index_t * index,rec_offs * offsets)2012 page_zip_apply_log(
2013 /*===============*/
2014 const byte* data, /*!< in: modification log */
2015 ulint size, /*!< in: maximum length of the log, in bytes */
2016 rec_t** recs, /*!< in: dense page directory,
2017 sorted by address (indexed by
2018 heap_no - PAGE_HEAP_NO_USER_LOW) */
2019 ulint n_dense,/*!< in: size of recs[] */
2020 ulint n_core, /*!< in: index->n_fields, or 0 for non-leaf */
2021 ulint trx_id_col,/*!< in: column number of trx_id in the index,
2022 or ULINT_UNDEFINED if none */
2023 ulint heap_status,
2024 /*!< in: heap_no and status bits for
2025 the next record to uncompress */
2026 dict_index_t* index, /*!< in: index of the page */
2027 rec_offs* offsets)/*!< in/out: work area for
2028 rec_get_offsets_reverse() */
2029 {
2030 const byte* const end = data + size;
2031
2032 for (;;) {
2033 ulint val;
2034 rec_t* rec;
2035 ulint len;
2036 ulint hs;
2037
2038 val = *data++;
2039 if (UNIV_UNLIKELY(!val)) {
2040 return(data - 1);
2041 }
2042 if (val & 0x80) {
2043 val = (val & 0x7f) << 8 | *data++;
2044 if (UNIV_UNLIKELY(!val)) {
2045 page_zip_fail(("page_zip_apply_log:"
2046 " invalid val %x%x\n",
2047 data[-2], data[-1]));
2048 return(NULL);
2049 }
2050 }
2051 if (UNIV_UNLIKELY(data >= end)) {
2052 page_zip_fail(("page_zip_apply_log: %p >= %p\n",
2053 (const void*) data,
2054 (const void*) end));
2055 return(NULL);
2056 }
2057 if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
2058 page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
2059 (ulong) val, (ulong) n_dense));
2060 return(NULL);
2061 }
2062
2063 /* Determine the heap number and status bits of the record. */
2064 rec = recs[(val >> 1) - 1];
2065
2066 hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
2067 hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
2068
2069 /* This may either be an old record that is being
2070 overwritten (updated in place, or allocated from
2071 the free list), or a new record, with the next
2072 available_heap_no. */
2073 if (UNIV_UNLIKELY(hs > heap_status)) {
2074 page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
2075 (ulong) hs, (ulong) heap_status));
2076 return(NULL);
2077 } else if (hs == heap_status) {
2078 /* A new record was allocated from the heap. */
2079 if (UNIV_UNLIKELY(val & 1)) {
2080 /* Only existing records may be cleared. */
2081 page_zip_fail(("page_zip_apply_log:"
2082 " attempting to create"
2083 " deleted rec %lu\n",
2084 (ulong) hs));
2085 return(NULL);
2086 }
2087 heap_status += 1 << REC_HEAP_NO_SHIFT;
2088 }
2089
2090 mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
2091
2092 if (val & 1) {
2093 /* Clear the data bytes of the record. */
2094 mem_heap_t* heap = NULL;
2095 rec_offs* offs;
2096 offs = rec_get_offsets(rec, index, offsets, n_core,
2097 ULINT_UNDEFINED, &heap);
2098 memset(rec, 0, rec_offs_data_size(offs));
2099
2100 if (UNIV_LIKELY_NULL(heap)) {
2101 mem_heap_free(heap);
2102 }
2103 continue;
2104 }
2105
2106 compile_time_assert(REC_STATUS_NODE_PTR == TRUE);
2107 rec_get_offsets_reverse(data, index,
2108 hs & REC_STATUS_NODE_PTR,
2109 offsets);
2110 /* Silence a debug assertion in rec_offs_make_valid().
2111 This will be overwritten in page_zip_set_extra_bytes(),
2112 called by page_zip_decompress_low(). */
2113 ut_d(rec[-REC_NEW_INFO_BITS] = 0);
2114 rec_offs_make_valid(rec, index, n_core != 0, offsets);
2115
2116 /* Copy the extra bytes (backwards). */
2117 {
2118 byte* start = rec_get_start(rec, offsets);
2119 byte* b = rec - REC_N_NEW_EXTRA_BYTES;
2120 while (b != start) {
2121 *--b = *data++;
2122 }
2123 }
2124
2125 /* Copy the data bytes. */
2126 if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
2127 /* Non-leaf nodes should not contain any
2128 externally stored columns. */
2129 if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
2130 page_zip_fail(("page_zip_apply_log:"
2131 " %lu&REC_STATUS_NODE_PTR\n",
2132 (ulong) hs));
2133 return(NULL);
2134 }
2135
2136 data = page_zip_apply_log_ext(
2137 rec, offsets, trx_id_col, data, end);
2138
2139 if (UNIV_UNLIKELY(!data)) {
2140 return(NULL);
2141 }
2142 } else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
2143 len = rec_offs_data_size(offsets)
2144 - REC_NODE_PTR_SIZE;
2145 /* Copy the data bytes, except node_ptr. */
2146 if (UNIV_UNLIKELY(data + len >= end)) {
2147 page_zip_fail(("page_zip_apply_log:"
2148 " node_ptr %p+%lu >= %p\n",
2149 (const void*) data,
2150 (ulong) len,
2151 (const void*) end));
2152 return(NULL);
2153 }
2154 memcpy(rec, data, len);
2155 data += len;
2156 } else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
2157 len = rec_offs_data_size(offsets);
2158
2159 /* Copy all data bytes of
2160 a record in a secondary index. */
2161 if (UNIV_UNLIKELY(data + len >= end)) {
2162 page_zip_fail(("page_zip_apply_log:"
2163 " sec %p+%lu >= %p\n",
2164 (const void*) data,
2165 (ulong) len,
2166 (const void*) end));
2167 return(NULL);
2168 }
2169
2170 memcpy(rec, data, len);
2171 data += len;
2172 } else {
2173 /* Skip DB_TRX_ID and DB_ROLL_PTR. */
2174 ulint l = rec_get_nth_field_offs(offsets,
2175 trx_id_col, &len);
2176 byte* b;
2177
2178 if (UNIV_UNLIKELY(data + l >= end)
2179 || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
2180 + DATA_ROLL_PTR_LEN))) {
2181 page_zip_fail(("page_zip_apply_log:"
2182 " trx_id %p+%lu >= %p\n",
2183 (const void*) data,
2184 (ulong) l,
2185 (const void*) end));
2186 return(NULL);
2187 }
2188
2189 /* Copy any preceding data bytes. */
2190 memcpy(rec, data, l);
2191 data += l;
2192
2193 /* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
2194 b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2195 len = ulint(rec_get_end(rec, offsets) - b);
2196 if (UNIV_UNLIKELY(data + len >= end)) {
2197 page_zip_fail(("page_zip_apply_log:"
2198 " clust %p+%lu >= %p\n",
2199 (const void*) data,
2200 (ulong) len,
2201 (const void*) end));
2202 return(NULL);
2203 }
2204 memcpy(b, data, len);
2205 data += len;
2206 }
2207 }
2208 }
2209
2210 /**********************************************************************//**
2211 Set the heap_no in a record, and skip the fixed-size record header
2212 that is not included in the d_stream.
2213 @return TRUE on success, FALSE if d_stream does not end at rec */
2214 static
2215 ibool
page_zip_decompress_heap_no(z_stream * d_stream,rec_t * rec,ulint & heap_status)2216 page_zip_decompress_heap_no(
2217 /*========================*/
2218 z_stream* d_stream, /*!< in/out: compressed page stream */
2219 rec_t* rec, /*!< in/out: record */
2220 ulint& heap_status) /*!< in/out: heap_no and status bits */
2221 {
2222 if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
2223 /* n_dense has grown since the page was last compressed. */
2224 return(FALSE);
2225 }
2226
2227 /* Skip the REC_N_NEW_EXTRA_BYTES. */
2228 d_stream->next_out = rec;
2229
2230 /* Set heap_no and the status bits. */
2231 mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
2232 heap_status += 1 << REC_HEAP_NO_SHIFT;
2233 return(TRUE);
2234 }
2235
2236 /**********************************************************************//**
2237 Decompress the records of a node pointer page.
2238 @return TRUE on success, FALSE on failure */
2239 static
2240 ibool
page_zip_decompress_node_ptrs(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,rec_offs * offsets,mem_heap_t * heap)2241 page_zip_decompress_node_ptrs(
2242 /*==========================*/
2243 page_zip_des_t* page_zip, /*!< in/out: compressed page */
2244 z_stream* d_stream, /*!< in/out: compressed page stream */
2245 rec_t** recs, /*!< in: dense page directory
2246 sorted by address */
2247 ulint n_dense, /*!< in: size of recs[] */
2248 dict_index_t* index, /*!< in: the index of the page */
2249 rec_offs* offsets, /*!< in/out: temporary offsets */
2250 mem_heap_t* heap) /*!< in: temporary memory heap */
2251 {
2252 ulint heap_status = REC_STATUS_NODE_PTR
2253 | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2254 ulint slot;
2255 const byte* storage;
2256
2257 /* Subtract the space reserved for uncompressed data. */
2258 d_stream->avail_in -= static_cast<uInt>(
2259 n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
2260
2261 /* Decompress the records in heap_no order. */
2262 for (slot = 0; slot < n_dense; slot++) {
2263 rec_t* rec = recs[slot];
2264
2265 d_stream->avail_out = static_cast<uInt>(
2266 rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2267
2268 ut_ad(d_stream->avail_out < srv_page_size
2269 - PAGE_ZIP_START - PAGE_DIR);
2270 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2271 case Z_STREAM_END:
2272 page_zip_decompress_heap_no(
2273 d_stream, rec, heap_status);
2274 goto zlib_done;
2275 case Z_OK:
2276 case Z_BUF_ERROR:
2277 if (!d_stream->avail_out) {
2278 break;
2279 }
2280 /* fall through */
2281 default:
2282 page_zip_fail(("page_zip_decompress_node_ptrs:"
2283 " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2284 d_stream->msg));
2285 goto zlib_error;
2286 }
2287
2288 if (!page_zip_decompress_heap_no(
2289 d_stream, rec, heap_status)) {
2290 ut_ad(0);
2291 }
2292
2293 /* Read the offsets. The status bits are needed here. */
2294 offsets = rec_get_offsets(rec, index, offsets, 0,
2295 ULINT_UNDEFINED, &heap);
2296
2297 /* Non-leaf nodes should not have any externally
2298 stored columns. */
2299 ut_ad(!rec_offs_any_extern(offsets));
2300
2301 /* Decompress the data bytes, except node_ptr. */
2302 d_stream->avail_out =static_cast<uInt>(
2303 rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
2304
2305 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2306 case Z_STREAM_END:
2307 goto zlib_done;
2308 case Z_OK:
2309 case Z_BUF_ERROR:
2310 if (!d_stream->avail_out) {
2311 break;
2312 }
2313 /* fall through */
2314 default:
2315 page_zip_fail(("page_zip_decompress_node_ptrs:"
2316 " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2317 d_stream->msg));
2318 goto zlib_error;
2319 }
2320
2321 /* Clear the node pointer in case the record
2322 will be deleted and the space will be reallocated
2323 to a smaller record. */
2324 memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
2325 d_stream->next_out += REC_NODE_PTR_SIZE;
2326
2327 ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
2328 }
2329
2330 /* Decompress any trailing garbage, in case the last record was
2331 allocated from an originally longer space on the free list. */
2332 d_stream->avail_out = static_cast<uInt>(
2333 page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2334 - page_offset(d_stream->next_out));
2335 if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2336 - PAGE_ZIP_START - PAGE_DIR)) {
2337
2338 page_zip_fail(("page_zip_decompress_node_ptrs:"
2339 " avail_out = %u\n",
2340 d_stream->avail_out));
2341 goto zlib_error;
2342 }
2343
2344 if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2345 page_zip_fail(("page_zip_decompress_node_ptrs:"
2346 " inflate(Z_FINISH)=%s\n",
2347 d_stream->msg));
2348 zlib_error:
2349 inflateEnd(d_stream);
2350 return(FALSE);
2351 }
2352
2353 /* Note that d_stream->avail_out > 0 may hold here
2354 if the modification log is nonempty. */
2355
2356 zlib_done:
2357 if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2358 ut_error;
2359 }
2360
2361 {
2362 page_t* page = page_align(d_stream->next_out);
2363
2364 /* Clear the unused heap space on the uncompressed page. */
2365 memset(d_stream->next_out, 0,
2366 ulint(page_dir_get_nth_slot(page,
2367 page_dir_get_n_slots(page)
2368 - 1U)
2369 - d_stream->next_out));
2370 }
2371
2372 #ifdef UNIV_DEBUG
2373 page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in);
2374 #endif /* UNIV_DEBUG */
2375
2376 /* Apply the modification log. */
2377 {
2378 const byte* mod_log_ptr;
2379 mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2380 d_stream->avail_in + 1,
2381 recs, n_dense, 0,
2382 ULINT_UNDEFINED, heap_status,
2383 index, offsets);
2384
2385 if (UNIV_UNLIKELY(!mod_log_ptr)) {
2386 return(FALSE);
2387 }
2388 page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2389 page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2390 }
2391
2392 if (UNIV_UNLIKELY
2393 (page_zip_get_trailer_len(page_zip,
2394 dict_index_is_clust(index))
2395 + page_zip->m_end >= page_zip_get_size(page_zip))) {
2396 page_zip_fail(("page_zip_decompress_node_ptrs:"
2397 " %lu + %lu >= %lu, %lu\n",
2398 (ulong) page_zip_get_trailer_len(
2399 page_zip, dict_index_is_clust(index)),
2400 (ulong) page_zip->m_end,
2401 (ulong) page_zip_get_size(page_zip),
2402 (ulong) dict_index_is_clust(index)));
2403 return(FALSE);
2404 }
2405
2406 /* Restore the uncompressed columns in heap_no order. */
2407 storage = page_zip_dir_start_low(page_zip, n_dense);
2408
2409 for (slot = 0; slot < n_dense; slot++) {
2410 rec_t* rec = recs[slot];
2411
2412 offsets = rec_get_offsets(rec, index, offsets, 0,
2413 ULINT_UNDEFINED, &heap);
2414 /* Non-leaf nodes should not have any externally
2415 stored columns. */
2416 ut_ad(!rec_offs_any_extern(offsets));
2417 storage -= REC_NODE_PTR_SIZE;
2418
2419 memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
2420 storage, REC_NODE_PTR_SIZE);
2421 }
2422
2423 return(TRUE);
2424 }
2425
2426 /**********************************************************************//**
2427 Decompress the records of a leaf node of a secondary index.
2428 @return TRUE on success, FALSE on failure */
2429 static
2430 ibool
page_zip_decompress_sec(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,rec_offs * offsets)2431 page_zip_decompress_sec(
2432 /*====================*/
2433 page_zip_des_t* page_zip, /*!< in/out: compressed page */
2434 z_stream* d_stream, /*!< in/out: compressed page stream */
2435 rec_t** recs, /*!< in: dense page directory
2436 sorted by address */
2437 ulint n_dense, /*!< in: size of recs[] */
2438 dict_index_t* index, /*!< in: the index of the page */
2439 rec_offs* offsets) /*!< in/out: temporary offsets */
2440 {
2441 ulint heap_status = REC_STATUS_ORDINARY
2442 | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2443 ulint slot;
2444
2445 ut_a(!dict_index_is_clust(index));
2446
2447 /* Subtract the space reserved for uncompressed data. */
2448 d_stream->avail_in -= static_cast<uint>(
2449 n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
2450
2451 for (slot = 0; slot < n_dense; slot++) {
2452 rec_t* rec = recs[slot];
2453
2454 /* Decompress everything up to this record. */
2455 d_stream->avail_out = static_cast<uint>(
2456 rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2457
2458 if (UNIV_LIKELY(d_stream->avail_out)) {
2459 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2460 case Z_STREAM_END:
2461 page_zip_decompress_heap_no(
2462 d_stream, rec, heap_status);
2463 goto zlib_done;
2464 case Z_OK:
2465 case Z_BUF_ERROR:
2466 if (!d_stream->avail_out) {
2467 break;
2468 }
2469 /* fall through */
2470 default:
2471 page_zip_fail(("page_zip_decompress_sec:"
2472 " inflate(Z_SYNC_FLUSH)=%s\n",
2473 d_stream->msg));
2474 goto zlib_error;
2475 }
2476 }
2477
2478 if (!page_zip_decompress_heap_no(
2479 d_stream, rec, heap_status)) {
2480 ut_ad(0);
2481 }
2482 }
2483
2484 /* Decompress the data of the last record and any trailing garbage,
2485 in case the last record was allocated from an originally longer space
2486 on the free list. */
2487 d_stream->avail_out = static_cast<uInt>(
2488 page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2489 - page_offset(d_stream->next_out));
2490 if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2491 - PAGE_ZIP_START - PAGE_DIR)) {
2492
2493 page_zip_fail(("page_zip_decompress_sec:"
2494 " avail_out = %u\n",
2495 d_stream->avail_out));
2496 goto zlib_error;
2497 }
2498
2499 if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2500 page_zip_fail(("page_zip_decompress_sec:"
2501 " inflate(Z_FINISH)=%s\n",
2502 d_stream->msg));
2503 zlib_error:
2504 inflateEnd(d_stream);
2505 return(FALSE);
2506 }
2507
2508 /* Note that d_stream->avail_out > 0 may hold here
2509 if the modification log is nonempty. */
2510
2511 zlib_done:
2512 if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2513 ut_error;
2514 }
2515
2516 {
2517 page_t* page = page_align(d_stream->next_out);
2518
2519 /* Clear the unused heap space on the uncompressed page. */
2520 memset(d_stream->next_out, 0,
2521 ulint(page_dir_get_nth_slot(page,
2522 page_dir_get_n_slots(page)
2523 - 1U)
2524 - d_stream->next_out));
2525 }
2526
2527 ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
2528
2529 /* Apply the modification log. */
2530 {
2531 const byte* mod_log_ptr;
2532 mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2533 d_stream->avail_in + 1,
2534 recs, n_dense,
2535 index->n_fields,
2536 ULINT_UNDEFINED, heap_status,
2537 index, offsets);
2538
2539 if (UNIV_UNLIKELY(!mod_log_ptr)) {
2540 return(FALSE);
2541 }
2542 page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2543 page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2544 }
2545
2546 if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
2547 + page_zip->m_end >= page_zip_get_size(page_zip))) {
2548
2549 page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
2550 (ulong) page_zip_get_trailer_len(
2551 page_zip, FALSE),
2552 (ulong) page_zip->m_end,
2553 (ulong) page_zip_get_size(page_zip)));
2554 return(FALSE);
2555 }
2556
2557 /* There are no uncompressed columns on leaf pages of
2558 secondary indexes. */
2559
2560 return(TRUE);
2561 }
2562
2563 /**********************************************************************//**
2564 Decompress a record of a leaf node of a clustered index that contains
2565 externally stored columns.
2566 @return TRUE on success */
2567 static
2568 ibool
page_zip_decompress_clust_ext(z_stream * d_stream,rec_t * rec,const rec_offs * offsets,ulint trx_id_col)2569 page_zip_decompress_clust_ext(
2570 /*==========================*/
2571 z_stream* d_stream, /*!< in/out: compressed page stream */
2572 rec_t* rec, /*!< in/out: record */
2573 const rec_offs* offsets, /*!< in: rec_get_offsets(rec) */
2574 ulint trx_id_col) /*!< in: position of of DB_TRX_ID */
2575 {
2576 ulint i;
2577
2578 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
2579 ulint len;
2580 byte* dst;
2581
2582 if (UNIV_UNLIKELY(i == trx_id_col)) {
2583 /* Skip trx_id and roll_ptr */
2584 dst = rec_get_nth_field(rec, offsets, i, &len);
2585 if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
2586 + DATA_ROLL_PTR_LEN)) {
2587
2588 page_zip_fail(("page_zip_decompress_clust_ext:"
2589 " len[%lu] = %lu\n",
2590 (ulong) i, (ulong) len));
2591 return(FALSE);
2592 }
2593
2594 if (rec_offs_nth_extern(offsets, i)) {
2595
2596 page_zip_fail(("page_zip_decompress_clust_ext:"
2597 " DB_TRX_ID at %lu is ext\n",
2598 (ulong) i));
2599 return(FALSE);
2600 }
2601
2602 d_stream->avail_out = static_cast<uInt>(
2603 dst - d_stream->next_out);
2604
2605 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2606 case Z_STREAM_END:
2607 case Z_OK:
2608 case Z_BUF_ERROR:
2609 if (!d_stream->avail_out) {
2610 break;
2611 }
2612 /* fall through */
2613 default:
2614 page_zip_fail(("page_zip_decompress_clust_ext:"
2615 " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2616 d_stream->msg));
2617 return(FALSE);
2618 }
2619
2620 ut_ad(d_stream->next_out == dst);
2621
2622 /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
2623 avoid uninitialized bytes in case the record
2624 is affected by page_zip_apply_log(). */
2625 memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2626
2627 d_stream->next_out += DATA_TRX_ID_LEN
2628 + DATA_ROLL_PTR_LEN;
2629 } else if (rec_offs_nth_extern(offsets, i)) {
2630 dst = rec_get_nth_field(rec, offsets, i, &len);
2631 ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
2632 dst += len - BTR_EXTERN_FIELD_REF_SIZE;
2633
2634 d_stream->avail_out = static_cast<uInt>(
2635 dst - d_stream->next_out);
2636 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2637 case Z_STREAM_END:
2638 case Z_OK:
2639 case Z_BUF_ERROR:
2640 if (!d_stream->avail_out) {
2641 break;
2642 }
2643 /* fall through */
2644 default:
2645 page_zip_fail(("page_zip_decompress_clust_ext:"
2646 " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2647 d_stream->msg));
2648 return(FALSE);
2649 }
2650
2651 ut_ad(d_stream->next_out == dst);
2652
2653 /* Clear the BLOB pointer in case
2654 the record will be deleted and the
2655 space will not be reused. Note that
2656 the final initialization of the BLOB
2657 pointers (copying from "externs"
2658 or clearing) will have to take place
2659 only after the page modification log
2660 has been applied. Otherwise, we
2661 could end up with an uninitialized
2662 BLOB pointer when a record is deleted,
2663 reallocated and deleted. */
2664 memset(d_stream->next_out, 0,
2665 BTR_EXTERN_FIELD_REF_SIZE);
2666 d_stream->next_out
2667 += BTR_EXTERN_FIELD_REF_SIZE;
2668 }
2669 }
2670
2671 return(TRUE);
2672 }
2673
2674 /**********************************************************************//**
2675 Compress the records of a leaf node of a clustered index.
2676 @return TRUE on success, FALSE on failure */
2677 static
2678 ibool
page_zip_decompress_clust(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,ulint trx_id_col,rec_offs * offsets,mem_heap_t * heap)2679 page_zip_decompress_clust(
2680 /*======================*/
2681 page_zip_des_t* page_zip, /*!< in/out: compressed page */
2682 z_stream* d_stream, /*!< in/out: compressed page stream */
2683 rec_t** recs, /*!< in: dense page directory
2684 sorted by address */
2685 ulint n_dense, /*!< in: size of recs[] */
2686 dict_index_t* index, /*!< in: the index of the page */
2687 ulint trx_id_col, /*!< index of the trx_id column */
2688 rec_offs* offsets, /*!< in/out: temporary offsets */
2689 mem_heap_t* heap) /*!< in: temporary memory heap */
2690 {
2691 int err;
2692 ulint slot;
2693 ulint heap_status = REC_STATUS_ORDINARY
2694 | PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2695 const byte* storage;
2696 const byte* externs;
2697
2698 ut_a(dict_index_is_clust(index));
2699
2700 /* Subtract the space reserved for uncompressed data. */
2701 d_stream->avail_in -= static_cast<uInt>(n_dense)
2702 * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
2703
2704 /* Decompress the records in heap_no order. */
2705 for (slot = 0; slot < n_dense; slot++) {
2706 rec_t* rec = recs[slot];
2707
2708 d_stream->avail_out =static_cast<uInt>(
2709 rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2710
2711 ut_ad(d_stream->avail_out < srv_page_size
2712 - PAGE_ZIP_START - PAGE_DIR);
2713 err = inflate(d_stream, Z_SYNC_FLUSH);
2714 switch (err) {
2715 case Z_STREAM_END:
2716 page_zip_decompress_heap_no(
2717 d_stream, rec, heap_status);
2718 goto zlib_done;
2719 case Z_OK:
2720 case Z_BUF_ERROR:
2721 if (UNIV_LIKELY(!d_stream->avail_out)) {
2722 break;
2723 }
2724 /* fall through */
2725 default:
2726 page_zip_fail(("page_zip_decompress_clust:"
2727 " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2728 d_stream->msg));
2729 goto zlib_error;
2730 }
2731
2732 if (!page_zip_decompress_heap_no(
2733 d_stream, rec, heap_status)) {
2734 ut_ad(0);
2735 }
2736
2737 /* Read the offsets. The status bits are needed here. */
2738 offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
2739 ULINT_UNDEFINED, &heap);
2740
2741 /* This is a leaf page in a clustered index. */
2742
2743 /* Check if there are any externally stored columns.
2744 For each externally stored column, restore the
2745 BTR_EXTERN_FIELD_REF separately. */
2746
2747 if (rec_offs_any_extern(offsets)) {
2748 if (UNIV_UNLIKELY
2749 (!page_zip_decompress_clust_ext(
2750 d_stream, rec, offsets, trx_id_col))) {
2751
2752 goto zlib_error;
2753 }
2754 } else {
2755 /* Skip trx_id and roll_ptr */
2756 ulint len;
2757 byte* dst = rec_get_nth_field(rec, offsets,
2758 trx_id_col, &len);
2759 if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
2760 + DATA_ROLL_PTR_LEN)) {
2761
2762 page_zip_fail(("page_zip_decompress_clust:"
2763 " len = %lu\n", (ulong) len));
2764 goto zlib_error;
2765 }
2766
2767 d_stream->avail_out = static_cast<uInt>(
2768 dst - d_stream->next_out);
2769
2770 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2771 case Z_STREAM_END:
2772 case Z_OK:
2773 case Z_BUF_ERROR:
2774 if (!d_stream->avail_out) {
2775 break;
2776 }
2777 /* fall through */
2778 default:
2779 page_zip_fail(("page_zip_decompress_clust:"
2780 " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2781 d_stream->msg));
2782 goto zlib_error;
2783 }
2784
2785 ut_ad(d_stream->next_out == dst);
2786
2787 /* Clear DB_TRX_ID and DB_ROLL_PTR in order to
2788 avoid uninitialized bytes in case the record
2789 is affected by page_zip_apply_log(). */
2790 memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2791
2792 d_stream->next_out += DATA_TRX_ID_LEN
2793 + DATA_ROLL_PTR_LEN;
2794 }
2795
2796 /* Decompress the last bytes of the record. */
2797 d_stream->avail_out = static_cast<uInt>(
2798 rec_get_end(rec, offsets) - d_stream->next_out);
2799
2800 switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2801 case Z_STREAM_END:
2802 case Z_OK:
2803 case Z_BUF_ERROR:
2804 if (!d_stream->avail_out) {
2805 break;
2806 }
2807 /* fall through */
2808 default:
2809 page_zip_fail(("page_zip_decompress_clust:"
2810 " 3 inflate(Z_SYNC_FLUSH)=%s\n",
2811 d_stream->msg));
2812 goto zlib_error;
2813 }
2814 }
2815
2816 /* Decompress any trailing garbage, in case the last record was
2817 allocated from an originally longer space on the free list. */
2818 d_stream->avail_out = static_cast<uInt>(
2819 page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2820 - page_offset(d_stream->next_out));
2821 if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2822 - PAGE_ZIP_START - PAGE_DIR)) {
2823
2824 page_zip_fail(("page_zip_decompress_clust:"
2825 " avail_out = %u\n",
2826 d_stream->avail_out));
2827 goto zlib_error;
2828 }
2829
2830 if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2831 page_zip_fail(("page_zip_decompress_clust:"
2832 " inflate(Z_FINISH)=%s\n",
2833 d_stream->msg));
2834 zlib_error:
2835 inflateEnd(d_stream);
2836 return(FALSE);
2837 }
2838
2839 /* Note that d_stream->avail_out > 0 may hold here
2840 if the modification log is nonempty. */
2841
2842 zlib_done:
2843 if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2844 ut_error;
2845 }
2846
2847 {
2848 page_t* page = page_align(d_stream->next_out);
2849
2850 /* Clear the unused heap space on the uncompressed page. */
2851 memset(d_stream->next_out, 0,
2852 ulint(page_dir_get_nth_slot(page,
2853 page_dir_get_n_slots(page)
2854 - 1U)
2855 - d_stream->next_out));
2856 }
2857
2858 ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
2859
2860 /* Apply the modification log. */
2861 {
2862 const byte* mod_log_ptr;
2863 mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2864 d_stream->avail_in + 1,
2865 recs, n_dense,
2866 index->n_fields,
2867 trx_id_col, heap_status,
2868 index, offsets);
2869
2870 if (UNIV_UNLIKELY(!mod_log_ptr)) {
2871 return(FALSE);
2872 }
2873 page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2874 page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2875 }
2876
2877 if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
2878 + page_zip->m_end >= page_zip_get_size(page_zip))) {
2879
2880 page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
2881 (ulong) page_zip_get_trailer_len(
2882 page_zip, TRUE),
2883 (ulong) page_zip->m_end,
2884 (ulong) page_zip_get_size(page_zip)));
2885 return(FALSE);
2886 }
2887
2888 storage = page_zip_dir_start_low(page_zip, n_dense);
2889
2890 externs = storage - n_dense
2891 * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2892
2893 /* Restore the uncompressed columns in heap_no order. */
2894
2895 for (slot = 0; slot < n_dense; slot++) {
2896 ulint i;
2897 ulint len;
2898 byte* dst;
2899 rec_t* rec = recs[slot];
2900 bool exists = !page_zip_dir_find_free(
2901 page_zip, page_offset(rec));
2902 offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
2903 ULINT_UNDEFINED, &heap);
2904
2905 dst = rec_get_nth_field(rec, offsets,
2906 trx_id_col, &len);
2907 ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2908 storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
2909 memcpy(dst, storage,
2910 DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2911
2912 /* Check if there are any externally stored
2913 columns in this record. For each externally
2914 stored column, restore or clear the
2915 BTR_EXTERN_FIELD_REF. */
2916 if (!rec_offs_any_extern(offsets)) {
2917 continue;
2918 }
2919
2920 for (i = 0; i < rec_offs_n_fields(offsets); i++) {
2921 if (!rec_offs_nth_extern(offsets, i)) {
2922 continue;
2923 }
2924 dst = rec_get_nth_field(rec, offsets, i, &len);
2925
2926 if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
2927 page_zip_fail(("page_zip_decompress_clust:"
2928 " %lu < 20\n",
2929 (ulong) len));
2930 return(FALSE);
2931 }
2932
2933 dst += len - BTR_EXTERN_FIELD_REF_SIZE;
2934
2935 if (UNIV_LIKELY(exists)) {
2936 /* Existing record:
2937 restore the BLOB pointer */
2938 externs -= BTR_EXTERN_FIELD_REF_SIZE;
2939
2940 if (UNIV_UNLIKELY
2941 (externs < page_zip->data
2942 + page_zip->m_end)) {
2943 page_zip_fail(("page_zip_"
2944 "decompress_clust:"
2945 " %p < %p + %lu\n",
2946 (const void*) externs,
2947 (const void*)
2948 page_zip->data,
2949 (ulong)
2950 page_zip->m_end));
2951 return(FALSE);
2952 }
2953
2954 memcpy(dst, externs,
2955 BTR_EXTERN_FIELD_REF_SIZE);
2956
2957 page_zip->n_blobs++;
2958 } else {
2959 /* Deleted record:
2960 clear the BLOB pointer */
2961 memset(dst, 0,
2962 BTR_EXTERN_FIELD_REF_SIZE);
2963 }
2964 }
2965 }
2966
2967 return(TRUE);
2968 }
2969
/**********************************************************************//**
Decompress a page. This function should tolerate errors on the compressed
page. Instead of letting assertions fail, it will return FALSE if an
inconsistency is detected.
@return TRUE on success, FALSE on failure */
static
ibool
page_zip_decompress_low(
/*====================*/
	page_zip_des_t*	page_zip,/*!< in: data, ssize;
				out: m_start, m_end, m_nonempty, n_blobs */
	page_t*		page,	/*!< out: uncompressed page, may be trashed */
	ibool		all)	/*!< in: TRUE=decompress the whole page;
				FALSE=verify but do not copy some
				page header fields that should not change
				after page creation */
{
	z_stream	d_stream;
	dict_index_t*	index	= NULL;
	rec_t**		recs;	/*!< dense page directory, sorted by address */
	ulint		n_dense;/* number of user records on the page */
	ulint		trx_id_col = ULINT_UNDEFINED;
	mem_heap_t*	heap;
	rec_offs*	offsets;

	ut_ad(page_zip_simple_validate(page_zip));
	MEM_CHECK_ADDRESSABLE(page, srv_page_size);
	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));

	/* The dense directory excludes the infimum and supremum records. */
	n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
			  >= page_zip_get_size(page_zip))) {
		/* The dense directory alone would not fit on the
		compressed page: the header must be corrupted. */
		page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
			       (ulong) n_dense,
			       (ulong) page_zip_get_size(page_zip)));
		return(FALSE);
	}

	/* The heap holds the dense directory (recs), the pre-allocated
	offsets array, and whatever zlib allocates through
	page_zip_set_alloc() below. */
	heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size);

	recs = static_cast<rec_t**>(
		mem_heap_alloc(heap, n_dense * sizeof *recs));

	if (all) {
		/* Copy the page header. */
		memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
	} else {
		/* Check that the bytes that we skip are identical. */
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(FIL_PAGE_TYPE + page,
			     FIL_PAGE_TYPE + page_zip->data,
			     PAGE_HEADER - FIL_PAGE_TYPE));
		ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
			     PAGE_HEADER + PAGE_LEVEL + page_zip->data,
			     PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		/* Copy the mutable parts of the page header. */
		memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
		memcpy_aligned<2>(PAGE_HEADER + page,
				  PAGE_HEADER + page_zip->data,
				  PAGE_LEVEL - PAGE_N_DIR_SLOTS);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the page headers match after copying. */
		ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
	}

#ifdef UNIV_ZIP_DEBUG
	/* Clear the uncompressed page, except the header. */
	memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA);
#endif /* UNIV_ZIP_DEBUG */
	MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA);

	/* Copy the page directory. */
	if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
					       n_dense))) {
zlib_error:
		mem_heap_free(heap);
		return(FALSE);
	}

	/* Copy the infimum and supremum records. */
	memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
	       infimum_extra, sizeof infimum_extra);
	if (page_is_empty(page)) {
		/* No user records: infimum points directly to supremum. */
		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
				      PAGE_NEW_SUPREMUM);
	} else {
		/* Link infimum to the first user record in key order,
		taken from slot 0 of the dense directory. */
		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
				      page_zip_dir_get(page_zip, 0)
				      & PAGE_ZIP_DIR_SLOT_MASK);
	}
	memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
	memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
			  + page, supremum_extra_data,
			  sizeof supremum_extra_data);

	/* Make zlib allocate its state from our heap. */
	page_zip_set_alloc(&d_stream, heap);

	d_stream.next_in = page_zip->data + PAGE_DATA;
	/* Subtract the space reserved for
	the page header and the end marker of the modification log. */
	d_stream.avail_in = static_cast<uInt>(
		page_zip_get_size(page_zip) - (PAGE_DATA + 1));
	d_stream.next_out = page + PAGE_ZIP_START;
	d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);

	if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
			  != Z_OK)) {
		ut_error;
	}

	/* Decode the zlib header and the index information.
	Z_BLOCK stops inflation at each compressed block boundary:
	the first call consumes the zlib stream header, the second
	the block holding the serialized index field descriptors. */
	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {

		page_zip_fail(("page_zip_decompress:"
			       " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
		goto zlib_error;
	}

	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {

		page_zip_fail(("page_zip_decompress:"
			       " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
		goto zlib_error;
	}

	/* Reconstruct a dummy index object from the field information
	that was stored at the start of the compressed stream. */
	index = page_zip_fields_decode(
		page + PAGE_ZIP_START, d_stream.next_out,
		page_is_leaf(page) ? &trx_id_col : NULL,
		fil_page_get_type(page) == FIL_PAGE_RTREE);

	if (UNIV_UNLIKELY(!index)) {

		goto zlib_error;
	}

	/* Decompress the user records. */
	page_zip->n_blobs = 0;
	d_stream.next_out = page + PAGE_ZIP_START;

	{
		/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
		ulint	n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
			+ dict_index_get_n_fields(index);

		offsets = static_cast<rec_offs*>(
			mem_heap_alloc(heap, n * sizeof(ulint)));

		rec_offs_set_n_alloc(offsets, n);
	}

	/* Decompress the records in heap_no order.
	The three page types store different columns outside the
	compressed stream, hence the three distinct code paths. */
	if (!page_is_leaf(page)) {
		/* This is a node pointer page. */
		ulint	info_bits;

		if (UNIV_UNLIKELY
		    (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
						    recs, n_dense, index,
						    offsets, heap))) {
			goto err_exit;
		}

		/* A page with no left sibling carries the
		"minimum record" flag on its first record. */
		info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG;

		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
							    info_bits))) {
			goto err_exit;
		}
	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
		/* This is a leaf page in a secondary index. */
		if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
							   recs, n_dense,
							   index, offsets))) {
			goto err_exit;
		}

		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
							    page, 0))) {
err_exit:
			page_zip_fields_free(index);
			mem_heap_free(heap);
			return(FALSE);
		}
	} else {
		/* This is a leaf page in a clustered index. */
		if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
							     &d_stream, recs,
							     n_dense, index,
							     trx_id_col,
							     offsets, heap))) {
			goto err_exit;
		}

		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
							    page, 0))) {
			goto err_exit;
		}
	}

	ut_a(page_is_comp(page));
	MEM_CHECK_DEFINED(page, srv_page_size);

	page_zip_fields_free(index);
	mem_heap_free(heap);

	return(TRUE);
}
3182
3183 /**********************************************************************//**
3184 Decompress a page. This function should tolerate errors on the compressed
3185 page. Instead of letting assertions fail, it will return FALSE if an
3186 inconsistency is detected.
3187 @return TRUE on success, FALSE on failure */
3188 ibool
page_zip_decompress(page_zip_des_t * page_zip,page_t * page,ibool all)3189 page_zip_decompress(
3190 /*================*/
3191 page_zip_des_t* page_zip,/*!< in: data, ssize;
3192 out: m_start, m_end, m_nonempty, n_blobs */
3193 page_t* page, /*!< out: uncompressed page, may be trashed */
3194 ibool all) /*!< in: TRUE=decompress the whole page;
3195 FALSE=verify but do not copy some
3196 page header fields that should not change
3197 after page creation */
3198 {
3199 const ulonglong ns = my_interval_timer();
3200
3201 if (!page_zip_decompress_low(page_zip, page, all)) {
3202 return(FALSE);
3203 }
3204
3205 const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
3206 page_zip_stat[page_zip->ssize - 1].decompressed++;
3207 page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
3208
3209 index_id_t index_id = btr_page_get_index_id(page);
3210
3211 if (srv_cmp_per_index_enabled) {
3212 mutex_enter(&page_zip_stat_per_index_mutex);
3213 page_zip_stat_per_index[index_id].decompressed++;
3214 page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
3215 mutex_exit(&page_zip_stat_per_index_mutex);
3216 }
3217
3218 /* Update the stat counter for LRU policy. */
3219 buf_LRU_stat_inc_unzip();
3220
3221 MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
3222
3223 return(TRUE);
3224 }
3225
3226 #ifdef UNIV_ZIP_DEBUG
3227 /**********************************************************************//**
3228 Dump a block of memory on the standard error stream. */
3229 static
3230 void
page_zip_hexdump_func(const char * name,const void * buf,ulint size)3231 page_zip_hexdump_func(
3232 /*==================*/
3233 const char* name, /*!< in: name of the data structure */
3234 const void* buf, /*!< in: data */
3235 ulint size) /*!< in: length of the data, in bytes */
3236 {
3237 const byte* s = static_cast<const byte*>(buf);
3238 ulint addr;
3239 const ulint width = 32; /* bytes per line */
3240
3241 fprintf(stderr, "%s:\n", name);
3242
3243 for (addr = 0; addr < size; addr += width) {
3244 ulint i;
3245
3246 fprintf(stderr, "%04lx ", (ulong) addr);
3247
3248 i = ut_min(width, size - addr);
3249
3250 while (i--) {
3251 fprintf(stderr, "%02x", *s++);
3252 }
3253
3254 putc('\n', stderr);
3255 }
3256 }
3257
3258 /** Dump a block of memory on the standard error stream.
3259 @param buf in: data
3260 @param size in: length of the data, in bytes */
3261 #define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
3262
3263 /** Flag: make page_zip_validate() compare page headers only */
3264 bool page_zip_validate_header_only;
3265
/**********************************************************************//**
Check that the compressed and decompressed pages match.
@return TRUE if valid, FALSE if not */
ibool
page_zip_validate_low(
/*==================*/
	const page_zip_des_t*	page_zip,/*!< in: compressed page */
	const page_t*		page,	/*!< in: uncompressed page */
	const dict_index_t*	index,	/*!< in: index of the page, if known */
	ibool			sloppy)	/*!< in: FALSE=strict,
					TRUE=ignore the MIN_REC_FLAG */
{
	page_zip_des_t	temp_page_zip;
	ibool		valid;

	/* First compare the uncompressed header fields that are stored
	verbatim on the compressed page. */
	if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
		   FIL_PAGE_LSN - FIL_PAGE_PREV)
	    || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
	    || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
		      PAGE_ROOT_AUTO_INC)
	    /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch
	    on the clustered index root page (page number 3 in .ibd files).
	    That allows concurrent readers (holding buf_block_t::lock S-latch).
	    Because we do not know what type of a latch our caller is holding,
	    we will ignore the field on clustered index root pages in order
	    to avoid false positives. */
	    || (page_get_page_no(page) != 3/* clustered index root page */
		&& memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC],
			  &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8))
	    || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
		      &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
		      PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) {
		page_zip_fail(("page_zip_validate: page header\n"));
		page_zip_hexdump(page_zip, sizeof *page_zip);
		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
		page_zip_hexdump(page, srv_page_size);
		return(FALSE);
	}

	ut_a(page_is_comp(page));

	if (page_zip_validate_header_only) {
		return(TRUE);
	}

	/* page_zip_decompress() expects the uncompressed page to be
	srv_page_size aligned. */
	page_t*	temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
							      srv_page_size));

	MEM_CHECK_DEFINED(page, srv_page_size);
	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));

	/* Decompress into a scratch page and compare the result with
	the uncompressed page as well as the bookkeeping fields of the
	page_zip descriptor. */
	temp_page_zip = *page_zip;
	valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
	if (!valid) {
		fputs("page_zip_validate(): failed to decompress\n", stderr);
		goto func_exit;
	}
	if (page_zip->n_blobs != temp_page_zip.n_blobs) {
		page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
			       page_zip->n_blobs, temp_page_zip.n_blobs));
		valid = FALSE;
	}
#ifdef UNIV_DEBUG
	if (page_zip->m_start != temp_page_zip.m_start) {
		page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
			       page_zip->m_start, temp_page_zip.m_start));
		valid = FALSE;
	}
#endif /* UNIV_DEBUG */
	if (page_zip->m_end != temp_page_zip.m_end) {
		page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
			       page_zip->m_end, temp_page_zip.m_end));
		valid = FALSE;
	}
	if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
		page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
			       page_zip->m_nonempty,
			       temp_page_zip.m_nonempty));
		valid = FALSE;
	}
	if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
		   srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) {

		/* In crash recovery, the "minimum record" flag may be
		set incorrectly until the mini-transaction is
		committed.  Let us tolerate that difference when we
		are performing a sloppy validation. */

		rec_offs*	offsets;
		mem_heap_t*	heap;
		const rec_t*	rec;
		const rec_t*	trec;
		byte		info_bits_diff;
		ulint		offset
			= rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
		ut_a(offset >= PAGE_NEW_SUPREMUM);
		offset -= 5/*REC_NEW_INFO_BITS*/;

		/* XOR reveals exactly which info bits differ in the
		first user record. */
		info_bits_diff = page[offset] ^ temp_page[offset];

		if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
			/* Patch the flag and retry the full comparison. */
			temp_page[offset] = page[offset];

			if (!memcmp(page + PAGE_HEADER,
				    temp_page + PAGE_HEADER,
				    srv_page_size - PAGE_HEADER
				    - FIL_PAGE_DATA_END)) {

				/* Only the minimum record flag
				differed.  Let us ignore it. */
				page_zip_fail(("page_zip_validate:"
					       " min_rec_flag"
					       " (%s" ULINTPF "," ULINTPF
					       ",0x%02x)\n",
					       sloppy ? "ignored, " : "",
					       page_get_space_id(page),
					       page_get_page_no(page),
					       page[offset]));
				/* We don't check for spatial index, since
				the "minimum record" could be deleted when
				doing rtr_update_mbr_field.
				GIS_FIXME: need to validate why
				rtr_update_mbr_field.() could affect this */
				if (index && dict_index_is_spatial(index)) {
					valid = true;
				} else {
					valid = sloppy;
				}
				goto func_exit;
			}
		}

		/* Compare the pointers in the PAGE_FREE list. */
		rec = page_header_get_ptr(page, PAGE_FREE);
		trec = page_header_get_ptr(temp_page, PAGE_FREE);

		while (rec || trec) {
			/* A NULL on one side yields page_offset 0 and is
			caught as a mismatch here. */
			if (page_offset(rec) != page_offset(trec)) {
				page_zip_fail(("page_zip_validate:"
					       " PAGE_FREE list: %u!=%u\n",
					       (unsigned) page_offset(rec),
					       (unsigned) page_offset(trec)));
				valid = FALSE;
				goto func_exit;
			}

			rec = page_rec_get_next_low(rec, TRUE);
			trec = page_rec_get_next_low(trec, TRUE);
		}

		/* Compare the records. */
		heap = NULL;
		offsets = NULL;
		rec = page_rec_get_next_low(
			page + PAGE_NEW_INFIMUM, TRUE);
		trec = page_rec_get_next_low(
			temp_page + PAGE_NEW_INFIMUM, TRUE);
		const ulint n_core = page_is_leaf(page) ? index->n_fields : 0;

		do {
			if (page_offset(rec) != page_offset(trec)) {
				page_zip_fail(("page_zip_validate:"
					       " record list: 0x%02x!=0x%02x\n",
					       (unsigned) page_offset(rec),
					       (unsigned) page_offset(trec)));
				valid = FALSE;
				break;
			}

			if (index) {
				/* Compare the data. */
				offsets = rec_get_offsets(
					rec, index, offsets, n_core,
					ULINT_UNDEFINED, &heap);

				if (memcmp(rec - rec_offs_extra_size(offsets),
					   trec - rec_offs_extra_size(offsets),
					   rec_offs_size(offsets))) {
					page_zip_fail(
						("page_zip_validate:"
						 " record content: 0x%02x",
						 (unsigned) page_offset(rec)));
					valid = FALSE;
					break;
				}
			}

			rec = page_rec_get_next_low(rec, TRUE);
			trec = page_rec_get_next_low(trec, TRUE);
		} while (rec || trec);

		if (heap) {
			mem_heap_free(heap);
		}
	}

func_exit:
	if (!valid) {
		page_zip_hexdump(page_zip, sizeof *page_zip);
		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
		page_zip_hexdump(page, srv_page_size);
		page_zip_hexdump(temp_page, srv_page_size);
	}
	aligned_free(temp_page);
	return(valid);
}
3474
3475 /**********************************************************************//**
3476 Check that the compressed and decompressed pages match.
3477 @return TRUE if valid, FALSE if not */
3478 ibool
page_zip_validate(const page_zip_des_t * page_zip,const page_t * page,const dict_index_t * index)3479 page_zip_validate(
3480 /*==============*/
3481 const page_zip_des_t* page_zip,/*!< in: compressed page */
3482 const page_t* page, /*!< in: uncompressed page */
3483 const dict_index_t* index) /*!< in: index of the page, if known */
3484 {
3485 return(page_zip_validate_low(page_zip, page, index,
3486 recv_recovery_is_on()));
3487 }
3488 #endif /* UNIV_ZIP_DEBUG */
3489
3490 #ifdef UNIV_DEBUG
3491 /**********************************************************************//**
3492 Assert that the compressed and decompressed page headers match.
3493 @return TRUE */
3494 static
3495 ibool
page_zip_header_cmp(const page_zip_des_t * page_zip,const byte * page)3496 page_zip_header_cmp(
3497 /*================*/
3498 const page_zip_des_t* page_zip,/*!< in: compressed page */
3499 const byte* page) /*!< in: uncompressed page */
3500 {
3501 ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
3502 FIL_PAGE_LSN - FIL_PAGE_PREV));
3503 ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
3504 2));
3505 ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
3506 PAGE_DATA - FIL_PAGE_DATA));
3507
3508 return(TRUE);
3509 }
3510 #endif /* UNIV_DEBUG */
3511
/**********************************************************************//**
Write a record on the compressed page that contains externally stored
columns.  The data must already have been written to the uncompressed page.
@return end of modification log */
static
byte*
page_zip_write_rec_ext(
/*===================*/
	buf_block_t*	block,		/*!< in/out: compressed page */
	const byte*	rec,		/*!< in: record being written */
	const dict_index_t*index,	/*!< in: record descriptor */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
	ulint		create,		/*!< in: nonzero=insert, zero=update */
	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
	ulint		heap_no,	/*!< in: heap number of rec */
	byte*		storage,	/*!< in: end of dense page directory */
	byte*		data,		/*!< in: end of modification log */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	const byte*	start	= rec;
	ulint		i;
	ulint		len;
	byte*		externs	= storage;
	ulint		n_ext	= rec_offs_n_extern(offsets);
	const page_t* const page = block->frame;
	page_zip_des_t* const page_zip = &block->page.zip;

	ut_ad(rec_offs_validate(rec, index, offsets));
	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
			  rec_offs_extra_size(offsets));

	/* The BLOB pointer array grows downwards, below the
	uncompressed DB_TRX_ID,DB_ROLL_PTR columns of all records. */
	externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
		* (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);

	/* Note that this will not take into account
	the BLOB columns of rec if create==TRUE. */
	ut_ad(data + rec_offs_data_size(offsets)
	      - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
	      - n_ext * FIELD_REF_SIZE
	      < externs - FIELD_REF_SIZE * page_zip->n_blobs);

	if (n_ext) {
		ulint	blob_no = page_zip_get_n_prev_extern(
			page_zip, rec, index);
		byte*	ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
		ut_ad(blob_no <= page_zip->n_blobs);
		externs -= blob_no * FIELD_REF_SIZE;

		if (create) {
			/* Inserting a record with BLOBs: make room in
			the BLOB pointer array by shifting the pointers
			of all subsequent records downwards. */
			page_zip->n_blobs = (page_zip->n_blobs + n_ext)
				& ((1U << 12) - 1);
			ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
			if (ulint len = ulint(externs - ext_end)) {
				byte*	ext_start = ext_end
					- n_ext * FIELD_REF_SIZE;
				memmove(ext_start, ext_end, len);
				/* Redo-log the move within the
				compressed page image. */
				mtr->memmove(*block,
					     ext_start - page_zip->data,
					     ext_end - page_zip->data, len);
			}
		}

		ut_a(blob_no + n_ext <= page_zip->n_blobs);
	}

	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
		const byte*	src;

		if (UNIV_UNLIKELY(i == trx_id_col)) {
			ut_ad(!rec_offs_nth_extern(offsets,
						   i));
			ut_ad(!rec_offs_nth_extern(offsets,
						   i + 1));
			/* Locate trx_id and roll_ptr. */
			src = rec_get_nth_field(rec, offsets,
						i, &len);
			ut_ad(len == DATA_TRX_ID_LEN);
			ut_ad(src + DATA_TRX_ID_LEN
			      == rec_get_nth_field(
				      rec, offsets,
				      i + 1, &len));
			ut_ad(len == DATA_ROLL_PTR_LEN);

			/* Log the preceding fields. */
			ASSERT_ZERO(data, src - start);
			memcpy(data, start, ulint(src - start));
			data += src - start;
			start = src + (DATA_TRX_ID_LEN
				       + DATA_ROLL_PTR_LEN);

			/* Store trx_id and roll_ptr uncompressed,
			in the slot reserved for this heap_no. */
			constexpr ulint sys_len = DATA_TRX_ID_LEN
				+ DATA_ROLL_PTR_LEN;
			byte* sys = storage - sys_len * (heap_no - 1);
			memcpy(sys, src, sys_len);
			i++; /* skip also roll_ptr */
			mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
		} else if (rec_offs_nth_extern(offsets, i)) {
			src = rec_get_nth_field(rec, offsets,
						i, &len);

			ut_ad(dict_index_is_clust(index));
			ut_ad(len >= FIELD_REF_SIZE);
			/* The BLOB pointer occupies the last
			FIELD_REF_SIZE bytes of the column. */
			src += len - FIELD_REF_SIZE;

			ASSERT_ZERO(data, src - start);
			memcpy(data, start, ulint(src - start));
			data += src - start;
			start = src + FIELD_REF_SIZE;

			/* Store the BLOB pointer. */
			externs -= FIELD_REF_SIZE;
			ut_ad(data < externs);
			memcpy(externs, src, FIELD_REF_SIZE);
			mtr->zmemcpy(*block, externs - page_zip->data,
				     FIELD_REF_SIZE);
		}
	}

	/* Log the last bytes of the record. */
	len = rec_offs_data_size(offsets) - ulint(start - rec);

	ASSERT_ZERO(data, len);
	memcpy(data, start, len);
	data += len;

	return(data);
}
3641
3642 /** Write an entire record to the ROW_FORMAT=COMPRESSED page.
3643 The data must already have been written to the uncompressed page.
3644 @param[in,out] block ROW_FORMAT=COMPRESSED page
3645 @param[in] rec record in the uncompressed page
3646 @param[in] index the index that the page belongs to
3647 @param[in] offsets rec_get_offsets(rec, index)
3648 @param[in] create nonzero=insert, zero=update
3649 @param[in,out] mtr mini-transaction */
page_zip_write_rec(buf_block_t * block,const byte * rec,const dict_index_t * index,const rec_offs * offsets,ulint create,mtr_t * mtr)3650 void page_zip_write_rec(buf_block_t *block, const byte *rec,
3651 const dict_index_t *index, const rec_offs *offsets,
3652 ulint create, mtr_t *mtr)
3653 {
3654 const page_t* const page = block->frame;
3655 page_zip_des_t* const page_zip = &block->page.zip;
3656 byte* data;
3657 byte* storage;
3658 ulint heap_no;
3659 byte* slot;
3660
3661 ut_ad(page_zip_simple_validate(page_zip));
3662 ut_ad(page_zip_get_size(page_zip)
3663 > PAGE_DATA + page_zip_dir_size(page_zip));
3664 ut_ad(rec_offs_comp(offsets));
3665 ut_ad(rec_offs_validate(rec, index, offsets));
3666
3667 ut_ad(page_zip->m_start >= PAGE_DATA);
3668
3669 ut_ad(page_zip_header_cmp(page_zip, page));
3670 ut_ad(page_simple_validate_new((page_t*) page));
3671
3672 MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3673 MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
3674 MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
3675 rec_offs_extra_size(offsets));
3676
3677 slot = page_zip_dir_find(page_zip, page_offset(rec));
3678 ut_a(slot);
3679 byte s = *slot;
3680 /* Copy the delete mark. */
3681 if (rec_get_deleted_flag(rec, TRUE)) {
3682 /* In delete-marked records, DB_TRX_ID must
3683 always refer to an existing undo log record.
3684 On non-leaf pages, the delete-mark flag is garbage. */
3685 ut_ad(!index->is_primary() || !page_is_leaf(page)
3686 || row_get_rec_trx_id(rec, index, offsets));
3687 s |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
3688 } else {
3689 s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
3690 }
3691
3692 if (s != *slot) {
3693 *slot = s;
3694 mtr->zmemcpy(*block, slot - page_zip->data, 1);
3695 }
3696
3697 ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
3698 ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size
3699 - PAGE_DIR - PAGE_DIR_SLOT_SIZE
3700 * page_dir_get_n_slots(page));
3701
3702 heap_no = rec_get_heap_no_new(rec);
3703 ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
3704 ut_ad(heap_no < page_dir_get_n_heap(page));
3705
3706 /* Append to the modification log. */
3707 data = page_zip->data + page_zip->m_end;
3708 ut_ad(!*data);
3709
3710 /* Identify the record by writing its heap number - 1.
3711 0 is reserved to indicate the end of the modification log. */
3712
3713 if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
3714 *data++ = (byte) (0x80 | (heap_no - 1) >> 7);
3715 ut_ad(!*data);
3716 }
3717 *data++ = (byte) ((heap_no - 1) << 1);
3718 ut_ad(!*data);
3719
3720 {
3721 const byte* start = rec - rec_offs_extra_size(offsets);
3722 const byte* b = rec - REC_N_NEW_EXTRA_BYTES;
3723
3724 /* Write the extra bytes backwards, so that
3725 rec_offs_extra_size() can be easily computed in
3726 page_zip_apply_log() by invoking
3727 rec_get_offsets_reverse(). */
3728
3729 while (b != start) {
3730 *data++ = *--b;
3731 ut_ad(!*data);
3732 }
3733 }
3734
3735 /* Write the data bytes. Store the uncompressed bytes separately. */
3736 storage = page_zip_dir_start(page_zip);
3737
3738 if (page_is_leaf(page)) {
3739 if (dict_index_is_clust(index)) {
3740 /* Store separately trx_id, roll_ptr and
3741 the BTR_EXTERN_FIELD_REF of each BLOB column. */
3742 if (rec_offs_any_extern(offsets)) {
3743 data = page_zip_write_rec_ext(
3744 block,
3745 rec, index, offsets, create,
3746 index->db_trx_id(), heap_no,
3747 storage, data, mtr);
3748 } else {
3749 /* Locate trx_id and roll_ptr. */
3750 ulint len;
3751 const byte* src
3752 = rec_get_nth_field(rec, offsets,
3753 index->db_trx_id(),
3754 &len);
3755 ut_ad(len == DATA_TRX_ID_LEN);
3756 ut_ad(src + DATA_TRX_ID_LEN
3757 == rec_get_nth_field(
3758 rec, offsets,
3759 index->db_roll_ptr(), &len));
3760 ut_ad(len == DATA_ROLL_PTR_LEN);
3761
3762 /* Log the preceding fields. */
3763 ASSERT_ZERO(data, src - rec);
3764 memcpy(data, rec, ulint(src - rec));
3765 data += src - rec;
3766
3767 /* Store trx_id and roll_ptr. */
3768 constexpr ulint sys_len
3769 = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
3770 byte* sys = storage - sys_len * (heap_no - 1);
3771 memcpy(sys, src, sys_len);
3772
3773 src += sys_len;
3774 mtr->zmemcpy(*block, sys - page_zip->data,
3775 sys_len);
3776 /* Log the last bytes of the record. */
3777 len = rec_offs_data_size(offsets)
3778 - ulint(src - rec);
3779
3780 ASSERT_ZERO(data, len);
3781 memcpy(data, src, len);
3782 data += len;
3783 }
3784 } else {
3785 /* Leaf page of a secondary index:
3786 no externally stored columns */
3787 ut_ad(!rec_offs_any_extern(offsets));
3788
3789 /* Log the entire record. */
3790 ulint len = rec_offs_data_size(offsets);
3791
3792 ASSERT_ZERO(data, len);
3793 memcpy(data, rec, len);
3794 data += len;
3795 }
3796 } else {
3797 /* This is a node pointer page. */
3798 /* Non-leaf nodes should not have any externally
3799 stored columns. */
3800 ut_ad(!rec_offs_any_extern(offsets));
3801
3802 /* Copy the data bytes, except node_ptr. */
3803 ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
3804 ut_ad(data + len < storage - REC_NODE_PTR_SIZE
3805 * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
3806 ASSERT_ZERO(data, len);
3807 memcpy(data, rec, len);
3808 data += len;
3809
3810 /* Copy the node pointer to the uncompressed area. */
3811 byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
3812 mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
3813 rec + len, REC_NODE_PTR_SIZE);
3814 }
3815
3816 ut_a(!*data);
3817 ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
3818 mtr->zmemcpy(*block, page_zip->m_end,
3819 data - page_zip->data - page_zip->m_end);
3820 page_zip->m_end = uint16_t(data - page_zip->data);
3821 page_zip->m_nonempty = TRUE;
3822
3823 #ifdef UNIV_ZIP_DEBUG
3824 ut_a(page_zip_validate(page_zip, page_align(rec), index));
3825 #endif /* UNIV_ZIP_DEBUG */
3826 }
3827
/**********************************************************************//**
Write a BLOB pointer of a record on the leaf page of a clustered index.
The information must already have been updated on the uncompressed page. */
void
page_zip_write_blob_ptr(
/*====================*/
	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
	const byte*	rec,	/*!< in/out: record whose data is being
				written */
	dict_index_t*	index,	/*!< in: index of the page */
	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
	ulint		n,	/*!< in: column index */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	const byte*	field;
	byte*		externs;
	const page_t* const page = block->frame;
	page_zip_des_t* const page_zip = &block->page.zip;
	ulint		blob_no;
	ulint		len;

	ut_ad(page_align(rec) == page);
	ut_ad(index != NULL);
	ut_ad(offsets != NULL);
	ut_ad(page_simple_validate_new((page_t*) page));
	ut_ad(page_zip_simple_validate(page_zip));
	ut_ad(page_zip_get_size(page_zip)
	      > PAGE_DATA + page_zip_dir_size(page_zip));
	ut_ad(rec_offs_comp(offsets));
	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(rec_offs_any_extern(offsets));
	ut_ad(rec_offs_nth_extern(offsets, n));

	ut_ad(page_zip->m_start >= PAGE_DATA);
	ut_ad(page_zip_header_cmp(page_zip, page));

	ut_ad(page_is_leaf(page));
	ut_ad(dict_index_is_clust(index));

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
			  rec_offs_extra_size(offsets));

	/* Ordinal position of this BLOB pointer on the page:
	count the externally stored columns of all preceding records,
	plus those of the preceding columns in this record. */
	blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
		+ rec_get_n_extern_new(rec, index, n);
	ut_a(blob_no < page_zip->n_blobs);

	/* The BLOB pointer array is stored at the end of the
	compressed page, below the per-record uncompressed slots. */
	externs = page_zip->data + page_zip_get_size(page_zip)
		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
		* PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;

	field = rec_get_nth_field(rec, offsets, n, &len);

	externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
	/* The BLOB pointer occupies the last
	BTR_EXTERN_FIELD_REF_SIZE bytes of the column. */
	field += len - BTR_EXTERN_FIELD_REF_SIZE;

	/* Copy the pointer from the uncompressed record into the
	compressed page, writing a redo log record if it changed. */
	mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
				       BTR_EXTERN_FIELD_REF_SIZE);

#ifdef UNIV_ZIP_DEBUG
	ut_a(page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
}
3892
/**********************************************************************//**
Write the node pointer of a record on a non-leaf compressed page. */
void
page_zip_write_node_ptr(
/*====================*/
	buf_block_t*	block,	/*!< in/out: compressed page */
	byte*		rec,	/*!< in/out: record */
	ulint		size,	/*!< in: data size of rec */
	ulint		ptr,	/*!< in: node pointer */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	byte*	field;
	byte*	storage;
	page_zip_des_t* const page_zip = &block->page.zip;

	ut_d(const page_t* const page = block->frame);
	ut_ad(page_simple_validate_new(page));
	ut_ad(page_zip_simple_validate(page_zip));
	ut_ad(page_zip_get_size(page_zip)
	      > PAGE_DATA + page_zip_dir_size(page_zip));
	ut_ad(page_rec_is_comp(rec));

	ut_ad(page_zip->m_start >= PAGE_DATA);
	ut_ad(page_zip_header_cmp(page_zip, page));

	ut_ad(!page_is_leaf(page));

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
	MEM_CHECK_DEFINED(rec, size);

	/* Node pointers are stored uncompressed, below the dense page
	directory, in a slot addressed by heap_no - 1. */
	storage = page_zip_dir_start(page_zip)
		- (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
	/* The node pointer is the last REC_NODE_PTR_SIZE bytes
	of the record data. */
	field = rec + size - REC_NODE_PTR_SIZE;

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
	ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
	compile_time_assert(REC_NODE_PTR_SIZE == 4);
	/* Update the uncompressed page, then copy the value to the
	compressed page while writing a redo log record. */
	mach_write_to_4(field, ptr);
	mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
}
3934
/** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
The values are written both into the record and into the dense array of
DB_TRX_ID,DB_ROLL_PTR columns preceding the dense page directory on the
compressed page, with the redo log record compacted when the new entry
shares a long prefix with the neighbouring entry.
@param[in,out]	block		ROW_FORMAT=COMPRESSED page
@param[in,out]	rec		record
@param[in]	offsets		rec_get_offsets(rec, index)
@param[in]	trx_id_col	field number of DB_TRX_ID (number of PK fields)
@param[in]	trx_id		DB_TRX_ID value (transaction identifier)
@param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
@param[in,out]	mtr		mini-transaction */
void
page_zip_write_trx_id_and_roll_ptr(
	buf_block_t*	block,
	byte*		rec,
	const rec_offs*	offsets,
	ulint		trx_id_col,
	trx_id_t	trx_id,
	roll_ptr_t	roll_ptr,
	mtr_t*		mtr)
{
	page_zip_des_t* const page_zip = &block->page.zip;

	ut_d(const page_t* const page = block->frame);
	ut_ad(page_align(rec) == page);
	ut_ad(page_simple_validate_new(page));
	ut_ad(page_zip_simple_validate(page_zip));
	ut_ad(page_zip_get_size(page_zip)
	      > PAGE_DATA + page_zip_dir_size(page_zip));
	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(rec_offs_comp(offsets));

	ut_ad(page_zip->m_start >= PAGE_DATA);
	ut_ad(page_zip_header_cmp(page_zip, page));

	/* DB_TRX_ID,DB_ROLL_PTR only exist on clustered index leaf pages. */
	ut_ad(page_is_leaf(page));

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));

	constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
	const ulint heap_no = rec_get_heap_no_new(rec);
	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
	/* Entry for this record in the dense array of DB_TRX_ID,
	DB_ROLL_PTR values stored immediately before the dense page
	directory; the array grows downwards by heap number. */
	byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;

	compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
	ulint len;
	byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
	ut_ad(len == DATA_TRX_ID_LEN);
	/* DB_ROLL_PTR must immediately follow DB_TRX_ID in the record. */
	ut_ad(field + DATA_TRX_ID_LEN
	      == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
	ut_ad(len == DATA_ROLL_PTR_LEN);
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
	/* The dense array copy must be in sync with the record. */
	ut_a(!memcmp(storage, field, sys_len));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
	compile_time_assert(DATA_TRX_ID_LEN == 6);
	mach_write_to_6(field, trx_id);
	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
	mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
	/* len = number of leading bytes that the new entry shares with
	the neighbouring entry, copied via MEMMOVE instead of WRITE. */
	len = 0;
	if (heap_no > PAGE_HEAP_NO_USER_LOW) {
		/* Entry of the record with the next smaller heap number
		(located right after storage, as the array grows down). */
		byte* prev = storage + sys_len;
		for (; len < sys_len && prev[len] == field[len]; len++);
		if (len > 4) {
			/* We save space by replacing a single record

			WRITE,offset(storage),byte[13]

			with up to two records:

			MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
			WRITE|0x80,0,byte[13-len]

			The single WRITE record would be x+13 bytes long (x>2).
			The MEMMOVE record would be x+1+1 = x+2 bytes, and
			the second WRITE would be 1+1+13-len = 15-len bytes.

			The total size is: x+13 versus x+2+15-len = x+17-len.
			To save space, we must have len>4. */
			memcpy(storage, prev, len);
			mtr->memmove(*block, ulint(storage - page_zip->data),
				     ulint(storage - page_zip->data) + sys_len,
				     len);
			storage += len;
			field += len;
			if (UNIV_LIKELY(len < sys_len)) {
				goto write;
			}
			/* The whole entry was covered by the MEMMOVE;
			nothing remains to be written. */
		} else {
			/* The shared prefix is too short for the MEMMOVE
			trick to pay off; log the whole entry. */
			len = 0;
			goto write;
		}
	} else {
write:
		/* Copy (and redo-log) the bytes that were not covered
		by the MEMMOVE record. */
		mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
					       sys_len - len);
	}
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
	ut_a(!memcmp(storage - len, field - len, sys_len));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
			  rec_offs_extra_size(offsets));
	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
}
4037
/**********************************************************************//**
Clear an area on the uncompressed and compressed page.
Do not clear the data payload, as that would grow the modification log.
Only the "uncompressed columns" (node pointer, or DB_TRX_ID,DB_ROLL_PTR
and BLOB pointers) are zeroed, both in the record and in the dense
arrays at the end of the compressed page. */
static
void
page_zip_clear_rec(
/*===============*/
	buf_block_t*	block,	/*!< in/out: compressed page */
	byte*		rec,	/*!< in: record to clear */
	const dict_index_t*	index,	/*!< in: index of rec */
	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
	mtr_t*	mtr)	/*!< in/out: mini-transaction */
{
	ulint	heap_no;
	byte*	storage;
	byte*	field;
	ulint	len;

	ut_ad(page_align(rec) == block->frame);
	page_zip_des_t* const page_zip = &block->page.zip;

	/* page_zip_validate() would fail here if a record
	containing externally stored columns is being deleted. */
	ut_ad(rec_offs_validate(rec, index, offsets));
	/* The record must already have been removed from the dense
	directory and added to the free list. */
	ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
	ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
	ut_ad(page_zip_header_cmp(page_zip, block->frame));

	heap_no = rec_get_heap_no_new(rec);
	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
			  rec_offs_extra_size(offsets));

	if (!page_is_leaf(block->frame)) {
		/* Clear node_ptr. On the compressed page,
		there is an array of node_ptr immediately before the
		dense page directory, at the very end of the page. */
		storage	= page_zip_dir_start(page_zip);
		ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) ==
		      rec_offs_n_fields(offsets) - 1);
		/* The node pointer is the last field of the record. */
		field	= rec_get_nth_field(rec, offsets,
					    rec_offs_n_fields(offsets) - 1,
					    &len);
		ut_ad(len == REC_NODE_PTR_SIZE);
		ut_ad(!rec_offs_any_extern(offsets));
		memset(field, 0, REC_NODE_PTR_SIZE);
		storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
		len = REC_NODE_PTR_SIZE;
clear_page_zip:
		/* Zero this record's entry in the dense array on the
		compressed page and write a MEMSET redo log record. */
		memset(storage, 0, len);
		mtr->memset(*block, storage - page_zip->data, len, 0);
	} else if (index->is_clust()) {
		/* Clear trx_id and roll_ptr. On the compressed page,
		there is an array of these fields immediately before the
		dense page directory, at the very end of the page. */
		const ulint	trx_id_pos
			= dict_col_get_clust_pos(
			dict_table_get_sys_col(
				index->table, DATA_TRX_ID), index);
		field	= rec_get_nth_field(rec, offsets, trx_id_pos, &len);
		ut_ad(len == DATA_TRX_ID_LEN);
		memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);

		if (rec_offs_any_extern(offsets)) {
			ulint	i;

			for (i = rec_offs_n_fields(offsets); i--; ) {
				/* Clear all BLOB pointers in order to make
				page_zip_validate() pass. */
				if (rec_offs_nth_extern(offsets, i)) {
					field = rec_get_nth_field(
						rec, offsets, i, &len);
					ut_ad(len
					      == BTR_EXTERN_FIELD_REF_SIZE);
					/* The BLOB pointer occupies the
					last bytes of the column. */
					memset(field + len
					       - BTR_EXTERN_FIELD_REF_SIZE,
					       0, BTR_EXTERN_FIELD_REF_SIZE);
				}
			}
		}

		len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
		storage = page_zip_dir_start(page_zip)
			- (heap_no - 1)
			* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
		/* Share the memset-and-log tail with the node_ptr case. */
		goto clear_page_zip;
	} else {
		/* Secondary index leaf page: no uncompressed column
		array exists, so there is nothing to clear. */
		ut_ad(!rec_offs_any_extern(offsets));
	}
}
4131
4132 /** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
4133 @param[in,out] block buffer block
4134 @param[in,out] rec record on a physical index page
4135 @param[in] flag the value of the delete-mark flag
4136 @param[in,out] mtr mini-transaction */
page_zip_rec_set_deleted(buf_block_t * block,rec_t * rec,bool flag,mtr_t * mtr)4137 void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
4138 mtr_t *mtr)
4139 {
4140 ut_ad(page_align(rec) == block->frame);
4141 byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
4142 byte b= *slot;
4143 if (flag)
4144 b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
4145 else
4146 b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
4147 mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
4148 #ifdef UNIV_ZIP_DEBUG
4149 ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
4150 #endif /* UNIV_ZIP_DEBUG */
4151 }
4152
4153 /**********************************************************************//**
4154 Write the "owned" flag of a record on a compressed page. The n_owned field
4155 must already have been written on the uncompressed page. */
4156 void
page_zip_rec_set_owned(buf_block_t * block,const byte * rec,ulint flag,mtr_t * mtr)4157 page_zip_rec_set_owned(
4158 /*===================*/
4159 buf_block_t* block, /*!< in/out: ROW_FORMAT=COMPRESSED page */
4160 const byte* rec, /*!< in: record on the uncompressed page */
4161 ulint flag, /*!< in: the owned flag (nonzero=TRUE) */
4162 mtr_t* mtr) /*!< in/out: mini-transaction */
4163 {
4164 ut_ad(page_align(rec) == block->frame);
4165 page_zip_des_t *const page_zip= &block->page.zip;
4166 byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
4167 MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4168 byte b= *slot;
4169 if (flag)
4170 b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
4171 else
4172 b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
4173 mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
4174 }
4175
/**********************************************************************//**
Insert a record to the dense page directory. The directory grows
downwards from the end of the compressed page; existing entries up to
the insertion point are shifted down by one slot and the shift is
redo-logged. */
void
page_zip_dir_insert(
/*================*/
	page_cur_t*	cursor,	/*!< in/out: page cursor */
	uint16_t	free_rec,/*!< in: record from which rec was
				allocated, or 0 */
	byte*		rec,	/*!< in: record to insert */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ut_ad(page_align(cursor->rec) == cursor->block->frame);
	ut_ad(page_align(rec) == cursor->block->frame);
	page_zip_des_t *const page_zip= &cursor->block->page.zip;

	ulint	n_dense;
	byte*	slot_rec;
	byte*	slot_free;

	/* rec must have been inserted immediately after cursor->rec. */
	ut_ad(cursor->rec != rec);
	ut_ad(page_rec_get_next_const(cursor->rec) == rec);
	ut_ad(page_zip_simple_validate(page_zip));

	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));

	if (page_rec_is_infimum(cursor->rec)) {
		/* Use the first slot. */
		slot_rec = page_zip->data + page_zip_get_size(page_zip);
	} else {
		byte*	end = page_zip->data + page_zip_get_size(page_zip);
		byte*	start = end - page_zip_dir_user_size(page_zip);

		if (UNIV_LIKELY(!free_rec)) {
			/* PAGE_N_RECS was already incremented
			in page_cur_insert_rec_zip(), but the
			dense directory slot at that position
			contains garbage. Skip it. */
			start += PAGE_ZIP_DIR_SLOT_SIZE;
		}

		/* The new entry will go immediately below the entry
		of the predecessor record. */
		slot_rec = page_zip_dir_find_low(start, end,
						 page_offset(cursor->rec));
		ut_a(slot_rec);
	}

	/* Read the old n_dense (n_heap may have been incremented). */
	n_dense = page_dir_get_n_heap(page_zip->data)
		- (PAGE_HEAP_NO_USER_LOW + 1U);

	if (UNIV_UNLIKELY(free_rec)) {
		/* The record was allocated from the free list.
		Shift the dense directory only up to that slot.
		Note that in this case, n_dense is actually
		off by one, because page_cur_insert_rec_zip()
		did not increment n_heap. */
		ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
		      + PAGE_HEAP_NO_USER_LOW);
		ut_ad(page_offset(rec) >= free_rec);
		slot_free = page_zip_dir_find(page_zip, free_rec);
		ut_ad(slot_free);
		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
	} else {
		/* The record was allocated from the heap.
		Shift the entire dense directory. */
		ut_ad(rec_get_heap_no_new(rec) == n_dense
		      + PAGE_HEAP_NO_USER_LOW);

		/* Shift to the end of the dense page directory. */
		slot_free = page_zip->data + page_zip_get_size(page_zip)
			- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
	}

	if (const ulint slot_len = ulint(slot_rec - slot_free)) {
		/* Shift the dense directory to allocate place for rec,
		and write a MEMMOVE record to the redo log. */
		memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
				   slot_free, slot_len);
		mtr->memmove(*cursor->block, (slot_free - page_zip->data)
			     - PAGE_ZIP_DIR_SLOT_SIZE,
			     slot_free - page_zip->data, slot_len);
	}

	/* Write the entry for the inserted record.
	The "owned" flag must be zero. */
	uint16_t offs = page_offset(rec);
	if (rec_get_deleted_flag(rec, true)) {
		/* Preserve the delete-mark flag in the directory entry. */
		offs |= PAGE_ZIP_DIR_SLOT_DEL;
	}

	mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs);
	/* Redo-log the new directory entry. */
	mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
		     - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
}
4268
/** Shift the dense page directory and the array of BLOB pointers
when a record is deleted. The record is linked into the PAGE_FREE list,
the PAGE_FREE/PAGE_GARBAGE/PAGE_N_RECS header fields are updated on both
the uncompressed and compressed copies, the dense directory is shifted,
and finally the record's uncompressed columns are cleared.
@param[in,out]	block	index page
@param[in,out]	rec	record being deleted
@param[in]	index	the index that the page belongs to
@param[in]	offsets	rec_get_offsets(rec, index)
@param[in]	free	previous start of the free list
@param[in,out]	mtr	mini-transaction */
void page_zip_dir_delete(buf_block_t *block, byte *rec,
                         const dict_index_t *index, const rec_offs *offsets,
                         const byte *free, mtr_t *mtr)
{
  ut_ad(page_align(rec) == block->frame);
  page_zip_des_t *const page_zip= &block->page.zip;

  ut_ad(rec_offs_validate(rec, index, offsets));
  ut_ad(rec_offs_comp(offsets));

  MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
  MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
  MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
		    rec_offs_extra_size(offsets));

  /* Link rec to the head of the PAGE_FREE list (REC_NEXT stores a
  relative offset; 0 terminates the list). */
  mach_write_to_2(rec - REC_NEXT,
		  free ? static_cast<uint16_t>(free - rec) : 0);
  byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
					block->frame);
  mtr->write<2>(*block, page_free, page_offset(rec));
  /* Account the freed record bytes in PAGE_GARBAGE. */
  byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
				      block->frame);
  mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
		mach_read_from_2(garbage));
  compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
  /* Mirror PAGE_FREE and PAGE_GARBAGE (adjacent 2-byte fields) to
  the compressed page header in one 4-byte copy. */
  memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
  byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
  ut_a(slot_rec);
  uint16_t n_recs= page_get_n_recs(block->frame);
  ut_ad(n_recs);
  ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
  /* This could not be done before page_zip_dir_find(). */
  byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
					  block->frame);
  mtr->write<2>(*block, page_n_recs, n_recs - 1U);
  memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
		    2);

  byte *slot_free;

  if (UNIV_UNLIKELY(!free))
    /* Make the last slot the start of the free list. */
    slot_free= page_zip->data + page_zip_get_size(page_zip) -
      PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
				PAGE_HEAP_NO_USER_LOW);
  else
  {
    slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
    ut_a(slot_free < slot_rec);
    /* Grow the free list by one slot by moving the start. */
    slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
  }

  const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
  if (slot_len)
  {
    /* Close the gap left by the deleted entry, and redo-log the
    shift as a MEMMOVE record. */
    memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
		       slot_len);
    mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
		 slot_free - page_zip->data, slot_len);
  }

  /* Write the entry for the deleted record.
  The "owned" and "deleted" flags will be cleared. */
  mach_write_to_2(slot_free, page_offset(rec));
  mtr->zmemcpy(*block, slot_free - page_zip->data, 2);

  if (const ulint n_ext= rec_offs_n_extern(offsets))
  {
    /* BLOB pointers are only stored on clustered index leaf pages. */
    ut_ad(index->is_primary());
    ut_ad(page_is_leaf(block->frame));

    /* Shift and zero fill the array of BLOB pointers. */
    ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
    ut_a(blob_no + n_ext <= page_zip->n_blobs);

    byte *externs= page_zip->data + page_zip_get_size(page_zip) -
      (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
      PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
    byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;

    /* Shift and zero fill the array. */
    if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
	BTR_EXTERN_FIELD_REF_SIZE)
    {
      memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
      mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
		   ext_end - page_zip->data, ext_len);
    }
    memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
    mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
    /* n_blobs is a 12-bit bitfield-style counter; keep it in range. */
    page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
  }

  /* The compression algorithm expects info_bits and n_owned
  to be 0 for deleted records. */
  rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */

  page_zip_clear_rec(block, rec, index, offsets, mtr);
}
4377
/**********************************************************************//**
Reorganize and compress a page. This is a low-level operation for
compressed pages, to be used when page_zip_compress() fails.
On success, redo log will be written.
The function btr_page_reorganize() should be preferred whenever possible.
IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
non-clustered index, the caller must update the insert buffer free
bits in the same mini-transaction in such a way that the modification
will be redo-logged.
@retval true on success
@retval false on failure; the block will be left intact */
bool
page_zip_reorganize(
	buf_block_t*	block,	/*!< in/out: page with compressed page;
				on the compressed page, in: size;
				out: data, n_blobs,
				m_start, m_end, m_nonempty */
	dict_index_t*	index,	/*!< in: index of the B-tree node */
	ulint		z_level,/*!< in: compression level */
	mtr_t*		mtr,	/*!< in: mini-transaction */
	bool		restore)/*!< whether to restore on failure */
{
	page_t*		page		= buf_block_get_frame(block);
	buf_block_t*	temp_block;
	page_t*		temp_page;

	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(block->page.zip.data);
	ut_ad(page_is_comp(page));
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(!index->table->is_temporary());
	/* Note that page_zip_validate(page_zip, page, index) may fail here. */
	MEM_CHECK_DEFINED(page, srv_page_size);
	MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
			  page_zip_get_size(buf_block_get_page_zip(block)));

	/* Disable logging: the page rebuild below must not be logged
	record by record; on success, page_zip_compress() will log the
	compressed page image instead. */
	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	temp_block = buf_block_alloc();
	/* The adaptive hash index entries point into the page that is
	about to be rebuilt; drop them first. */
	btr_search_drop_page_hash_index(block);
	temp_page = temp_block->frame;

	/* Copy the old page to temporary space */
	memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
					   srv_page_size);

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, true);
	if (index->is_spatial()) {
		/* page_create() produced a B-tree page; restore the
		R-tree page type and reset the split sequence number. */
		mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
		memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
				  page + FIL_PAGE_TYPE, 2);
		memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
		memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
	}

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
	memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
			  temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
	/* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
	ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
	      || page_get_max_trx_id(page) != 0);
	/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
	clustered index root pages. */
	ut_ad(page_get_max_trx_id(page) == 0
	      || (dict_index_is_clust(index)
		  ? !page_has_siblings(temp_page)
		  : page_is_leaf(temp_page)));

	/* Restore logging. */
	mtr_set_log_mode(mtr, log_mode);

	if (!page_zip_compress(block, index, z_level, mtr)) {
		if (restore) {
			/* Restore the old page and exit. */
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
			/* Check that the bytes that we skip are identical. */
			ut_a(!memcmp(page, temp_page, PAGE_HEADER));
			ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
				     PAGE_HEADER + PAGE_N_RECS + temp_page,
				     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
			ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
				     srv_page_size - FIL_PAGE_DATA_END
				     + temp_page,
				     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

			memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
			       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
			memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
			       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
			ut_a(!memcmp(page, temp_page, srv_page_size));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
		}

		buf_block_free(temp_block);
		return false;
	}

	/* Migrate the explicit lock bits from the old page image. */
	lock_move_reorganize_page(block, temp_block);

	buf_block_free(temp_block);
	return true;
}
4494
/**********************************************************************//**
Copy the records of a page byte for byte. Do not copy the page header
or trailer, except those B-tree header fields that are directly
related to the storage of records. Also copy PAGE_MAX_TRX_ID.
NOTE: The caller must update the lock table and the adaptive hash index. */
void
page_zip_copy_recs(
	buf_block_t*		block,		/*!< in/out: buffer block */
	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
	const page_t*		src,		/*!< in: page */
	dict_index_t*		index,		/*!< in: index of the B-tree */
	mtr_t*			mtr)		/*!< in: mini-transaction */
{
	page_t*		page		= block->frame;
	page_zip_des_t*	page_zip	= &block->page.zip;

	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
	ut_ad(!dict_index_is_ibuf(index));
	ut_ad(!index->table->is_temporary());
#ifdef UNIV_ZIP_DEBUG
	/* The B-tree operations that call this function may set
	FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
	mismatch. A strict page_zip_validate() will be executed later
	during the B-tree operations. */
	ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
#endif /* UNIV_ZIP_DEBUG */
	ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
	if (UNIV_UNLIKELY(src_zip->n_blobs)) {
		/* BLOB pointers only exist on clustered index leaf pages. */
		ut_a(page_is_leaf(src));
		ut_a(dict_index_is_clust(index));
	}

	MEM_CHECK_ADDRESSABLE(page, srv_page_size);
	MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip));
	MEM_CHECK_DEFINED(src, srv_page_size);
	MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip));

	/* Copy those B-tree page header fields that are related to
	the records stored in the page. Also copy the field
	PAGE_MAX_TRX_ID. Skip the rest of the page header and
	trailer. On the compressed page, there is no trailer. */
	compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
	memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
			  PAGE_HEADER_PRIV_END);
	memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
			  srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
	memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
			  PAGE_HEADER + src_zip->data,
			  PAGE_HEADER_PRIV_END);
	memcpy_aligned<2>(PAGE_DATA + page_zip->data,
			  PAGE_DATA + src_zip->data,
			  page_zip_get_size(page_zip) - PAGE_DATA);

	if (dict_index_is_clust(index)) {
		/* Reset the PAGE_ROOT_AUTO_INC field when copying
		from a root page. */
		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
				  + page, 0, 8);
		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
				  + page_zip->data, 0, 8);
	} else {
		/* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
		of secondary indexes, and 0 on others. */
		ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src));
	}

	/* Copy all fields of src_zip to page_zip, except the pointer
	to the compressed data page. */
	{
		page_zip_t*	data = page_zip->data;
		memcpy(page_zip, src_zip, sizeof *page_zip);
		page_zip->data = data;
	}
	ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
	      + page_zip->m_end < page_zip_get_size(page_zip));

	if (!page_is_leaf(src)
	    && UNIV_UNLIKELY(!page_has_prev(src))
	    && UNIV_LIKELY(page_has_prev(page))) {
		/* Clear the REC_INFO_MIN_REC_FLAG of the first user record.
		The destination page is not the leftmost page of its level,
		so its first record cannot be the minimum record. */
		ulint	offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
						 TRUE);
		if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
			rec_t*	rec = page + offs;
			ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
			     & REC_INFO_MIN_REC_FLAG);
			rec[-REC_N_NEW_EXTRA_BYTES]
				&= byte(~REC_INFO_MIN_REC_FLAG);
		}
	}

#ifdef UNIV_ZIP_DEBUG
	ut_a(page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	/* Write the compressed page image to the redo log. */
	page_zip_compress_write_log(block, index, mtr);
}
4592 #endif /* !UNIV_INNOCHECKSUM */
4593
4594 /** Calculate the compressed page checksum.
4595 @param[in] data compressed page
4596 @param[in] size size of compressed page
4597 @param[in] algo algorithm to use
4598 @return page checksum */
4599 uint32_t
page_zip_calc_checksum(const void * data,ulint size,srv_checksum_algorithm_t algo)4600 page_zip_calc_checksum(
4601 const void* data,
4602 ulint size,
4603 srv_checksum_algorithm_t algo)
4604 {
4605 uLong adler;
4606 const Bytef* s = static_cast<const byte*>(data);
4607
4608 /* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
4609 and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
4610
4611 switch (algo) {
4612 case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
4613 case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
4614 case SRV_CHECKSUM_ALGORITHM_CRC32:
4615 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
4616 ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4617 return ut_crc32(s + FIL_PAGE_OFFSET,
4618 FIL_PAGE_LSN - FIL_PAGE_OFFSET)
4619 ^ ut_crc32(s + FIL_PAGE_TYPE, 2)
4620 ^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
4621 size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4622 case SRV_CHECKSUM_ALGORITHM_INNODB:
4623 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
4624 ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4625
4626 adler = adler32(0L, s + FIL_PAGE_OFFSET,
4627 FIL_PAGE_LSN - FIL_PAGE_OFFSET);
4628 adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
4629 adler = adler32(
4630 adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
4631 static_cast<uInt>(size)
4632 - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4633
4634 return(uint32_t(adler));
4635 case SRV_CHECKSUM_ALGORITHM_NONE:
4636 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
4637 return(BUF_NO_CHECKSUM_MAGIC);
4638 /* no default so the compiler will emit a warning if new enum
4639 is added and not handled here */
4640 }
4641
4642 ut_error;
4643 return(0);
4644 }
4645
4646 /** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
4647 @param data ROW_FORMAT=COMPRESSED page
4648 @param size size of the page, in bytes
4649 @return whether the stored checksum matches innodb_checksum_algorithm */
page_zip_verify_checksum(const byte * data,size_t size)4650 bool page_zip_verify_checksum(const byte *data, size_t size)
4651 {
4652 const srv_checksum_algorithm_t curr_algo =
4653 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
4654
4655 if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
4656 return true;
4657 }
4658
4659 if (buf_is_zeroes(span<const byte>(data, size))) {
4660 return true;
4661 }
4662
4663 const uint32_t stored = mach_read_from_4(
4664 data + FIL_PAGE_SPACE_OR_CHKSUM);
4665
4666 uint32_t calc = page_zip_calc_checksum(data, size, curr_algo);
4667
4668 #ifdef UNIV_INNOCHECKSUM
4669 if (log_file) {
4670 fprintf(log_file, "page::" UINT32PF ";"
4671 " %s checksum: calculated = " UINT32PF ";"
4672 " recorded = " UINT32PF "\n", cur_page_num,
4673 buf_checksum_algorithm_name(
4674 static_cast<srv_checksum_algorithm_t>(
4675 srv_checksum_algorithm)),
4676 calc, stored);
4677 }
4678
4679 if (!strict_verify) {
4680 const uint32_t crc32 = page_zip_calc_checksum(
4681 data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
4682
4683 if (log_file) {
4684 fprintf(log_file, "page::" UINT32PF ": crc32 checksum:"
4685 " calculated = " UINT32PF "; recorded = " UINT32PF "\n",
4686 cur_page_num, crc32, stored);
4687 fprintf(log_file, "page::" UINT32PF ": none checksum:"
4688 " calculated = %lu; recorded = " UINT32PF "\n",
4689 cur_page_num, BUF_NO_CHECKSUM_MAGIC, stored);
4690 }
4691 }
4692 #endif /* UNIV_INNOCHECKSUM */
4693
4694 if (stored == calc) {
4695 return(TRUE);
4696 }
4697
4698 switch (curr_algo) {
4699 case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
4700 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
4701 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
4702 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
4703 return FALSE;
4704 case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
4705 case SRV_CHECKSUM_ALGORITHM_CRC32:
4706 if (stored == BUF_NO_CHECKSUM_MAGIC) {
4707 return(TRUE);
4708 }
4709
4710 return stored == page_zip_calc_checksum(
4711 data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
4712 case SRV_CHECKSUM_ALGORITHM_INNODB:
4713 if (stored == BUF_NO_CHECKSUM_MAGIC) {
4714 return TRUE;
4715 }
4716
4717 return stored == page_zip_calc_checksum(
4718 data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
4719 case SRV_CHECKSUM_ALGORITHM_NONE:
4720 return TRUE;
4721 }
4722
4723 return FALSE;
4724 }
4725