1 /*****************************************************************************
2 
3 Copyright (c) 2005, 2016, Oracle and/or its affiliates. All Rights Reserved.
4 Copyright (c) 2012, Facebook Inc.
5 Copyright (c) 2014, 2021, MariaDB Corporation.
6 
7 This program is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free Software
9 Foundation; version 2 of the License.
10 
11 This program is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14 
15 You should have received a copy of the GNU General Public License along with
16 this program; if not, write to the Free Software Foundation, Inc.,
17 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
18 
19 *****************************************************************************/
20 
21 /**************************************************//**
22 @file page/page0zip.cc
23 Compressed page interface
24 
25 Created June 2005 by Marko Makela
26 *******************************************************/
27 
28 #include "page0zip.h"
29 #include "fsp0types.h"
30 #include "page0page.h"
31 #include "buf0checksum.h"
32 #include "ut0crc32.h"
33 #include "zlib.h"
34 #include "span.h"
35 
36 using st_::span;
37 
38 #ifndef UNIV_INNOCHECKSUM
39 #include "mtr0log.h"
40 #include "dict0dict.h"
41 #include "btr0cur.h"
42 #include "log0recv.h"
43 #include "row0row.h"
44 #include "btr0sea.h"
45 #include "dict0boot.h"
46 #include "lock0lock.h"
47 #include "srv0srv.h"
48 #include "buf0lru.h"
49 #include "srv0mon.h"
50 
51 #include <map>
52 #include <algorithm>
53 
54 /** Statistics on compression, indexed by page_zip_des_t::ssize - 1 */
55 page_zip_stat_t		page_zip_stat[PAGE_ZIP_SSIZE_MAX];
56 /** Statistics on compression, indexed by index->id */
57 page_zip_stat_per_index_t	page_zip_stat_per_index;
58 
59 /** Compression level to be used by zlib. Settable by user. */
60 uint	page_zip_level;
61 
62 /* Please refer to ../include/page0zip.ic for a description of the
63 compressed page format. */
64 
65 /* The infimum and supremum records are omitted from the compressed page.
66 On compress, we compare that the records are there, and on uncompress we
67 restore the records. */
68 /** Extra bytes of an infimum record */
69 static const byte infimum_extra[] = {
70 	0x01,			/* info_bits=0, n_owned=1 */
71 	0x00, 0x02		/* heap_no=0, status=2 */
72 	/* ?, ?	*/		/* next=(first user rec, or supremum) */
73 };
74 /** Data bytes of an infimum record */
75 static const byte infimum_data[] = {
76 	0x69, 0x6e, 0x66, 0x69,
77 	0x6d, 0x75, 0x6d, 0x00	/* "infimum\0" */
78 };
79 /** Extra bytes and data bytes of a supremum record */
80 static const byte supremum_extra_data alignas(4) [] = {
81 	/* 0x0?, */		/* info_bits=0, n_owned=1..8 */
82 	0x00, 0x0b,		/* heap_no=1, status=3 */
83 	0x00, 0x00,		/* next=0 */
84 	0x73, 0x75, 0x70, 0x72,
85 	0x65, 0x6d, 0x75, 0x6d	/* "supremum" */
86 };
87 
88 /** Assert that a block of memory is filled with zero bytes.
89 @param b in: memory block
90 @param s in: size of the memory block, in bytes */
91 #define ASSERT_ZERO(b, s) ut_ad(!memcmp(b, field_ref_zero, s))
92 /** Assert that a BLOB pointer is filled with zero bytes.
93 @param b in: BLOB pointer */
94 #define ASSERT_ZERO_BLOB(b) ASSERT_ZERO(b, FIELD_REF_SIZE)
95 
96 /* Enable some extra debugging output.  This code can be enabled
97 independently of any UNIV_ debugging conditions. */
98 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
99 # include <stdarg.h>
100 MY_ATTRIBUTE((format (printf, 1, 2)))
101 /**********************************************************************//**
102 Report a failure to decompress or compress.
103 @return number of characters printed */
104 static
105 int
page_zip_fail_func(const char * fmt,...)106 page_zip_fail_func(
107 /*===============*/
108 	const char*	fmt,	/*!< in: printf(3) format string */
109 	...)			/*!< in: arguments corresponding to fmt */
110 {
111 	int	res;
112 	va_list	ap;
113 
114 	ut_print_timestamp(stderr);
115 	fputs("  InnoDB: ", stderr);
116 	va_start(ap, fmt);
117 	res = vfprintf(stderr, fmt, ap);
118 	va_end(ap);
119 
120 	return(res);
121 }
122 /** Wrapper for page_zip_fail_func()
123 @param fmt_args in: printf(3) format string and arguments */
124 # define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
125 #else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
126 /** Dummy wrapper for page_zip_fail_func()
127 @param fmt_args ignored: printf(3) format string and arguments */
128 # define page_zip_fail(fmt_args) /* empty */
129 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
130 
131 /**********************************************************************//**
132 Determine the guaranteed free space on an empty page.
133 @return minimum payload size on the page */
134 ulint
page_zip_empty_size(ulint n_fields,ulint zip_size)135 page_zip_empty_size(
136 /*================*/
137 	ulint	n_fields,	/*!< in: number of columns in the index */
138 	ulint	zip_size)	/*!< in: compressed page size in bytes */
139 {
140 	ulint	size = zip_size
141 		/* subtract the page header and the longest
142 		uncompressed data needed for one record */
143 		- (PAGE_DATA
144 		   + PAGE_ZIP_CLUST_LEAF_SLOT_SIZE
145 		   + 1/* encoded heap_no==2 in page_zip_write_rec() */
146 		   + 1/* end of modification log */
147 		   - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
148 		/* subtract the space for page_zip_fields_encode() */
149 		- compressBound(static_cast<uLong>(2 * (n_fields + 1)));
150 	return(lint(size) > 0 ? size : 0);
151 }
152 
153 /** Check whether a tuple is too big for compressed table
154 @param[in]	index	dict index object
155 @param[in]	entry	entry for the index
156 @return	true if it's too big, otherwise false */
157 bool
page_zip_is_too_big(const dict_index_t * index,const dtuple_t * entry)158 page_zip_is_too_big(
159 	const dict_index_t*	index,
160 	const dtuple_t*		entry)
161 {
162 	const ulint zip_size = index->table->space->zip_size();
163 
164 	/* Estimate the free space of an empty compressed page.
165 	Subtract one byte for the encoded heap_no in the
166 	modification log. */
167 	ulint	free_space_zip = page_zip_empty_size(
168 		index->n_fields, zip_size);
169 	ulint	n_uniq = dict_index_get_n_unique_in_tree(index);
170 
171 	ut_ad(dict_table_is_comp(index->table));
172 	ut_ad(zip_size);
173 
174 	if (free_space_zip == 0) {
175 		return(true);
176 	}
177 
178 	/* Subtract one byte for the encoded heap_no in the
179 	modification log. */
180 	free_space_zip--;
181 
182 	/* There should be enough room for two node pointer
183 	records on an empty non-leaf page.  This prevents
184 	infinite page splits. */
185 
186 	if (entry->n_fields >= n_uniq
187 	    && (REC_NODE_PTR_SIZE
188 		+ rec_get_converted_size_comp_prefix(
189 			index, entry->fields, n_uniq, NULL)
190 		/* On a compressed page, there is
191 		a two-byte entry in the dense
192 		page directory for every record.
193 		But there is no record header. */
194 		- (REC_N_NEW_EXTRA_BYTES - 2)
195 		> free_space_zip / 2)) {
196 		return(true);
197 	}
198 
199 	return(false);
200 }
201 
202 /*************************************************************//**
203 Gets the number of elements in the dense page directory,
204 including deleted records (the free list).
205 @return number of elements in the dense page directory */
206 UNIV_INLINE
207 ulint
page_zip_dir_elems(const page_zip_des_t * page_zip)208 page_zip_dir_elems(
209 /*===============*/
210 	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
211 {
212 	/* Exclude the page infimum and supremum from the record count. */
213 	return ulint(page_dir_get_n_heap(page_zip->data))
214 		- PAGE_HEAP_NO_USER_LOW;
215 }
216 
217 /*************************************************************//**
218 Gets the size of the compressed page trailer (the dense page directory),
219 including deleted records (the free list).
220 @return length of dense page directory, in bytes */
221 UNIV_INLINE
222 ulint
page_zip_dir_size(const page_zip_des_t * page_zip)223 page_zip_dir_size(
224 /*==============*/
225 	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
226 {
227 	return(PAGE_ZIP_DIR_SLOT_SIZE * page_zip_dir_elems(page_zip));
228 }
229 
230 /*************************************************************//**
231 Gets an offset to the compressed page trailer (the dense page directory),
232 including deleted records (the free list).
233 @return offset of the dense page directory */
234 UNIV_INLINE
235 ulint
page_zip_dir_start_offs(const page_zip_des_t * page_zip,ulint n_dense)236 page_zip_dir_start_offs(
237 /*====================*/
238 	const page_zip_des_t*	page_zip,	/*!< in: compressed page */
239 	ulint			n_dense)	/*!< in: directory size */
240 {
241 	ut_ad(n_dense * PAGE_ZIP_DIR_SLOT_SIZE < page_zip_get_size(page_zip));
242 
243 	return(page_zip_get_size(page_zip) - n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
244 }
245 
246 /*************************************************************//**
247 Gets a pointer to the compressed page trailer (the dense page directory),
248 including deleted records (the free list).
249 @param[in] page_zip compressed page
250 @param[in] n_dense number of entries in the directory
251 @return pointer to the dense page directory */
252 #define page_zip_dir_start_low(page_zip, n_dense)			\
253 	((page_zip)->data + page_zip_dir_start_offs(page_zip, n_dense))
254 /*************************************************************//**
255 Gets a pointer to the compressed page trailer (the dense page directory),
256 including deleted records (the free list).
257 @param[in] page_zip compressed page
258 @return pointer to the dense page directory */
259 #define page_zip_dir_start(page_zip)					\
260 	page_zip_dir_start_low(page_zip, page_zip_dir_elems(page_zip))
261 
262 /*************************************************************//**
263 Gets the size of the compressed page trailer (the dense page directory),
264 only including user records (excluding the free list).
265 @return length of dense page directory comprising existing records, in bytes */
266 UNIV_INLINE
267 ulint
page_zip_dir_user_size(const page_zip_des_t * page_zip)268 page_zip_dir_user_size(
269 /*===================*/
270 	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
271 {
272 	ulint	size = PAGE_ZIP_DIR_SLOT_SIZE
273 		* ulint(page_get_n_recs(page_zip->data));
274 	ut_ad(size <= page_zip_dir_size(page_zip));
275 	return(size);
276 }
277 
278 /*************************************************************//**
279 Find the slot of the given record in the dense page directory.
280 @return dense directory slot, or NULL if record not found */
281 UNIV_INLINE
282 byte*
page_zip_dir_find_low(byte * slot,byte * end,ulint offset)283 page_zip_dir_find_low(
284 /*==================*/
285 	byte*	slot,			/*!< in: start of records */
286 	byte*	end,			/*!< in: end of records */
287 	ulint	offset)			/*!< in: offset of user record */
288 {
289 	ut_ad(slot <= end);
290 
291 	for (; slot < end; slot += PAGE_ZIP_DIR_SLOT_SIZE) {
292 		if ((mach_read_from_2(slot) & PAGE_ZIP_DIR_SLOT_MASK)
293 		    == offset) {
294 			return(slot);
295 		}
296 	}
297 
298 	return(NULL);
299 }
300 
301 /*************************************************************//**
302 Find the slot of the given non-free record in the dense page directory.
303 @return dense directory slot, or NULL if record not found */
304 UNIV_INLINE
305 byte*
page_zip_dir_find(page_zip_des_t * page_zip,ulint offset)306 page_zip_dir_find(
307 /*==============*/
308 	page_zip_des_t*	page_zip,		/*!< in: compressed page */
309 	ulint		offset)			/*!< in: offset of user record */
310 {
311 	byte*	end	= page_zip->data + page_zip_get_size(page_zip);
312 
313 	ut_ad(page_zip_simple_validate(page_zip));
314 
315 	return(page_zip_dir_find_low(end - page_zip_dir_user_size(page_zip),
316 				     end,
317 				     offset));
318 }
319 
320 /*************************************************************//**
321 Find the slot of the given free record in the dense page directory.
322 @return dense directory slot, or NULL if record not found */
323 UNIV_INLINE
324 byte*
page_zip_dir_find_free(page_zip_des_t * page_zip,ulint offset)325 page_zip_dir_find_free(
326 /*===================*/
327 	page_zip_des_t*	page_zip,		/*!< in: compressed page */
328 	ulint		offset)			/*!< in: offset of user record */
329 {
330 	byte*	end	= page_zip->data + page_zip_get_size(page_zip);
331 
332 	ut_ad(page_zip_simple_validate(page_zip));
333 
334 	return(page_zip_dir_find_low(end - page_zip_dir_size(page_zip),
335 				     end - page_zip_dir_user_size(page_zip),
336 				     offset));
337 }
338 
339 /*************************************************************//**
340 Read a given slot in the dense page directory.
341 @return record offset on the uncompressed page, possibly ORed with
342 PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
343 UNIV_INLINE
344 ulint
page_zip_dir_get(const page_zip_des_t * page_zip,ulint slot)345 page_zip_dir_get(
346 /*=============*/
347 	const page_zip_des_t*	page_zip,	/*!< in: compressed page */
348 	ulint			slot)		/*!< in: slot
349 						(0=first user record) */
350 {
351 	ut_ad(page_zip_simple_validate(page_zip));
352 	ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
353 	return(mach_read_from_2(page_zip->data + page_zip_get_size(page_zip)
354 				- PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
355 }
356 
357 /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
358 @param[in]      b       ROW_FORMAT=COMPRESSED index page
359 @param[in]      offset  byte offset from b.zip.data
360 @param[in]      len     length of the data to write */
zmemcpy(const buf_block_t & b,ulint offset,ulint len)361 inline void mtr_t::zmemcpy(const buf_block_t &b, ulint offset, ulint len)
362 {
363   ut_ad(fil_page_get_type(b.page.zip.data) == FIL_PAGE_INDEX ||
364         fil_page_get_type(b.page.zip.data) == FIL_PAGE_RTREE);
365   ut_ad(page_zip_simple_validate(&b.page.zip));
366   ut_ad(offset + len <= page_zip_get_size(&b.page.zip));
367 
368   memcpy_low(b, static_cast<uint16_t>(offset), &b.page.zip.data[offset], len);
369   m_last_offset= static_cast<uint16_t>(offset + len);
370 }
371 
372 /** Write a byte string to a ROW_FORMAT=COMPRESSED page.
373 @param[in]      b       ROW_FORMAT=COMPRESSED index page
374 @param[in]      dest    destination within b.zip.data
375 @param[in]      str     the data to write
376 @param[in]      len     length of the data to write
377 @tparam w       write request type */
378 template<mtr_t::write_type w>
zmemcpy(const buf_block_t & b,void * dest,const void * str,ulint len)379 inline void mtr_t::zmemcpy(const buf_block_t &b, void *dest, const void *str,
380                            ulint len)
381 {
382   byte *d= static_cast<byte*>(dest);
383   const byte *s= static_cast<const byte*>(str);
384   ut_ad(d >= b.page.zip.data + FIL_PAGE_OFFSET);
385   if (w != FORCED)
386   {
387     ut_ad(len);
388     const byte *const end= d + len;
389     while (*d++ == *s++)
390     {
391       if (d == end)
392       {
393         ut_ad(w == MAYBE_NOP);
394         return;
395       }
396     }
397     s--;
398     d--;
399     len= static_cast<ulint>(end - d);
400   }
401   ::memcpy(d, s, len);
402   zmemcpy(b, d - b.page.zip.data, len);
403 }
404 
405 /** Write redo log for compressing a ROW_FORMAT=COMPRESSED index page.
406 @param[in,out]	block	ROW_FORMAT=COMPRESSED index page
407 @param[in]	index	the index that the block belongs to
408 @param[in,out]	mtr	mini-transaction */
page_zip_compress_write_log(buf_block_t * block,dict_index_t * index,mtr_t * mtr)409 static void page_zip_compress_write_log(buf_block_t *block,
410                                         dict_index_t *index, mtr_t *mtr)
411 {
412   ut_ad(!index->is_ibuf());
413 
414   if (mtr->get_log_mode() != MTR_LOG_ALL)
415   {
416     ut_ad(mtr->get_log_mode() == MTR_LOG_NONE ||
417           mtr->get_log_mode() == MTR_LOG_NO_REDO);
418     return;
419   }
420 
421   const page_t *page= block->frame;
422   const page_zip_des_t *page_zip= &block->page.zip;
423   /* Read the number of user records. */
424   ulint trailer_size= ulint(page_dir_get_n_heap(page_zip->data)) -
425     PAGE_HEAP_NO_USER_LOW;
426   /* Multiply by uncompressed of size stored per record */
427   if (!page_is_leaf(page))
428     trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
429   else if (index->is_clust())
430     trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE + DATA_TRX_ID_LEN +
431       DATA_ROLL_PTR_LEN;
432   else
433     trailer_size*= PAGE_ZIP_DIR_SLOT_SIZE;
434   /* Add the space occupied by BLOB pointers. */
435   trailer_size+= page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
436   ut_a(page_zip->m_end > PAGE_DATA);
437   compile_time_assert(FIL_PAGE_DATA <= PAGE_DATA);
438   ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
439 
440   mtr->init(block);
441   mtr->zmemcpy(*block, FIL_PAGE_PREV, page_zip->m_end - FIL_PAGE_PREV);
442 
443   if (trailer_size)
444     mtr->zmemcpy(*block, page_zip_get_size(page_zip) - trailer_size,
445                  trailer_size);
446   block->page.status = buf_page_t::INIT_ON_FLUSH; /* because of mtr_t::init() */
447 }
448 
449 /******************************************************//**
450 Determine how many externally stored columns are contained
451 in existing records with smaller heap_no than rec. */
452 static
453 ulint
page_zip_get_n_prev_extern(const page_zip_des_t * page_zip,const rec_t * rec,const dict_index_t * index)454 page_zip_get_n_prev_extern(
455 /*=======================*/
456 	const page_zip_des_t*	page_zip,/*!< in: dense page directory on
457 					compressed page */
458 	const rec_t*		rec,	/*!< in: compact physical record
459 					on a B-tree leaf page */
460 	const dict_index_t*	index)	/*!< in: record descriptor */
461 {
462 	const page_t*	page	= page_align(rec);
463 	ulint		n_ext	= 0;
464 	ulint		i;
465 	ulint		left;
466 	ulint		heap_no;
467 	ulint		n_recs	= page_get_n_recs(page_zip->data);
468 
469 	ut_ad(page_is_leaf(page));
470 	ut_ad(page_is_comp(page));
471 	ut_ad(dict_table_is_comp(index->table));
472 	ut_ad(dict_index_is_clust(index));
473 	ut_ad(!dict_index_is_ibuf(index));
474 
475 	heap_no = rec_get_heap_no_new(rec);
476 	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
477 	left = heap_no - PAGE_HEAP_NO_USER_LOW;
478 	if (UNIV_UNLIKELY(!left)) {
479 		return(0);
480 	}
481 
482 	for (i = 0; i < n_recs; i++) {
483 		const rec_t*	r	= page + (page_zip_dir_get(page_zip, i)
484 						  & PAGE_ZIP_DIR_SLOT_MASK);
485 
486 		if (rec_get_heap_no_new(r) < heap_no) {
487 			n_ext += rec_get_n_extern_new(r, index,
488 						      ULINT_UNDEFINED);
489 			if (!--left) {
490 				break;
491 			}
492 		}
493 	}
494 
495 	return(n_ext);
496 }
497 
498 /**********************************************************************//**
499 Encode the length of a fixed-length column.
500 @return buf + length of encoded val */
501 static
502 byte*
page_zip_fixed_field_encode(byte * buf,ulint val)503 page_zip_fixed_field_encode(
504 /*========================*/
505 	byte*	buf,	/*!< in: pointer to buffer where to write */
506 	ulint	val)	/*!< in: value to write */
507 {
508 	ut_ad(val >= 2);
509 
510 	if (UNIV_LIKELY(val < 126)) {
511 		/*
512 		0 = nullable variable field of at most 255 bytes length;
513 		1 = not null variable field of at most 255 bytes length;
514 		126 = nullable variable field with maximum length >255;
515 		127 = not null variable field with maximum length >255
516 		*/
517 		*buf++ = (byte) val;
518 	} else {
519 		*buf++ = (byte) (0x80 | val >> 8);
520 		*buf++ = (byte) val;
521 	}
522 
523 	return(buf);
524 }
525 
526 /**********************************************************************//**
527 Write the index information for the compressed page.
528 @return used size of buf */
529 ulint
page_zip_fields_encode(ulint n,const dict_index_t * index,ulint trx_id_pos,byte * buf)530 page_zip_fields_encode(
531 /*===================*/
532 	ulint			n,	/*!< in: number of fields
533 					to compress */
534 	const dict_index_t*	index,	/*!< in: index comprising
535 					at least n fields */
536 	ulint			trx_id_pos,
537 					/*!< in: position of the trx_id column
538 					in the index, or ULINT_UNDEFINED if
539 					this is a non-leaf page */
540 	byte*			buf)	/*!< out: buffer of (n + 1) * 2 bytes */
541 {
542 	const byte*	buf_start	= buf;
543 	ulint		i;
544 	ulint		col;
545 	ulint		trx_id_col	= 0;
546 	/* sum of lengths of preceding non-nullable fixed fields, or 0 */
547 	ulint		fixed_sum	= 0;
548 
549 	ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
550 
551 	for (i = col = 0; i < n; i++) {
552 		dict_field_t*	field = dict_index_get_nth_field(index, i);
553 		ulint		val;
554 
555 		if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
556 			val = 1; /* set the "not nullable" flag */
557 		} else {
558 			val = 0; /* nullable field */
559 		}
560 
561 		if (!field->fixed_len) {
562 			/* variable-length field */
563 			const dict_col_t*	column
564 				= dict_field_get_col(field);
565 
566 			if (DATA_BIG_COL(column)) {
567 				val |= 0x7e; /* max > 255 bytes */
568 			}
569 
570 			if (fixed_sum) {
571 				/* write out the length of any
572 				preceding non-nullable fields */
573 				buf = page_zip_fixed_field_encode(
574 					buf, fixed_sum << 1 | 1);
575 				fixed_sum = 0;
576 				col++;
577 			}
578 
579 			*buf++ = (byte) val;
580 			col++;
581 		} else if (val) {
582 			/* fixed-length non-nullable field */
583 
584 			if (fixed_sum && UNIV_UNLIKELY
585 			    (fixed_sum + field->fixed_len
586 			     > DICT_MAX_FIXED_COL_LEN)) {
587 				/* Write out the length of the
588 				preceding non-nullable fields,
589 				to avoid exceeding the maximum
590 				length of a fixed-length column. */
591 				buf = page_zip_fixed_field_encode(
592 					buf, fixed_sum << 1 | 1);
593 				fixed_sum = 0;
594 				col++;
595 			}
596 
597 			if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
598 				if (fixed_sum) {
599 					/* Write out the length of any
600 					preceding non-nullable fields,
601 					and start a new trx_id column. */
602 					buf = page_zip_fixed_field_encode(
603 						buf, fixed_sum << 1 | 1);
604 					col++;
605 				}
606 
607 				trx_id_col = col;
608 				fixed_sum = field->fixed_len;
609 			} else {
610 				/* add to the sum */
611 				fixed_sum += field->fixed_len;
612 			}
613 		} else {
614 			/* fixed-length nullable field */
615 
616 			if (fixed_sum) {
617 				/* write out the length of any
618 				preceding non-nullable fields */
619 				buf = page_zip_fixed_field_encode(
620 					buf, fixed_sum << 1 | 1);
621 				fixed_sum = 0;
622 				col++;
623 			}
624 
625 			buf = page_zip_fixed_field_encode(
626 				buf, ulint(field->fixed_len) << 1);
627 			col++;
628 		}
629 	}
630 
631 	if (fixed_sum) {
632 		/* Write out the lengths of last fixed-length columns. */
633 		buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
634 	}
635 
636 	if (trx_id_pos != ULINT_UNDEFINED) {
637 		/* Write out the position of the trx_id column */
638 		i = trx_id_col;
639 	} else {
640 		/* Write out the number of nullable fields */
641 		i = index->n_nullable;
642 	}
643 
644 	if (i < 128) {
645 		*buf++ = (byte) i;
646 	} else {
647 		*buf++ = (byte) (0x80 | i >> 8);
648 		*buf++ = (byte) i;
649 	}
650 
651 	ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
652 	return((ulint) (buf - buf_start));
653 }
654 
655 /**********************************************************************//**
656 Populate the dense page directory from the sparse directory. */
657 static
658 void
page_zip_dir_encode(const page_t * page,byte * buf,const rec_t ** recs)659 page_zip_dir_encode(
660 /*================*/
661 	const page_t*	page,	/*!< in: compact page */
662 	byte*		buf,	/*!< in: pointer to dense page directory[-1];
663 				out: dense directory on compressed page */
664 	const rec_t**	recs)	/*!< in: pointer to an array of 0, or NULL;
665 				out: dense page directory sorted by ascending
666 				address (and heap_no) */
667 {
668 	const byte*	rec;
669 	ulint		status;
670 	ulint		min_mark;
671 	ulint		heap_no;
672 	ulint		i;
673 	ulint		n_heap;
674 	ulint		offs;
675 
676 	min_mark = 0;
677 
678 	if (page_is_leaf(page)) {
679 		status = REC_STATUS_ORDINARY;
680 	} else {
681 		status = REC_STATUS_NODE_PTR;
682 		if (UNIV_UNLIKELY(!page_has_prev(page))) {
683 			min_mark = REC_INFO_MIN_REC_FLAG;
684 		}
685 	}
686 
687 	n_heap = page_dir_get_n_heap(page);
688 
689 	/* Traverse the list of stored records in the collation order,
690 	starting from the first user record. */
691 
692 	rec = page + PAGE_NEW_INFIMUM;
693 
694 	i = 0;
695 
696 	for (;;) {
697 		ulint	info_bits;
698 		offs = rec_get_next_offs(rec, TRUE);
699 		if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
700 			break;
701 		}
702 		rec = page + offs;
703 		heap_no = rec_get_heap_no_new(rec);
704 		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
705 		ut_a(heap_no < n_heap);
706 		ut_a(offs < srv_page_size - PAGE_DIR);
707 		ut_a(offs >= PAGE_ZIP_START);
708 		compile_time_assert(!(PAGE_ZIP_DIR_SLOT_MASK
709 				      & (PAGE_ZIP_DIR_SLOT_MASK + 1)));
710 		compile_time_assert(PAGE_ZIP_DIR_SLOT_MASK
711 				    >= UNIV_ZIP_SIZE_MAX - 1);
712 
713 		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) != 0)) {
714 			offs |= PAGE_ZIP_DIR_SLOT_OWNED;
715 		}
716 
717 		info_bits = rec_get_info_bits(rec, TRUE);
718 		if (info_bits & REC_INFO_DELETED_FLAG) {
719 			info_bits &= ~REC_INFO_DELETED_FLAG;
720 			offs |= PAGE_ZIP_DIR_SLOT_DEL;
721 		}
722 		ut_a(info_bits == min_mark);
723 		/* Only the smallest user record can have
724 		REC_INFO_MIN_REC_FLAG set. */
725 		min_mark = 0;
726 
727 		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
728 
729 		if (UNIV_LIKELY_NULL(recs)) {
730 			/* Ensure that each heap_no occurs at most once. */
731 			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
732 			/* exclude infimum and supremum */
733 			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
734 		}
735 
736 		ut_a(ulint(rec_get_status(rec)) == status);
737 	}
738 
739 	offs = page_header_get_field(page, PAGE_FREE);
740 
741 	/* Traverse the free list (of deleted records). */
742 	while (offs) {
743 		ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
744 		rec = page + offs;
745 
746 		heap_no = rec_get_heap_no_new(rec);
747 		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
748 		ut_a(heap_no < n_heap);
749 
750 		ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
751 		ut_a(ulint(rec_get_status(rec)) == status);
752 
753 		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
754 
755 		if (UNIV_LIKELY_NULL(recs)) {
756 			/* Ensure that each heap_no occurs at most once. */
757 			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
758 			/* exclude infimum and supremum */
759 			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
760 		}
761 
762 		offs = rec_get_next_offs(rec, TRUE);
763 	}
764 
765 	/* Ensure that each heap no occurs at least once. */
766 	ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
767 }
768 
769 extern "C" {
770 
771 /**********************************************************************//**
772 Allocate memory for zlib. */
773 static
774 void*
page_zip_zalloc(void * opaque,uInt items,uInt size)775 page_zip_zalloc(
776 /*============*/
777 	void*	opaque,	/*!< in/out: memory heap */
778 	uInt	items,	/*!< in: number of items to allocate */
779 	uInt	size)	/*!< in: size of an item in bytes */
780 {
781 	return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque), items * size));
782 }
783 
784 /**********************************************************************//**
785 Deallocate memory for zlib. */
786 static
787 void
page_zip_free(void * opaque MY_ATTRIBUTE ((unused)),void * address MY_ATTRIBUTE ((unused)))788 page_zip_free(
789 /*==========*/
790 	void*	opaque MY_ATTRIBUTE((unused)),	/*!< in: memory heap */
791 	void*	address MY_ATTRIBUTE((unused)))/*!< in: object to free */
792 {
793 }
794 
795 } /* extern "C" */
796 
797 /**********************************************************************//**
798 Configure the zlib allocator to use the given memory heap. */
799 void
page_zip_set_alloc(void * stream,mem_heap_t * heap)800 page_zip_set_alloc(
801 /*===============*/
802 	void*		stream,		/*!< in/out: zlib stream */
803 	mem_heap_t*	heap)		/*!< in: memory heap to use */
804 {
805 	z_stream*	strm = static_cast<z_stream*>(stream);
806 
807 	strm->zalloc = page_zip_zalloc;
808 	strm->zfree = page_zip_free;
809 	strm->opaque = heap;
810 }
811 
812 #if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
813 /** Symbol for enabling compression and decompression diagnostics */
814 # define PAGE_ZIP_COMPRESS_DBG
815 #endif
816 
817 #ifdef PAGE_ZIP_COMPRESS_DBG
818 /** Set this variable in a debugger to enable
819 excessive logging in page_zip_compress(). */
820 static bool	page_zip_compress_dbg;
821 /** Set this variable in a debugger to enable
822 binary logging of the data passed to deflate().
823 When this variable is nonzero, it will act
824 as a log file name generator. */
825 static unsigned	page_zip_compress_log;
826 
827 /**********************************************************************//**
828 Wrapper for deflate().  Log the operation if page_zip_compress_dbg is set.
829 @return deflate() status: Z_OK, Z_BUF_ERROR, ... */
830 static
831 int
page_zip_compress_deflate(FILE * logfile,z_streamp strm,int flush)832 page_zip_compress_deflate(
833 /*======================*/
834 	FILE*		logfile,/*!< in: log file, or NULL */
835 	z_streamp	strm,	/*!< in/out: compressed stream for deflate() */
836 	int		flush)	/*!< in: deflate() flushing method */
837 {
838 	int	status;
839 	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
840 		ut_print_buf(stderr, strm->next_in, strm->avail_in);
841 	}
842 	if (UNIV_LIKELY_NULL(logfile)) {
843 		if (fwrite(strm->next_in, 1, strm->avail_in, logfile)
844 		    != strm->avail_in) {
845 			perror("fwrite");
846 		}
847 	}
848 	status = deflate(strm, flush);
849 	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
850 		fprintf(stderr, " -> %d\n", status);
851 	}
852 	return(status);
853 }
854 
855 /* Redefine deflate(). */
856 # undef deflate
857 /** Debug wrapper for the zlib compression routine deflate().
858 Log the operation if page_zip_compress_dbg is set.
859 @param strm in/out: compressed stream
860 @param flush in: flushing method
861 @return deflate() status: Z_OK, Z_BUF_ERROR, ... */
862 # define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
863 /** Declaration of the logfile parameter */
864 # define FILE_LOGFILE FILE* logfile,
865 /** The logfile parameter */
866 # define LOGFILE logfile,
867 #else /* PAGE_ZIP_COMPRESS_DBG */
868 /** Empty declaration of the logfile parameter */
869 # define FILE_LOGFILE
870 /** Missing logfile parameter */
871 # define LOGFILE
872 #endif /* PAGE_ZIP_COMPRESS_DBG */
873 
874 /**********************************************************************//**
875 Compress the records of a node pointer page.
876 @return Z_OK, or a zlib error code */
877 static
878 int
page_zip_compress_node_ptrs(FILE_LOGFILE z_stream * c_stream,const rec_t ** recs,ulint n_dense,dict_index_t * index,byte * storage,mem_heap_t * heap)879 page_zip_compress_node_ptrs(
880 /*========================*/
881 	FILE_LOGFILE
882 	z_stream*	c_stream,	/*!< in/out: compressed page stream */
883 	const rec_t**	recs,		/*!< in: dense page directory
884 					sorted by address */
885 	ulint		n_dense,	/*!< in: size of recs[] */
886 	dict_index_t*	index,		/*!< in: the index of the page */
887 	byte*		storage,	/*!< in: end of dense page directory */
888 	mem_heap_t*	heap)		/*!< in: temporary memory heap */
889 {
890 	int	err	= Z_OK;
891 	rec_offs* offsets = NULL;
892 
893 	do {
894 		const rec_t*	rec = *recs++;
895 
896 		offsets = rec_get_offsets(rec, index, offsets, 0,
897 					  ULINT_UNDEFINED, &heap);
898 		/* Only leaf nodes may contain externally stored columns. */
899 		ut_ad(!rec_offs_any_extern(offsets));
900 
901 		MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
902 		MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
903 				  rec_offs_extra_size(offsets));
904 
905 		/* Compress the extra bytes. */
906 		c_stream->avail_in = static_cast<uInt>(
907 			rec - REC_N_NEW_EXTRA_BYTES - c_stream->next_in);
908 
909 		if (c_stream->avail_in) {
910 			err = deflate(c_stream, Z_NO_FLUSH);
911 			if (UNIV_UNLIKELY(err != Z_OK)) {
912 				break;
913 			}
914 		}
915 		ut_ad(!c_stream->avail_in);
916 
917 		/* Compress the data bytes, except node_ptr. */
918 		c_stream->next_in = (byte*) rec;
919 		c_stream->avail_in = static_cast<uInt>(
920 			rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
921 
922 		if (c_stream->avail_in) {
923 			err = deflate(c_stream, Z_NO_FLUSH);
924 			if (UNIV_UNLIKELY(err != Z_OK)) {
925 				break;
926 			}
927 		}
928 
929 		ut_ad(!c_stream->avail_in);
930 
931 		memcpy(storage - REC_NODE_PTR_SIZE
932 		       * (rec_get_heap_no_new(rec) - 1),
933 		       c_stream->next_in, REC_NODE_PTR_SIZE);
934 		c_stream->next_in += REC_NODE_PTR_SIZE;
935 	} while (--n_dense);
936 
937 	return(err);
938 }
939 
940 /**********************************************************************//**
941 Compress the records of a leaf node of a secondary index.
942 @return Z_OK, or a zlib error code */
943 static
944 int
page_zip_compress_sec(FILE_LOGFILE z_stream * c_stream,const rec_t ** recs,ulint n_dense)945 page_zip_compress_sec(
946 /*==================*/
947 	FILE_LOGFILE
948 	z_stream*	c_stream,	/*!< in/out: compressed page stream */
949 	const rec_t**	recs,		/*!< in: dense page directory
950 					sorted by address */
951 	ulint		n_dense)	/*!< in: size of recs[] */
952 {
953 	int		err	= Z_OK;
954 
955 	ut_ad(n_dense > 0);
956 
957 	do {
958 		const rec_t*	rec = *recs++;
959 
960 		/* Compress everything up to this record. */
961 		c_stream->avail_in = static_cast<uInt>(
962 			rec - REC_N_NEW_EXTRA_BYTES
963 			- c_stream->next_in);
964 
965 		if (UNIV_LIKELY(c_stream->avail_in != 0)) {
966 			MEM_CHECK_DEFINED(c_stream->next_in,
967 					  c_stream->avail_in);
968 			err = deflate(c_stream, Z_NO_FLUSH);
969 			if (UNIV_UNLIKELY(err != Z_OK)) {
970 				break;
971 			}
972 		}
973 
974 		ut_ad(!c_stream->avail_in);
975 		ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
976 
977 		/* Skip the REC_N_NEW_EXTRA_BYTES. */
978 
979 		c_stream->next_in = (byte*) rec;
980 	} while (--n_dense);
981 
982 	return(err);
983 }
984 
985 /**********************************************************************//**
986 Compress a record of a leaf node of a clustered index that contains
987 externally stored columns.
988 @return Z_OK, or a zlib error code */
989 static
990 int
page_zip_compress_clust_ext(FILE_LOGFILE z_stream * c_stream,const rec_t * rec,const rec_offs * offsets,ulint trx_id_col,byte * deleted,byte * storage,byte ** externs,ulint * n_blobs)991 page_zip_compress_clust_ext(
992 /*========================*/
993 	FILE_LOGFILE
994 	z_stream*	c_stream,	/*!< in/out: compressed page stream */
995 	const rec_t*	rec,		/*!< in: record */
996 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
997 	ulint		trx_id_col,	/*!< in: position of of DB_TRX_ID */
998 	byte*		deleted,	/*!< in: dense directory entry pointing
999 					to the head of the free list */
1000 	byte*		storage,	/*!< in: end of dense page directory */
1001 	byte**		externs,	/*!< in/out: pointer to the next
1002 					available BLOB pointer */
1003 	ulint*		n_blobs)	/*!< in/out: number of
1004 					externally stored columns */
1005 {
1006 	int	err;
1007 	ulint	i;
1008 
1009 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
1010 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
1011 			  rec_offs_extra_size(offsets));
1012 
1013 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
1014 		ulint		len;
1015 		const byte*	src;
1016 
1017 		if (UNIV_UNLIKELY(i == trx_id_col)) {
1018 			ut_ad(!rec_offs_nth_extern(offsets, i));
1019 			/* Store trx_id and roll_ptr
1020 			in uncompressed form. */
1021 			src = rec_get_nth_field(rec, offsets, i, &len);
1022 			ut_ad(src + DATA_TRX_ID_LEN
1023 			      == rec_get_nth_field(rec, offsets,
1024 						   i + 1, &len));
1025 			ut_ad(len == DATA_ROLL_PTR_LEN);
1026 
1027 			/* Compress any preceding bytes. */
1028 			c_stream->avail_in = static_cast<uInt>(
1029 				src - c_stream->next_in);
1030 
1031 			if (c_stream->avail_in) {
1032 				err = deflate(c_stream, Z_NO_FLUSH);
1033 				if (UNIV_UNLIKELY(err != Z_OK)) {
1034 
1035 					return(err);
1036 				}
1037 			}
1038 
1039 			ut_ad(!c_stream->avail_in);
1040 			ut_ad(c_stream->next_in == src);
1041 
1042 			memcpy(storage
1043 			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
1044 			       * (rec_get_heap_no_new(rec) - 1),
1045 			       c_stream->next_in,
1046 			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
1047 
1048 			c_stream->next_in
1049 				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
1050 
1051 			/* Skip also roll_ptr */
1052 			i++;
1053 		} else if (rec_offs_nth_extern(offsets, i)) {
1054 			src = rec_get_nth_field(rec, offsets, i, &len);
1055 			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
1056 			src += len - BTR_EXTERN_FIELD_REF_SIZE;
1057 
1058 			c_stream->avail_in = static_cast<uInt>(
1059 				src - c_stream->next_in);
1060 			if (UNIV_LIKELY(c_stream->avail_in != 0)) {
1061 				err = deflate(c_stream, Z_NO_FLUSH);
1062 				if (UNIV_UNLIKELY(err != Z_OK)) {
1063 
1064 					return(err);
1065 				}
1066 			}
1067 
1068 			ut_ad(!c_stream->avail_in);
1069 			ut_ad(c_stream->next_in == src);
1070 
1071 			/* Reserve space for the data at
1072 			the end of the space reserved for
1073 			the compressed data and the page
1074 			modification log. */
1075 
1076 			if (UNIV_UNLIKELY
1077 			    (c_stream->avail_out
1078 			     <= BTR_EXTERN_FIELD_REF_SIZE)) {
1079 				/* out of space */
1080 				return(Z_BUF_ERROR);
1081 			}
1082 
1083 			ut_ad(*externs == c_stream->next_out
1084 			      + c_stream->avail_out
1085 			      + 1/* end of modif. log */);
1086 
1087 			c_stream->next_in
1088 				+= BTR_EXTERN_FIELD_REF_SIZE;
1089 
1090 			/* Skip deleted records. */
1091 			if (UNIV_LIKELY_NULL
1092 			    (page_zip_dir_find_low(
1093 				    storage, deleted,
1094 				    page_offset(rec)))) {
1095 				continue;
1096 			}
1097 
1098 			(*n_blobs)++;
1099 			c_stream->avail_out
1100 				-= BTR_EXTERN_FIELD_REF_SIZE;
1101 			*externs -= BTR_EXTERN_FIELD_REF_SIZE;
1102 
1103 			/* Copy the BLOB pointer */
1104 			memcpy(*externs, c_stream->next_in
1105 			       - BTR_EXTERN_FIELD_REF_SIZE,
1106 			       BTR_EXTERN_FIELD_REF_SIZE);
1107 		}
1108 	}
1109 
1110 	return(Z_OK);
1111 }
1112 
1113 /**********************************************************************//**
1114 Compress the records of a leaf node of a clustered index.
1115 @return Z_OK, or a zlib error code */
1116 static
1117 int
page_zip_compress_clust(FILE_LOGFILE z_stream * c_stream,const rec_t ** recs,ulint n_dense,dict_index_t * index,ulint * n_blobs,ulint trx_id_col,byte * deleted,byte * storage,mem_heap_t * heap)1118 page_zip_compress_clust(
1119 /*====================*/
1120 	FILE_LOGFILE
1121 	z_stream*	c_stream,	/*!< in/out: compressed page stream */
1122 	const rec_t**	recs,		/*!< in: dense page directory
1123 					sorted by address */
1124 	ulint		n_dense,	/*!< in: size of recs[] */
1125 	dict_index_t*	index,		/*!< in: the index of the page */
1126 	ulint*		n_blobs,	/*!< in: 0; out: number of
1127 					externally stored columns */
1128 	ulint		trx_id_col,	/*!< index of the trx_id column */
1129 	byte*		deleted,	/*!< in: dense directory entry pointing
1130 					to the head of the free list */
1131 	byte*		storage,	/*!< in: end of dense page directory */
1132 	mem_heap_t*	heap)		/*!< in: temporary memory heap */
1133 {
1134 	int	err		= Z_OK;
1135 	rec_offs* offsets		= NULL;
1136 	/* BTR_EXTERN_FIELD_REF storage */
1137 	byte*	externs		= storage - n_dense
1138 		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
1139 
1140 	ut_ad(*n_blobs == 0);
1141 
1142 	do {
1143 		const rec_t*	rec = *recs++;
1144 
1145 		offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
1146 					  ULINT_UNDEFINED, &heap);
1147 		ut_ad(rec_offs_n_fields(offsets)
1148 		      == dict_index_get_n_fields(index));
1149 		MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
1150 		MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
1151 				  rec_offs_extra_size(offsets));
1152 
1153 		/* Compress the extra bytes. */
1154 		c_stream->avail_in = static_cast<uInt>(
1155 			rec - REC_N_NEW_EXTRA_BYTES
1156 			- c_stream->next_in);
1157 
1158 		if (c_stream->avail_in) {
1159 			err = deflate(c_stream, Z_NO_FLUSH);
1160 			if (UNIV_UNLIKELY(err != Z_OK)) {
1161 
1162 				goto func_exit;
1163 			}
1164 		}
1165 		ut_ad(!c_stream->avail_in);
1166 		ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
1167 
1168 		/* Compress the data bytes. */
1169 
1170 		c_stream->next_in = (byte*) rec;
1171 
1172 		/* Check if there are any externally stored columns.
1173 		For each externally stored column, store the
1174 		BTR_EXTERN_FIELD_REF separately. */
1175 		if (rec_offs_any_extern(offsets)) {
1176 			ut_ad(dict_index_is_clust(index));
1177 
1178 			err = page_zip_compress_clust_ext(
1179 				LOGFILE
1180 				c_stream, rec, offsets, trx_id_col,
1181 				deleted, storage, &externs, n_blobs);
1182 
1183 			if (UNIV_UNLIKELY(err != Z_OK)) {
1184 
1185 				goto func_exit;
1186 			}
1187 		} else {
1188 			ulint		len;
1189 			const byte*	src;
1190 
1191 			/* Store trx_id and roll_ptr in uncompressed form. */
1192 			src = rec_get_nth_field(rec, offsets,
1193 						trx_id_col, &len);
1194 			ut_ad(src + DATA_TRX_ID_LEN
1195 			      == rec_get_nth_field(rec, offsets,
1196 						   trx_id_col + 1, &len));
1197 			ut_ad(len == DATA_ROLL_PTR_LEN);
1198 			MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
1199 			MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
1200 					  rec_offs_extra_size(offsets));
1201 
1202 			/* Compress any preceding bytes. */
1203 			c_stream->avail_in = static_cast<uInt>(
1204 				src - c_stream->next_in);
1205 
1206 			if (c_stream->avail_in) {
1207 				err = deflate(c_stream, Z_NO_FLUSH);
1208 				if (UNIV_UNLIKELY(err != Z_OK)) {
1209 
1210 					return(err);
1211 				}
1212 			}
1213 
1214 			ut_ad(!c_stream->avail_in);
1215 			ut_ad(c_stream->next_in == src);
1216 
1217 			memcpy(storage
1218 			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
1219 			       * (rec_get_heap_no_new(rec) - 1),
1220 			       c_stream->next_in,
1221 			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
1222 
1223 			c_stream->next_in
1224 				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
1225 
1226 			/* Skip also roll_ptr */
1227 			ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
1228 		}
1229 
1230 		/* Compress the last bytes of the record. */
1231 		c_stream->avail_in = static_cast<uInt>(
1232 			rec + rec_offs_data_size(offsets) - c_stream->next_in);
1233 
1234 		if (c_stream->avail_in) {
1235 			err = deflate(c_stream, Z_NO_FLUSH);
1236 			if (UNIV_UNLIKELY(err != Z_OK)) {
1237 
1238 				goto func_exit;
1239 			}
1240 		}
1241 		ut_ad(!c_stream->avail_in);
1242 	} while (--n_dense);
1243 
1244 func_exit:
1245 	return(err);}
1246 
1247 /** Attempt to compress a ROW_FORMAT=COMPRESSED page.
1248 @retval true on success
1249 @retval false on failure; block->page.zip will be left intact. */
1250 bool
page_zip_compress(buf_block_t * block,dict_index_t * index,ulint level,mtr_t * mtr)1251 page_zip_compress(
1252 	buf_block_t*		block,	/*!< in/out: buffer block */
1253 	dict_index_t*		index,	/*!< in: index of the B-tree node */
1254 	ulint			level,	/*!< in: commpression level */
1255 	mtr_t*			mtr)	/*!< in/out: mini-transaction */
1256 {
1257 	z_stream		c_stream;
1258 	int			err;
1259 	byte*			fields;		/*!< index field information */
1260 	byte*			buf;		/*!< compressed payload of the
1261 						page */
1262 	byte*			buf_end;	/* end of buf */
1263 	ulint			n_dense;
1264 	ulint			slot_size;	/* amount of uncompressed bytes
1265 						per record */
1266 	const rec_t**		recs;		/*!< dense page directory,
1267 						sorted by address */
1268 	mem_heap_t*		heap;
1269 	ulint			trx_id_col = ULINT_UNDEFINED;
1270 	ulint			n_blobs	= 0;
1271 	byte*			storage;	/* storage of uncompressed
1272 						columns */
1273 	const ulonglong		ns = my_interval_timer();
1274 #ifdef PAGE_ZIP_COMPRESS_DBG
1275 	FILE*			logfile = NULL;
1276 #endif
1277 	/* A local copy of srv_cmp_per_index_enabled to avoid reading that
1278 	variable multiple times in this function since it can be changed at
1279 	anytime. */
1280 	my_bool			cmp_per_index_enabled;
1281 	cmp_per_index_enabled	= srv_cmp_per_index_enabled;
1282 
1283 	page_t* page = block->frame;
1284 	page_zip_des_t* page_zip = &block->page.zip;
1285 
1286 	ut_a(page_is_comp(page));
1287 	ut_a(fil_page_index_page_check(page));
1288 	ut_ad(page_simple_validate_new((page_t*) page));
1289 	ut_ad(page_zip_simple_validate(page_zip));
1290 	ut_ad(dict_table_is_comp(index->table));
1291 	ut_ad(!dict_index_is_ibuf(index));
1292 
1293 	MEM_CHECK_DEFINED(page, srv_page_size);
1294 
1295 	/* Check the data that will be omitted. */
1296 	ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
1297 		     infimum_extra, sizeof infimum_extra));
1298 	ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
1299 		     infimum_data, sizeof infimum_data));
1300 	ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
1301 	     /* info_bits == 0, n_owned <= max */
1302 	     <= PAGE_DIR_SLOT_MAX_N_OWNED);
1303 	ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
1304 		     supremum_extra_data, sizeof supremum_extra_data));
1305 
1306 	if (page_is_empty(page)) {
1307 		ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
1308 		     == PAGE_NEW_SUPREMUM);
1309 	}
1310 
1311 	const ulint n_fields = page_is_leaf(page)
1312 		? dict_index_get_n_fields(index)
1313 		: dict_index_get_n_unique_in_tree_nonleaf(index);
1314 	index_id_t ind_id = index->id;
1315 
1316 	/* The dense directory excludes the infimum and supremum records. */
1317 	n_dense = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
1318 #ifdef PAGE_ZIP_COMPRESS_DBG
1319 	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
1320 		ib::info() << "compress "
1321 			<< static_cast<void*>(page_zip) << " "
1322 			<< static_cast<const void*>(page) << " "
1323 			<< page_is_leaf(page) << " "
1324 			<< n_fields << " " << n_dense;
1325 	}
1326 
1327 	if (UNIV_UNLIKELY(page_zip_compress_log)) {
1328 		/* Create a log file for every compression attempt. */
1329 		char	logfilename[9];
1330 		snprintf(logfilename, sizeof logfilename,
1331 			 "%08x", page_zip_compress_log++);
1332 		logfile = fopen(logfilename, "wb");
1333 
1334 		if (logfile) {
1335 			/* Write the uncompressed page to the log. */
1336 			if (fwrite(page, 1, srv_page_size, logfile)
1337 			    != srv_page_size) {
1338 				perror("fwrite");
1339 			}
1340 			/* Record the compressed size as zero.
1341 			This will be overwritten at successful exit. */
1342 			putc(0, logfile);
1343 			putc(0, logfile);
1344 			putc(0, logfile);
1345 			putc(0, logfile);
1346 		}
1347 	}
1348 #endif /* PAGE_ZIP_COMPRESS_DBG */
1349 	page_zip_stat[page_zip->ssize - 1].compressed++;
1350 	if (cmp_per_index_enabled) {
1351 		mutex_enter(&page_zip_stat_per_index_mutex);
1352 		page_zip_stat_per_index[ind_id].compressed++;
1353 		mutex_exit(&page_zip_stat_per_index_mutex);
1354 	}
1355 
1356 	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
1357 			  >= page_zip_get_size(page_zip))) {
1358 
1359 		goto err_exit;
1360 	}
1361 
1362 	MONITOR_INC(MONITOR_PAGE_COMPRESS);
1363 
1364 	heap = mem_heap_create(page_zip_get_size(page_zip)
1365 			       + n_fields * (2 + sizeof(ulint))
1366 			       + REC_OFFS_HEADER_SIZE
1367 			       + n_dense * ((sizeof *recs)
1368 					    - PAGE_ZIP_DIR_SLOT_SIZE)
1369 			       + srv_page_size * 4
1370 			       + (512 << MAX_MEM_LEVEL));
1371 
1372 	recs = static_cast<const rec_t**>(
1373 		mem_heap_zalloc(heap, n_dense * sizeof *recs));
1374 
1375 	fields = static_cast<byte*>(mem_heap_alloc(heap, (n_fields + 1) * 2));
1376 
1377 	buf = static_cast<byte*>(
1378 		mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA));
1379 
1380 	buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
1381 
1382 	/* Compress the data payload. */
1383 	page_zip_set_alloc(&c_stream, heap);
1384 
1385 	err = deflateInit2(&c_stream, static_cast<int>(level),
1386 			   Z_DEFLATED, static_cast<int>(srv_page_size_shift),
1387 			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
1388 	ut_a(err == Z_OK);
1389 
1390 	c_stream.next_out = buf;
1391 
1392 	/* Subtract the space reserved for uncompressed data. */
1393 	/* Page header and the end marker of the modification log */
1394 	c_stream.avail_out = static_cast<uInt>(buf_end - buf - 1);
1395 
1396 	/* Dense page directory and uncompressed columns, if any */
1397 	if (page_is_leaf(page)) {
1398 		if (dict_index_is_clust(index)) {
1399 			trx_id_col = index->db_trx_id();
1400 
1401 			slot_size = PAGE_ZIP_DIR_SLOT_SIZE
1402 				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
1403 
1404 		} else {
1405 			/* Signal the absence of trx_id
1406 			in page_zip_fields_encode() */
1407 			trx_id_col = 0;
1408 			slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
1409 		}
1410 	} else {
1411 		slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
1412 		trx_id_col = ULINT_UNDEFINED;
1413 	}
1414 
1415 	if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
1416 			  + 6/* sizeof(zlib header and footer) */)) {
1417 		goto zlib_error;
1418 	}
1419 
1420 	c_stream.avail_out -= uInt(n_dense * slot_size);
1421 	c_stream.avail_in = uInt(page_zip_fields_encode(n_fields, index,
1422 							trx_id_col, fields));
1423 	c_stream.next_in = fields;
1424 
1425 	if (UNIV_LIKELY(!trx_id_col)) {
1426 		trx_id_col = ULINT_UNDEFINED;
1427 	}
1428 
1429 	MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
1430 	err = deflate(&c_stream, Z_FULL_FLUSH);
1431 	if (err != Z_OK) {
1432 		goto zlib_error;
1433 	}
1434 
1435 	ut_ad(!c_stream.avail_in);
1436 
1437 	page_zip_dir_encode(page, buf_end, recs);
1438 
1439 	c_stream.next_in = (byte*) page + PAGE_ZIP_START;
1440 
1441 	storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
1442 
1443 	/* Compress the records in heap_no order. */
1444 	if (UNIV_UNLIKELY(!n_dense)) {
1445 	} else if (!page_is_leaf(page)) {
1446 		/* This is a node pointer page. */
1447 		err = page_zip_compress_node_ptrs(LOGFILE
1448 						  &c_stream, recs, n_dense,
1449 						  index, storage, heap);
1450 		if (UNIV_UNLIKELY(err != Z_OK)) {
1451 			goto zlib_error;
1452 		}
1453 	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
1454 		/* This is a leaf page in a secondary index. */
1455 		err = page_zip_compress_sec(LOGFILE
1456 					    &c_stream, recs, n_dense);
1457 		if (UNIV_UNLIKELY(err != Z_OK)) {
1458 			goto zlib_error;
1459 		}
1460 	} else {
1461 		/* This is a leaf page in a clustered index. */
1462 		err = page_zip_compress_clust(LOGFILE
1463 					      &c_stream, recs, n_dense,
1464 					      index, &n_blobs, trx_id_col,
1465 					      buf_end - PAGE_ZIP_DIR_SLOT_SIZE
1466 					      * page_get_n_recs(page),
1467 					      storage, heap);
1468 		if (UNIV_UNLIKELY(err != Z_OK)) {
1469 			goto zlib_error;
1470 		}
1471 	}
1472 
1473 	/* Finish the compression. */
1474 	ut_ad(!c_stream.avail_in);
1475 	/* Compress any trailing garbage, in case the last record was
1476 	allocated from an originally longer space on the free list,
1477 	or the data of the last record from page_zip_compress_sec(). */
1478 	c_stream.avail_in = static_cast<uInt>(
1479 		page_header_get_field(page, PAGE_HEAP_TOP)
1480 		- (c_stream.next_in - page));
1481 	ut_a(c_stream.avail_in <= srv_page_size - PAGE_ZIP_START - PAGE_DIR);
1482 
1483 	MEM_CHECK_DEFINED(c_stream.next_in, c_stream.avail_in);
1484 	err = deflate(&c_stream, Z_FINISH);
1485 
1486 	if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
1487 zlib_error:
1488 		deflateEnd(&c_stream);
1489 		mem_heap_free(heap);
1490 err_exit:
1491 #ifdef PAGE_ZIP_COMPRESS_DBG
1492 		if (logfile) {
1493 			fclose(logfile);
1494 		}
1495 #endif /* PAGE_ZIP_COMPRESS_DBG */
1496 		if (page_is_leaf(page)) {
1497 			dict_index_zip_failure(index);
1498 		}
1499 
1500 		const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
1501 		page_zip_stat[page_zip->ssize - 1].compressed_usec
1502 			+= time_diff;
1503 		if (cmp_per_index_enabled) {
1504 			mutex_enter(&page_zip_stat_per_index_mutex);
1505 			page_zip_stat_per_index[ind_id].compressed_usec
1506 				+= time_diff;
1507 			mutex_exit(&page_zip_stat_per_index_mutex);
1508 		}
1509 		return false;
1510 	}
1511 
1512 	err = deflateEnd(&c_stream);
1513 	ut_a(err == Z_OK);
1514 
1515 	ut_ad(buf + c_stream.total_out == c_stream.next_out);
1516 	ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
1517 
1518 #if defined HAVE_valgrind && !__has_feature(memory_sanitizer)
1519 	/* Valgrind believes that zlib does not initialize some bits
1520 	in the last 7 or 8 bytes of the stream.  Make Valgrind happy. */
1521 	MEM_MAKE_DEFINED(buf, c_stream.total_out);
1522 #endif /* HAVE_valgrind && !memory_sanitizer */
1523 
1524 	/* Zero out the area reserved for the modification log.
1525 	Space for the end marker of the modification log is not
1526 	included in avail_out. */
1527 	memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
1528 
1529 #ifdef UNIV_DEBUG
1530 	page_zip->m_start =
1531 #endif /* UNIV_DEBUG */
1532 		page_zip->m_end = uint16_t(PAGE_DATA + c_stream.total_out);
1533 	page_zip->m_nonempty = FALSE;
1534 	page_zip->n_blobs = unsigned(n_blobs) & ((1U << 12) - 1);
1535 	/* Copy those header fields that will not be written
1536 	in buf_flush_init_for_writing() */
1537 	memcpy_aligned<8>(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
1538 			  FIL_PAGE_LSN - FIL_PAGE_PREV);
1539 	memcpy_aligned<2>(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
1540 			  2);
1541 	memcpy_aligned<2>(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
1542 			  PAGE_DATA - FIL_PAGE_DATA);
1543 	/* Copy the rest of the compressed page */
1544 	memcpy_aligned<2>(page_zip->data + PAGE_DATA, buf,
1545 			  page_zip_get_size(page_zip) - PAGE_DATA);
1546 	mem_heap_free(heap);
1547 #ifdef UNIV_ZIP_DEBUG
1548 	ut_a(page_zip_validate(page_zip, page, index));
1549 #endif /* UNIV_ZIP_DEBUG */
1550 
1551 	page_zip_compress_write_log(block, index, mtr);
1552 
1553 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
1554 
1555 #ifdef PAGE_ZIP_COMPRESS_DBG
1556 	if (logfile) {
1557 		/* Record the compressed size of the block. */
1558 		byte sz[4];
1559 		mach_write_to_4(sz, c_stream.total_out);
1560 		fseek(logfile, srv_page_size, SEEK_SET);
1561 		if (fwrite(sz, 1, sizeof sz, logfile) != sizeof sz) {
1562 			perror("fwrite");
1563 		}
1564 		fclose(logfile);
1565 	}
1566 #endif /* PAGE_ZIP_COMPRESS_DBG */
1567 	const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
1568 	page_zip_stat[page_zip->ssize - 1].compressed_ok++;
1569 	page_zip_stat[page_zip->ssize - 1].compressed_usec += time_diff;
1570 	if (cmp_per_index_enabled) {
1571 		mutex_enter(&page_zip_stat_per_index_mutex);
1572 		page_zip_stat_per_index[ind_id].compressed_ok++;
1573 		page_zip_stat_per_index[ind_id].compressed_usec += time_diff;
1574 		mutex_exit(&page_zip_stat_per_index_mutex);
1575 	}
1576 
1577 	if (page_is_leaf(page)) {
1578 		dict_index_zip_success(index);
1579 	}
1580 
1581 	return true;
1582 }
1583 
1584 /**********************************************************************//**
1585 Deallocate the index information initialized by page_zip_fields_decode(). */
1586 static
1587 void
page_zip_fields_free(dict_index_t * index)1588 page_zip_fields_free(
1589 /*=================*/
1590 	dict_index_t*	index)	/*!< in: dummy index to be freed */
1591 {
1592 	if (index) {
1593 		dict_table_t*	table = index->table;
1594 		index->zip_pad.mutex.~mutex();
1595 		mem_heap_free(index->heap);
1596 
1597 		dict_mem_table_free(table);
1598 	}
1599 }
1600 
1601 /**********************************************************************//**
1602 Read the index information for the compressed page.
1603 @return own: dummy index describing the page, or NULL on error */
1604 static
1605 dict_index_t*
page_zip_fields_decode(const byte * buf,const byte * end,ulint * trx_id_col,bool is_spatial)1606 page_zip_fields_decode(
1607 /*===================*/
1608 	const byte*	buf,	/*!< in: index information */
1609 	const byte*	end,	/*!< in: end of buf */
1610 	ulint*		trx_id_col,/*!< in: NULL for non-leaf pages;
1611 				for leaf pages, pointer to where to store
1612 				the position of the trx_id column */
1613 	bool		is_spatial)/*< in: is spatial index or not */
1614 {
1615 	const byte*	b;
1616 	ulint		n;
1617 	ulint		i;
1618 	ulint		val;
1619 	dict_table_t*	table;
1620 	dict_index_t*	index;
1621 
1622 	/* Determine the number of fields. */
1623 	for (b = buf, n = 0; b < end; n++) {
1624 		if (*b++ & 0x80) {
1625 			b++; /* skip the second byte */
1626 		}
1627 	}
1628 
1629 	n--; /* n_nullable or trx_id */
1630 
1631 	if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
1632 
1633 		page_zip_fail(("page_zip_fields_decode: n = %lu\n",
1634 			       (ulong) n));
1635 		return(NULL);
1636 	}
1637 
1638 	if (UNIV_UNLIKELY(b > end)) {
1639 
1640 		page_zip_fail(("page_zip_fields_decode: %p > %p\n",
1641 			       (const void*) b, (const void*) end));
1642 		return(NULL);
1643 	}
1644 
1645 	table = dict_mem_table_create("ZIP_DUMMY", NULL, n, 0,
1646 				      DICT_TF_COMPACT, 0);
1647 	index = dict_mem_index_create(table, "ZIP_DUMMY", 0, n);
1648 	index->n_uniq = static_cast<unsigned>(n) & dict_index_t::MAX_N_FIELDS;
1649 	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
1650 	index->cached = TRUE;
1651 
1652 	/* Initialize the fields. */
1653 	for (b = buf, i = 0; i < n; i++) {
1654 		ulint	mtype;
1655 		ulint	len;
1656 
1657 		val = *b++;
1658 
1659 		if (UNIV_UNLIKELY(val & 0x80)) {
1660 			/* fixed length > 62 bytes */
1661 			val = (val & 0x7f) << 8 | *b++;
1662 			len = val >> 1;
1663 			mtype = DATA_FIXBINARY;
1664 		} else if (UNIV_UNLIKELY(val >= 126)) {
1665 			/* variable length with max > 255 bytes */
1666 			len = 0x7fff;
1667 			mtype = DATA_BINARY;
1668 		} else if (val <= 1) {
1669 			/* variable length with max <= 255 bytes */
1670 			len = 0;
1671 			mtype = DATA_BINARY;
1672 		} else {
1673 			/* fixed length < 62 bytes */
1674 			len = val >> 1;
1675 			mtype = DATA_FIXBINARY;
1676 		}
1677 
1678 		dict_mem_table_add_col(table, NULL, NULL, mtype,
1679 				       val & 1 ? DATA_NOT_NULL : 0, len);
1680 		dict_index_add_col(index, table,
1681 				   dict_table_get_nth_col(table, i), 0);
1682 	}
1683 
1684 	val = *b++;
1685 	if (UNIV_UNLIKELY(val & 0x80)) {
1686 		val = (val & 0x7f) << 8 | *b++;
1687 	}
1688 
1689 	/* Decode the position of the trx_id column. */
1690 	if (trx_id_col) {
1691 		if (!val) {
1692 			val = ULINT_UNDEFINED;
1693 		} else if (UNIV_UNLIKELY(val >= n)) {
1694 fail:
1695 			page_zip_fields_free(index);
1696 			return NULL;
1697 		} else {
1698 			index->type = DICT_CLUSTERED;
1699 		}
1700 
1701 		*trx_id_col = val;
1702 	} else {
1703 		/* Decode the number of nullable fields. */
1704 		if (UNIV_UNLIKELY(index->n_nullable > val)) {
1705 			goto fail;
1706 		} else {
1707 			index->n_nullable = static_cast<unsigned>(val)
1708 				& dict_index_t::MAX_N_FIELDS;
1709 		}
1710 	}
1711 
1712 	/* ROW_FORMAT=COMPRESSED does not support instant ADD COLUMN */
1713 	index->n_core_fields = index->n_fields;
1714 	index->n_core_null_bytes = static_cast<uint8_t>(
1715 		UT_BITS_IN_BYTES(unsigned(index->n_nullable)));
1716 
1717 	ut_ad(b == end);
1718 
1719 	if (is_spatial) {
1720 		index->type |= DICT_SPATIAL;
1721 	}
1722 
1723 	return(index);
1724 }
1725 
1726 /**********************************************************************//**
1727 Populate the sparse page directory from the dense directory.
1728 @return TRUE on success, FALSE on failure */
1729 static MY_ATTRIBUTE((nonnull, warn_unused_result))
1730 ibool
page_zip_dir_decode(const page_zip_des_t * page_zip,page_t * page,rec_t ** recs,ulint n_dense)1731 page_zip_dir_decode(
1732 /*================*/
1733 	const page_zip_des_t*	page_zip,/*!< in: dense page directory on
1734 					compressed page */
1735 	page_t*			page,	/*!< in: compact page with valid header;
1736 					out: trailer and sparse page directory
1737 					filled in */
1738 	rec_t**			recs,	/*!< out: dense page directory sorted by
1739 					ascending address (and heap_no) */
1740 	ulint			n_dense)/*!< in: number of user records, and
1741 					size of recs[] */
1742 {
1743 	ulint	i;
1744 	ulint	n_recs;
1745 	byte*	slot;
1746 
1747 	n_recs = page_get_n_recs(page);
1748 
1749 	if (UNIV_UNLIKELY(n_recs > n_dense)) {
1750 		page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
1751 			       (ulong) n_recs, (ulong) n_dense));
1752 		return(FALSE);
1753 	}
1754 
1755 	/* Traverse the list of stored records in the sorting order,
1756 	starting from the first user record. */
1757 
1758 	slot = page + (srv_page_size - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
1759 	UNIV_PREFETCH_RW(slot);
1760 
1761 	/* Zero out the page trailer. */
1762 	memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
1763 
1764 	mach_write_to_2(slot, PAGE_NEW_INFIMUM);
1765 	slot -= PAGE_DIR_SLOT_SIZE;
1766 	UNIV_PREFETCH_RW(slot);
1767 
1768 	/* Initialize the sparse directory and copy the dense directory. */
1769 	for (i = 0; i < n_recs; i++) {
1770 		ulint	offs = page_zip_dir_get(page_zip, i);
1771 
1772 		if (offs & PAGE_ZIP_DIR_SLOT_OWNED) {
1773 			mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
1774 			slot -= PAGE_DIR_SLOT_SIZE;
1775 			UNIV_PREFETCH_RW(slot);
1776 		}
1777 
1778 		if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
1779 				  < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
1780 			page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
1781 				       (unsigned) i, (unsigned) n_recs,
1782 				       (ulong) offs));
1783 			return(FALSE);
1784 		}
1785 
1786 		recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
1787 	}
1788 
1789 	mach_write_to_2(slot, PAGE_NEW_SUPREMUM);
1790 	{
1791 		const page_dir_slot_t*	last_slot = page_dir_get_nth_slot(
1792 			page, page_dir_get_n_slots(page) - 1U);
1793 
1794 		if (UNIV_UNLIKELY(slot != last_slot)) {
1795 			page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
1796 				       (const void*) slot,
1797 				       (const void*) last_slot));
1798 			return(FALSE);
1799 		}
1800 	}
1801 
1802 	/* Copy the rest of the dense directory. */
1803 	for (; i < n_dense; i++) {
1804 		ulint	offs = page_zip_dir_get(page_zip, i);
1805 
1806 		if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
1807 			page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
1808 				       (unsigned) i, (unsigned) n_dense,
1809 				       (ulong) offs));
1810 			return(FALSE);
1811 		}
1812 
1813 		recs[i] = page + offs;
1814 	}
1815 
1816 	std::sort(recs, recs + n_dense);
1817 	return(TRUE);
1818 }
1819 
1820 /**********************************************************************//**
1821 Initialize the REC_N_NEW_EXTRA_BYTES of each record.
1822 @return TRUE on success, FALSE on failure */
1823 static
1824 ibool
page_zip_set_extra_bytes(const page_zip_des_t * page_zip,page_t * page,ulint info_bits)1825 page_zip_set_extra_bytes(
1826 /*=====================*/
1827 	const page_zip_des_t*	page_zip,/*!< in: compressed page */
1828 	page_t*			page,	/*!< in/out: uncompressed page */
1829 	ulint			info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
1830 {
1831 	ulint	n;
1832 	ulint	i;
1833 	ulint	n_owned = 1;
1834 	ulint	offs;
1835 	rec_t*	rec;
1836 
1837 	n = page_get_n_recs(page);
1838 	rec = page + PAGE_NEW_INFIMUM;
1839 
1840 	for (i = 0; i < n; i++) {
1841 		offs = page_zip_dir_get(page_zip, i);
1842 
1843 		if (offs & PAGE_ZIP_DIR_SLOT_DEL) {
1844 			info_bits |= REC_INFO_DELETED_FLAG;
1845 		}
1846 		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
1847 			info_bits |= n_owned;
1848 			n_owned = 1;
1849 		} else {
1850 			n_owned++;
1851 		}
1852 		offs &= PAGE_ZIP_DIR_SLOT_MASK;
1853 		if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
1854 				  + REC_N_NEW_EXTRA_BYTES)) {
1855 			page_zip_fail(("page_zip_set_extra_bytes 1:"
1856 				       " %u %u %lx\n",
1857 				       (unsigned) i, (unsigned) n,
1858 				       (ulong) offs));
1859 			return(FALSE);
1860 		}
1861 
1862 		rec_set_next_offs_new(rec, offs);
1863 		rec = page + offs;
1864 		rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
1865 		info_bits = 0;
1866 	}
1867 
1868 	/* Set the next pointer of the last user record. */
1869 	rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
1870 
1871 	/* Set n_owned of the supremum record. */
1872 	page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
1873 
1874 	/* The dense directory excludes the infimum and supremum records. */
1875 	n = ulint(page_dir_get_n_heap(page)) - PAGE_HEAP_NO_USER_LOW;
1876 
1877 	if (i >= n) {
1878 		if (UNIV_LIKELY(i == n)) {
1879 			return(TRUE);
1880 		}
1881 
1882 		page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
1883 			       (unsigned) i, (unsigned) n));
1884 		return(FALSE);
1885 	}
1886 
1887 	offs = page_zip_dir_get(page_zip, i);
1888 
1889 	/* Set the extra bytes of deleted records on the free list. */
1890 	for (;;) {
1891 		if (UNIV_UNLIKELY(!offs)
1892 		    || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
1893 
1894 			page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
1895 				       (ulong) offs));
1896 			return(FALSE);
1897 		}
1898 
1899 		rec = page + offs;
1900 		rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
1901 
1902 		if (++i == n) {
1903 			break;
1904 		}
1905 
1906 		offs = page_zip_dir_get(page_zip, i);
1907 		rec_set_next_offs_new(rec, offs);
1908 	}
1909 
1910 	/* Terminate the free list. */
1911 	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
1912 	rec_set_next_offs_new(rec, 0);
1913 
1914 	return(TRUE);
1915 }
1916 
1917 /**********************************************************************//**
1918 Apply the modification log to a record containing externally stored
1919 columns.  Do not copy the fields that are stored separately.
1920 @return pointer to modification log, or NULL on failure */
1921 static
1922 const byte*
page_zip_apply_log_ext(rec_t * rec,const rec_offs * offsets,ulint trx_id_col,const byte * data,const byte * end)1923 page_zip_apply_log_ext(
1924 /*===================*/
1925 	rec_t*		rec,		/*!< in/out: record */
1926 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
1927 	ulint		trx_id_col,	/*!< in: position of of DB_TRX_ID */
1928 	const byte*	data,		/*!< in: modification log */
1929 	const byte*	end)		/*!< in: end of modification log */
1930 {
1931 	ulint	i;
1932 	ulint	len;
1933 	byte*	next_out = rec;
1934 
1935 	/* Check if there are any externally stored columns.
1936 	For each externally stored column, skip the
1937 	BTR_EXTERN_FIELD_REF. */
1938 
1939 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
1940 		byte*	dst;
1941 
1942 		if (UNIV_UNLIKELY(i == trx_id_col)) {
1943 			/* Skip trx_id and roll_ptr */
1944 			dst = rec_get_nth_field(rec, offsets,
1945 						i, &len);
1946 			if (UNIV_UNLIKELY(dst - next_out >= end - data)
1947 			    || UNIV_UNLIKELY
1948 			    (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
1949 			    || rec_offs_nth_extern(offsets, i)) {
1950 				page_zip_fail(("page_zip_apply_log_ext:"
1951 					       " trx_id len %lu,"
1952 					       " %p - %p >= %p - %p\n",
1953 					       (ulong) len,
1954 					       (const void*) dst,
1955 					       (const void*) next_out,
1956 					       (const void*) end,
1957 					       (const void*) data));
1958 				return(NULL);
1959 			}
1960 
1961 			memcpy(next_out, data, ulint(dst - next_out));
1962 			data += ulint(dst - next_out);
1963 			next_out = dst + (DATA_TRX_ID_LEN
1964 					  + DATA_ROLL_PTR_LEN);
1965 		} else if (rec_offs_nth_extern(offsets, i)) {
1966 			dst = rec_get_nth_field(rec, offsets,
1967 						i, &len);
1968 			ut_ad(len
1969 			      >= BTR_EXTERN_FIELD_REF_SIZE);
1970 
1971 			len += ulint(dst - next_out)
1972 				- BTR_EXTERN_FIELD_REF_SIZE;
1973 
1974 			if (UNIV_UNLIKELY(data + len >= end)) {
1975 				page_zip_fail(("page_zip_apply_log_ext:"
1976 					       " ext %p+%lu >= %p\n",
1977 					       (const void*) data,
1978 					       (ulong) len,
1979 					       (const void*) end));
1980 				return(NULL);
1981 			}
1982 
1983 			memcpy(next_out, data, len);
1984 			data += len;
1985 			next_out += len
1986 				+ BTR_EXTERN_FIELD_REF_SIZE;
1987 		}
1988 	}
1989 
1990 	/* Copy the last bytes of the record. */
1991 	len = ulint(rec_get_end(rec, offsets) - next_out);
1992 	if (UNIV_UNLIKELY(data + len >= end)) {
1993 		page_zip_fail(("page_zip_apply_log_ext:"
1994 			       " last %p+%lu >= %p\n",
1995 			       (const void*) data,
1996 			       (ulong) len,
1997 			       (const void*) end));
1998 		return(NULL);
1999 	}
2000 	memcpy(next_out, data, len);
2001 	data += len;
2002 
2003 	return(data);
2004 }
2005 
2006 /**********************************************************************//**
2007 Apply the modification log to an uncompressed page.
2008 Do not copy the fields that are stored separately.
2009 @return pointer to end of modification log, or NULL on failure */
2010 static
2011 const byte*
page_zip_apply_log(const byte * data,ulint size,rec_t ** recs,ulint n_dense,ulint n_core,ulint trx_id_col,ulint heap_status,dict_index_t * index,rec_offs * offsets)2012 page_zip_apply_log(
2013 /*===============*/
2014 	const byte*	data,	/*!< in: modification log */
2015 	ulint		size,	/*!< in: maximum length of the log, in bytes */
2016 	rec_t**		recs,	/*!< in: dense page directory,
2017 				sorted by address (indexed by
2018 				heap_no - PAGE_HEAP_NO_USER_LOW) */
2019 	ulint		n_dense,/*!< in: size of recs[] */
2020 	ulint		n_core,	/*!< in: index->n_fields, or 0 for non-leaf */
2021 	ulint		trx_id_col,/*!< in: column number of trx_id in the index,
2022 				or ULINT_UNDEFINED if none */
2023 	ulint		heap_status,
2024 				/*!< in: heap_no and status bits for
2025 				the next record to uncompress */
2026 	dict_index_t*	index,	/*!< in: index of the page */
2027 	rec_offs*	offsets)/*!< in/out: work area for
2028 				rec_get_offsets_reverse() */
2029 {
2030 	const byte* const end = data + size;
2031 
2032 	for (;;) {
2033 		ulint	val;
2034 		rec_t*	rec;
2035 		ulint	len;
2036 		ulint	hs;
2037 
2038 		val = *data++;
2039 		if (UNIV_UNLIKELY(!val)) {
2040 			return(data - 1);
2041 		}
2042 		if (val & 0x80) {
2043 			val = (val & 0x7f) << 8 | *data++;
2044 			if (UNIV_UNLIKELY(!val)) {
2045 				page_zip_fail(("page_zip_apply_log:"
2046 					       " invalid val %x%x\n",
2047 					       data[-2], data[-1]));
2048 				return(NULL);
2049 			}
2050 		}
2051 		if (UNIV_UNLIKELY(data >= end)) {
2052 			page_zip_fail(("page_zip_apply_log: %p >= %p\n",
2053 				       (const void*) data,
2054 				       (const void*) end));
2055 			return(NULL);
2056 		}
2057 		if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
2058 			page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
2059 				       (ulong) val, (ulong) n_dense));
2060 			return(NULL);
2061 		}
2062 
2063 		/* Determine the heap number and status bits of the record. */
2064 		rec = recs[(val >> 1) - 1];
2065 
2066 		hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
2067 		hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
2068 
2069 		/* This may either be an old record that is being
2070 		overwritten (updated in place, or allocated from
2071 		the free list), or a new record, with the next
2072 		available_heap_no. */
2073 		if (UNIV_UNLIKELY(hs > heap_status)) {
2074 			page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
2075 				       (ulong) hs, (ulong) heap_status));
2076 			return(NULL);
2077 		} else if (hs == heap_status) {
2078 			/* A new record was allocated from the heap. */
2079 			if (UNIV_UNLIKELY(val & 1)) {
2080 				/* Only existing records may be cleared. */
2081 				page_zip_fail(("page_zip_apply_log:"
2082 					       " attempting to create"
2083 					       " deleted rec %lu\n",
2084 					       (ulong) hs));
2085 				return(NULL);
2086 			}
2087 			heap_status += 1 << REC_HEAP_NO_SHIFT;
2088 		}
2089 
2090 		mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
2091 
2092 		if (val & 1) {
2093 			/* Clear the data bytes of the record. */
2094 			mem_heap_t*	heap	= NULL;
2095 			rec_offs*	offs;
2096 			offs = rec_get_offsets(rec, index, offsets, n_core,
2097 					       ULINT_UNDEFINED, &heap);
2098 			memset(rec, 0, rec_offs_data_size(offs));
2099 
2100 			if (UNIV_LIKELY_NULL(heap)) {
2101 				mem_heap_free(heap);
2102 			}
2103 			continue;
2104 		}
2105 
2106 		compile_time_assert(REC_STATUS_NODE_PTR == TRUE);
2107 		rec_get_offsets_reverse(data, index,
2108 					hs & REC_STATUS_NODE_PTR,
2109 					offsets);
2110 		/* Silence a debug assertion in rec_offs_make_valid().
2111 		This will be overwritten in page_zip_set_extra_bytes(),
2112 		called by page_zip_decompress_low(). */
2113 		ut_d(rec[-REC_NEW_INFO_BITS] = 0);
2114 		rec_offs_make_valid(rec, index, n_core != 0, offsets);
2115 
2116 		/* Copy the extra bytes (backwards). */
2117 		{
2118 			byte*	start	= rec_get_start(rec, offsets);
2119 			byte*	b	= rec - REC_N_NEW_EXTRA_BYTES;
2120 			while (b != start) {
2121 				*--b = *data++;
2122 			}
2123 		}
2124 
2125 		/* Copy the data bytes. */
2126 		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
2127 			/* Non-leaf nodes should not contain any
2128 			externally stored columns. */
2129 			if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
2130 				page_zip_fail(("page_zip_apply_log:"
2131 					       " %lu&REC_STATUS_NODE_PTR\n",
2132 					       (ulong) hs));
2133 				return(NULL);
2134 			}
2135 
2136 			data = page_zip_apply_log_ext(
2137 				rec, offsets, trx_id_col, data, end);
2138 
2139 			if (UNIV_UNLIKELY(!data)) {
2140 				return(NULL);
2141 			}
2142 		} else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
2143 			len = rec_offs_data_size(offsets)
2144 				- REC_NODE_PTR_SIZE;
2145 			/* Copy the data bytes, except node_ptr. */
2146 			if (UNIV_UNLIKELY(data + len >= end)) {
2147 				page_zip_fail(("page_zip_apply_log:"
2148 					       " node_ptr %p+%lu >= %p\n",
2149 					       (const void*) data,
2150 					       (ulong) len,
2151 					       (const void*) end));
2152 				return(NULL);
2153 			}
2154 			memcpy(rec, data, len);
2155 			data += len;
2156 		} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
2157 			len = rec_offs_data_size(offsets);
2158 
2159 			/* Copy all data bytes of
2160 			a record in a secondary index. */
2161 			if (UNIV_UNLIKELY(data + len >= end)) {
2162 				page_zip_fail(("page_zip_apply_log:"
2163 					       " sec %p+%lu >= %p\n",
2164 					       (const void*) data,
2165 					       (ulong) len,
2166 					       (const void*) end));
2167 				return(NULL);
2168 			}
2169 
2170 			memcpy(rec, data, len);
2171 			data += len;
2172 		} else {
2173 			/* Skip DB_TRX_ID and DB_ROLL_PTR. */
2174 			ulint	l = rec_get_nth_field_offs(offsets,
2175 							   trx_id_col, &len);
2176 			byte*	b;
2177 
2178 			if (UNIV_UNLIKELY(data + l >= end)
2179 			    || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
2180 						    + DATA_ROLL_PTR_LEN))) {
2181 				page_zip_fail(("page_zip_apply_log:"
2182 					       " trx_id %p+%lu >= %p\n",
2183 					       (const void*) data,
2184 					       (ulong) l,
2185 					       (const void*) end));
2186 				return(NULL);
2187 			}
2188 
2189 			/* Copy any preceding data bytes. */
2190 			memcpy(rec, data, l);
2191 			data += l;
2192 
2193 			/* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
2194 			b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2195 			len = ulint(rec_get_end(rec, offsets) - b);
2196 			if (UNIV_UNLIKELY(data + len >= end)) {
2197 				page_zip_fail(("page_zip_apply_log:"
2198 					       " clust %p+%lu >= %p\n",
2199 					       (const void*) data,
2200 					       (ulong) len,
2201 					       (const void*) end));
2202 				return(NULL);
2203 			}
2204 			memcpy(b, data, len);
2205 			data += len;
2206 		}
2207 	}
2208 }
2209 
2210 /**********************************************************************//**
2211 Set the heap_no in a record, and skip the fixed-size record header
2212 that is not included in the d_stream.
2213 @return TRUE on success, FALSE if d_stream does not end at rec */
2214 static
2215 ibool
page_zip_decompress_heap_no(z_stream * d_stream,rec_t * rec,ulint & heap_status)2216 page_zip_decompress_heap_no(
2217 /*========================*/
2218 	z_stream*	d_stream,	/*!< in/out: compressed page stream */
2219 	rec_t*		rec,		/*!< in/out: record */
2220 	ulint&		heap_status)	/*!< in/out: heap_no and status bits */
2221 {
2222 	if (d_stream->next_out != rec - REC_N_NEW_EXTRA_BYTES) {
2223 		/* n_dense has grown since the page was last compressed. */
2224 		return(FALSE);
2225 	}
2226 
2227 	/* Skip the REC_N_NEW_EXTRA_BYTES. */
2228 	d_stream->next_out = rec;
2229 
2230 	/* Set heap_no and the status bits. */
2231 	mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
2232 	heap_status += 1 << REC_HEAP_NO_SHIFT;
2233 	return(TRUE);
2234 }
2235 
2236 /**********************************************************************//**
2237 Decompress the records of a node pointer page.
2238 @return TRUE on success, FALSE on failure */
2239 static
2240 ibool
page_zip_decompress_node_ptrs(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,rec_offs * offsets,mem_heap_t * heap)2241 page_zip_decompress_node_ptrs(
2242 /*==========================*/
2243 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
2244 	z_stream*	d_stream,	/*!< in/out: compressed page stream */
2245 	rec_t**		recs,		/*!< in: dense page directory
2246 					sorted by address */
2247 	ulint		n_dense,	/*!< in: size of recs[] */
2248 	dict_index_t*	index,		/*!< in: the index of the page */
2249 	rec_offs*	offsets,	/*!< in/out: temporary offsets */
2250 	mem_heap_t*	heap)		/*!< in: temporary memory heap */
2251 {
2252 	ulint		heap_status = REC_STATUS_NODE_PTR
2253 		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2254 	ulint		slot;
2255 	const byte*	storage;
2256 
2257 	/* Subtract the space reserved for uncompressed data. */
2258 	d_stream->avail_in -= static_cast<uInt>(
2259 		n_dense * (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE));
2260 
2261 	/* Decompress the records in heap_no order. */
2262 	for (slot = 0; slot < n_dense; slot++) {
2263 		rec_t*	rec = recs[slot];
2264 
2265 		d_stream->avail_out = static_cast<uInt>(
2266 			rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2267 
2268 		ut_ad(d_stream->avail_out < srv_page_size
2269 		      - PAGE_ZIP_START - PAGE_DIR);
2270 		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2271 		case Z_STREAM_END:
2272 			page_zip_decompress_heap_no(
2273 				d_stream, rec, heap_status);
2274 			goto zlib_done;
2275 		case Z_OK:
2276 		case Z_BUF_ERROR:
2277 			if (!d_stream->avail_out) {
2278 				break;
2279 			}
2280 			/* fall through */
2281 		default:
2282 			page_zip_fail(("page_zip_decompress_node_ptrs:"
2283 				       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2284 				       d_stream->msg));
2285 			goto zlib_error;
2286 		}
2287 
2288 		if (!page_zip_decompress_heap_no(
2289 			    d_stream, rec, heap_status)) {
2290 			ut_ad(0);
2291 		}
2292 
2293 		/* Read the offsets. The status bits are needed here. */
2294 		offsets = rec_get_offsets(rec, index, offsets, 0,
2295 					  ULINT_UNDEFINED, &heap);
2296 
2297 		/* Non-leaf nodes should not have any externally
2298 		stored columns. */
2299 		ut_ad(!rec_offs_any_extern(offsets));
2300 
2301 		/* Decompress the data bytes, except node_ptr. */
2302 		d_stream->avail_out =static_cast<uInt>(
2303 			rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE);
2304 
2305 		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2306 		case Z_STREAM_END:
2307 			goto zlib_done;
2308 		case Z_OK:
2309 		case Z_BUF_ERROR:
2310 			if (!d_stream->avail_out) {
2311 				break;
2312 			}
2313 			/* fall through */
2314 		default:
2315 			page_zip_fail(("page_zip_decompress_node_ptrs:"
2316 				       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2317 				       d_stream->msg));
2318 			goto zlib_error;
2319 		}
2320 
2321 		/* Clear the node pointer in case the record
2322 		will be deleted and the space will be reallocated
2323 		to a smaller record. */
2324 		memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
2325 		d_stream->next_out += REC_NODE_PTR_SIZE;
2326 
2327 		ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
2328 	}
2329 
2330 	/* Decompress any trailing garbage, in case the last record was
2331 	allocated from an originally longer space on the free list. */
2332 	d_stream->avail_out = static_cast<uInt>(
2333 		page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2334 		- page_offset(d_stream->next_out));
2335 	if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2336 			  - PAGE_ZIP_START - PAGE_DIR)) {
2337 
2338 		page_zip_fail(("page_zip_decompress_node_ptrs:"
2339 			       " avail_out = %u\n",
2340 			       d_stream->avail_out));
2341 		goto zlib_error;
2342 	}
2343 
2344 	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2345 		page_zip_fail(("page_zip_decompress_node_ptrs:"
2346 			       " inflate(Z_FINISH)=%s\n",
2347 			       d_stream->msg));
2348 zlib_error:
2349 		inflateEnd(d_stream);
2350 		return(FALSE);
2351 	}
2352 
2353 	/* Note that d_stream->avail_out > 0 may hold here
2354 	if the modification log is nonempty. */
2355 
2356 zlib_done:
2357 	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2358 		ut_error;
2359 	}
2360 
2361 	{
2362 		page_t*	page = page_align(d_stream->next_out);
2363 
2364 		/* Clear the unused heap space on the uncompressed page. */
2365 		memset(d_stream->next_out, 0,
2366 		       ulint(page_dir_get_nth_slot(page,
2367 						   page_dir_get_n_slots(page)
2368 						   - 1U)
2369 			     - d_stream->next_out));
2370 	}
2371 
2372 #ifdef UNIV_DEBUG
2373 	page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in);
2374 #endif /* UNIV_DEBUG */
2375 
2376 	/* Apply the modification log. */
2377 	{
2378 		const byte*	mod_log_ptr;
2379 		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2380 						 d_stream->avail_in + 1,
2381 						 recs, n_dense, 0,
2382 						 ULINT_UNDEFINED, heap_status,
2383 						 index, offsets);
2384 
2385 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
2386 			return(FALSE);
2387 		}
2388 		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2389 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2390 	}
2391 
2392 	if (UNIV_UNLIKELY
2393 	    (page_zip_get_trailer_len(page_zip,
2394 				      dict_index_is_clust(index))
2395 	     + page_zip->m_end >= page_zip_get_size(page_zip))) {
2396 		page_zip_fail(("page_zip_decompress_node_ptrs:"
2397 			       " %lu + %lu >= %lu, %lu\n",
2398 			       (ulong) page_zip_get_trailer_len(
2399 				       page_zip, dict_index_is_clust(index)),
2400 			       (ulong) page_zip->m_end,
2401 			       (ulong) page_zip_get_size(page_zip),
2402 			       (ulong) dict_index_is_clust(index)));
2403 		return(FALSE);
2404 	}
2405 
2406 	/* Restore the uncompressed columns in heap_no order. */
2407 	storage = page_zip_dir_start_low(page_zip, n_dense);
2408 
2409 	for (slot = 0; slot < n_dense; slot++) {
2410 		rec_t*		rec	= recs[slot];
2411 
2412 		offsets = rec_get_offsets(rec, index, offsets, 0,
2413 					  ULINT_UNDEFINED, &heap);
2414 		/* Non-leaf nodes should not have any externally
2415 		stored columns. */
2416 		ut_ad(!rec_offs_any_extern(offsets));
2417 		storage -= REC_NODE_PTR_SIZE;
2418 
2419 		memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
2420 		       storage, REC_NODE_PTR_SIZE);
2421 	}
2422 
2423 	return(TRUE);
2424 }
2425 
2426 /**********************************************************************//**
2427 Decompress the records of a leaf node of a secondary index.
2428 @return TRUE on success, FALSE on failure */
2429 static
2430 ibool
page_zip_decompress_sec(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,rec_offs * offsets)2431 page_zip_decompress_sec(
2432 /*====================*/
2433 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
2434 	z_stream*	d_stream,	/*!< in/out: compressed page stream */
2435 	rec_t**		recs,		/*!< in: dense page directory
2436 					sorted by address */
2437 	ulint		n_dense,	/*!< in: size of recs[] */
2438 	dict_index_t*	index,		/*!< in: the index of the page */
2439 	rec_offs*	offsets)	/*!< in/out: temporary offsets */
2440 {
2441 	ulint	heap_status	= REC_STATUS_ORDINARY
2442 		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2443 	ulint	slot;
2444 
2445 	ut_a(!dict_index_is_clust(index));
2446 
2447 	/* Subtract the space reserved for uncompressed data. */
2448 	d_stream->avail_in -= static_cast<uint>(
2449 		n_dense * PAGE_ZIP_DIR_SLOT_SIZE);
2450 
2451 	for (slot = 0; slot < n_dense; slot++) {
2452 		rec_t*	rec = recs[slot];
2453 
2454 		/* Decompress everything up to this record. */
2455 		d_stream->avail_out = static_cast<uint>(
2456 			rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2457 
2458 		if (UNIV_LIKELY(d_stream->avail_out)) {
2459 			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2460 			case Z_STREAM_END:
2461 				page_zip_decompress_heap_no(
2462 					d_stream, rec, heap_status);
2463 				goto zlib_done;
2464 			case Z_OK:
2465 			case Z_BUF_ERROR:
2466 				if (!d_stream->avail_out) {
2467 					break;
2468 				}
2469 				/* fall through */
2470 			default:
2471 				page_zip_fail(("page_zip_decompress_sec:"
2472 					       " inflate(Z_SYNC_FLUSH)=%s\n",
2473 					       d_stream->msg));
2474 				goto zlib_error;
2475 			}
2476 		}
2477 
2478 		if (!page_zip_decompress_heap_no(
2479 			    d_stream, rec, heap_status)) {
2480 			ut_ad(0);
2481 		}
2482 	}
2483 
2484 	/* Decompress the data of the last record and any trailing garbage,
2485 	in case the last record was allocated from an originally longer space
2486 	on the free list. */
2487 	d_stream->avail_out = static_cast<uInt>(
2488 		page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2489 		- page_offset(d_stream->next_out));
2490 	if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2491 			  - PAGE_ZIP_START - PAGE_DIR)) {
2492 
2493 		page_zip_fail(("page_zip_decompress_sec:"
2494 			       " avail_out = %u\n",
2495 			       d_stream->avail_out));
2496 		goto zlib_error;
2497 	}
2498 
2499 	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2500 		page_zip_fail(("page_zip_decompress_sec:"
2501 			       " inflate(Z_FINISH)=%s\n",
2502 			       d_stream->msg));
2503 zlib_error:
2504 		inflateEnd(d_stream);
2505 		return(FALSE);
2506 	}
2507 
2508 	/* Note that d_stream->avail_out > 0 may hold here
2509 	if the modification log is nonempty. */
2510 
2511 zlib_done:
2512 	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2513 		ut_error;
2514 	}
2515 
2516 	{
2517 		page_t*	page = page_align(d_stream->next_out);
2518 
2519 		/* Clear the unused heap space on the uncompressed page. */
2520 		memset(d_stream->next_out, 0,
2521 		       ulint(page_dir_get_nth_slot(page,
2522 						   page_dir_get_n_slots(page)
2523 						   - 1U)
2524 			     - d_stream->next_out));
2525 	}
2526 
2527 	ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
2528 
2529 	/* Apply the modification log. */
2530 	{
2531 		const byte*	mod_log_ptr;
2532 		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2533 						 d_stream->avail_in + 1,
2534 						 recs, n_dense,
2535 						 index->n_fields,
2536 						 ULINT_UNDEFINED, heap_status,
2537 						 index, offsets);
2538 
2539 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
2540 			return(FALSE);
2541 		}
2542 		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2543 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2544 	}
2545 
2546 	if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE)
2547 			  + page_zip->m_end >= page_zip_get_size(page_zip))) {
2548 
2549 		page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
2550 			       (ulong) page_zip_get_trailer_len(
2551 				       page_zip, FALSE),
2552 			       (ulong) page_zip->m_end,
2553 			       (ulong) page_zip_get_size(page_zip)));
2554 		return(FALSE);
2555 	}
2556 
2557 	/* There are no uncompressed columns on leaf pages of
2558 	secondary indexes. */
2559 
2560 	return(TRUE);
2561 }
2562 
2563 /**********************************************************************//**
2564 Decompress a record of a leaf node of a clustered index that contains
2565 externally stored columns.
2566 @return TRUE on success */
2567 static
2568 ibool
page_zip_decompress_clust_ext(z_stream * d_stream,rec_t * rec,const rec_offs * offsets,ulint trx_id_col)2569 page_zip_decompress_clust_ext(
2570 /*==========================*/
2571 	z_stream*	d_stream,	/*!< in/out: compressed page stream */
2572 	rec_t*		rec,		/*!< in/out: record */
2573 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec) */
2574 	ulint		trx_id_col)	/*!< in: position of of DB_TRX_ID */
2575 {
2576 	ulint	i;
2577 
2578 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
2579 		ulint	len;
2580 		byte*	dst;
2581 
2582 		if (UNIV_UNLIKELY(i == trx_id_col)) {
2583 			/* Skip trx_id and roll_ptr */
2584 			dst = rec_get_nth_field(rec, offsets, i, &len);
2585 			if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
2586 					  + DATA_ROLL_PTR_LEN)) {
2587 
2588 				page_zip_fail(("page_zip_decompress_clust_ext:"
2589 					       " len[%lu] = %lu\n",
2590 					       (ulong) i, (ulong) len));
2591 				return(FALSE);
2592 			}
2593 
2594 			if (rec_offs_nth_extern(offsets, i)) {
2595 
2596 				page_zip_fail(("page_zip_decompress_clust_ext:"
2597 					       " DB_TRX_ID at %lu is ext\n",
2598 					       (ulong) i));
2599 				return(FALSE);
2600 			}
2601 
2602 			d_stream->avail_out = static_cast<uInt>(
2603 				dst - d_stream->next_out);
2604 
2605 			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2606 			case Z_STREAM_END:
2607 			case Z_OK:
2608 			case Z_BUF_ERROR:
2609 				if (!d_stream->avail_out) {
2610 					break;
2611 				}
2612 				/* fall through */
2613 			default:
2614 				page_zip_fail(("page_zip_decompress_clust_ext:"
2615 					       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2616 					       d_stream->msg));
2617 				return(FALSE);
2618 			}
2619 
2620 			ut_ad(d_stream->next_out == dst);
2621 
2622 			/* Clear DB_TRX_ID and DB_ROLL_PTR in order to
2623 			avoid uninitialized bytes in case the record
2624 			is affected by page_zip_apply_log(). */
2625 			memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2626 
2627 			d_stream->next_out += DATA_TRX_ID_LEN
2628 				+ DATA_ROLL_PTR_LEN;
2629 		} else if (rec_offs_nth_extern(offsets, i)) {
2630 			dst = rec_get_nth_field(rec, offsets, i, &len);
2631 			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
2632 			dst += len - BTR_EXTERN_FIELD_REF_SIZE;
2633 
2634 			d_stream->avail_out = static_cast<uInt>(
2635 				dst - d_stream->next_out);
2636 			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2637 			case Z_STREAM_END:
2638 			case Z_OK:
2639 			case Z_BUF_ERROR:
2640 				if (!d_stream->avail_out) {
2641 					break;
2642 				}
2643 				/* fall through */
2644 			default:
2645 				page_zip_fail(("page_zip_decompress_clust_ext:"
2646 					       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2647 					       d_stream->msg));
2648 				return(FALSE);
2649 			}
2650 
2651 			ut_ad(d_stream->next_out == dst);
2652 
2653 			/* Clear the BLOB pointer in case
2654 			the record will be deleted and the
2655 			space will not be reused.  Note that
2656 			the final initialization of the BLOB
2657 			pointers (copying from "externs"
2658 			or clearing) will have to take place
2659 			only after the page modification log
2660 			has been applied.  Otherwise, we
2661 			could end up with an uninitialized
2662 			BLOB pointer when a record is deleted,
2663 			reallocated and deleted. */
2664 			memset(d_stream->next_out, 0,
2665 			       BTR_EXTERN_FIELD_REF_SIZE);
2666 			d_stream->next_out
2667 				+= BTR_EXTERN_FIELD_REF_SIZE;
2668 		}
2669 	}
2670 
2671 	return(TRUE);
2672 }
2673 
2674 /**********************************************************************//**
2675 Compress the records of a leaf node of a clustered index.
2676 @return TRUE on success, FALSE on failure */
2677 static
2678 ibool
page_zip_decompress_clust(page_zip_des_t * page_zip,z_stream * d_stream,rec_t ** recs,ulint n_dense,dict_index_t * index,ulint trx_id_col,rec_offs * offsets,mem_heap_t * heap)2679 page_zip_decompress_clust(
2680 /*======================*/
2681 	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
2682 	z_stream*	d_stream,	/*!< in/out: compressed page stream */
2683 	rec_t**		recs,		/*!< in: dense page directory
2684 					sorted by address */
2685 	ulint		n_dense,	/*!< in: size of recs[] */
2686 	dict_index_t*	index,		/*!< in: the index of the page */
2687 	ulint		trx_id_col,	/*!< index of the trx_id column */
2688 	rec_offs*	offsets,	/*!< in/out: temporary offsets */
2689 	mem_heap_t*	heap)		/*!< in: temporary memory heap */
2690 {
2691 	int		err;
2692 	ulint		slot;
2693 	ulint		heap_status	= REC_STATUS_ORDINARY
2694 		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
2695 	const byte*	storage;
2696 	const byte*	externs;
2697 
2698 	ut_a(dict_index_is_clust(index));
2699 
2700 	/* Subtract the space reserved for uncompressed data. */
2701 	d_stream->avail_in -= static_cast<uInt>(n_dense)
2702 			    * (PAGE_ZIP_CLUST_LEAF_SLOT_SIZE);
2703 
2704 	/* Decompress the records in heap_no order. */
2705 	for (slot = 0; slot < n_dense; slot++) {
2706 		rec_t*	rec	= recs[slot];
2707 
2708 		d_stream->avail_out =static_cast<uInt>(
2709 			rec - REC_N_NEW_EXTRA_BYTES - d_stream->next_out);
2710 
2711 		ut_ad(d_stream->avail_out < srv_page_size
2712 		      - PAGE_ZIP_START - PAGE_DIR);
2713 		err = inflate(d_stream, Z_SYNC_FLUSH);
2714 		switch (err) {
2715 		case Z_STREAM_END:
2716 			page_zip_decompress_heap_no(
2717 				d_stream, rec, heap_status);
2718 			goto zlib_done;
2719 		case Z_OK:
2720 		case Z_BUF_ERROR:
2721 			if (UNIV_LIKELY(!d_stream->avail_out)) {
2722 				break;
2723 			}
2724 			/* fall through */
2725 		default:
2726 			page_zip_fail(("page_zip_decompress_clust:"
2727 				       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
2728 				       d_stream->msg));
2729 			goto zlib_error;
2730 		}
2731 
2732 		if (!page_zip_decompress_heap_no(
2733 			    d_stream, rec, heap_status)) {
2734 			ut_ad(0);
2735 		}
2736 
2737 		/* Read the offsets. The status bits are needed here. */
2738 		offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
2739 					  ULINT_UNDEFINED, &heap);
2740 
2741 		/* This is a leaf page in a clustered index. */
2742 
2743 		/* Check if there are any externally stored columns.
2744 		For each externally stored column, restore the
2745 		BTR_EXTERN_FIELD_REF separately. */
2746 
2747 		if (rec_offs_any_extern(offsets)) {
2748 			if (UNIV_UNLIKELY
2749 			    (!page_zip_decompress_clust_ext(
2750 				    d_stream, rec, offsets, trx_id_col))) {
2751 
2752 				goto zlib_error;
2753 			}
2754 		} else {
2755 			/* Skip trx_id and roll_ptr */
2756 			ulint	len;
2757 			byte*	dst = rec_get_nth_field(rec, offsets,
2758 							trx_id_col, &len);
2759 			if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
2760 					  + DATA_ROLL_PTR_LEN)) {
2761 
2762 				page_zip_fail(("page_zip_decompress_clust:"
2763 					       " len = %lu\n", (ulong) len));
2764 				goto zlib_error;
2765 			}
2766 
2767 			d_stream->avail_out = static_cast<uInt>(
2768 				dst - d_stream->next_out);
2769 
2770 			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2771 			case Z_STREAM_END:
2772 			case Z_OK:
2773 			case Z_BUF_ERROR:
2774 				if (!d_stream->avail_out) {
2775 					break;
2776 				}
2777 				/* fall through */
2778 			default:
2779 				page_zip_fail(("page_zip_decompress_clust:"
2780 					       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
2781 					       d_stream->msg));
2782 				goto zlib_error;
2783 			}
2784 
2785 			ut_ad(d_stream->next_out == dst);
2786 
2787 			/* Clear DB_TRX_ID and DB_ROLL_PTR in order to
2788 			avoid uninitialized bytes in case the record
2789 			is affected by page_zip_apply_log(). */
2790 			memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2791 
2792 			d_stream->next_out += DATA_TRX_ID_LEN
2793 				+ DATA_ROLL_PTR_LEN;
2794 		}
2795 
2796 		/* Decompress the last bytes of the record. */
2797 		d_stream->avail_out = static_cast<uInt>(
2798 			rec_get_end(rec, offsets) - d_stream->next_out);
2799 
2800 		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
2801 		case Z_STREAM_END:
2802 		case Z_OK:
2803 		case Z_BUF_ERROR:
2804 			if (!d_stream->avail_out) {
2805 				break;
2806 			}
2807 			/* fall through */
2808 		default:
2809 			page_zip_fail(("page_zip_decompress_clust:"
2810 				       " 3 inflate(Z_SYNC_FLUSH)=%s\n",
2811 				       d_stream->msg));
2812 			goto zlib_error;
2813 		}
2814 	}
2815 
2816 	/* Decompress any trailing garbage, in case the last record was
2817 	allocated from an originally longer space on the free list. */
2818 	d_stream->avail_out = static_cast<uInt>(
2819 		page_header_get_field(page_zip->data, PAGE_HEAP_TOP)
2820 		- page_offset(d_stream->next_out));
2821 	if (UNIV_UNLIKELY(d_stream->avail_out > srv_page_size
2822 			  - PAGE_ZIP_START - PAGE_DIR)) {
2823 
2824 		page_zip_fail(("page_zip_decompress_clust:"
2825 			       " avail_out = %u\n",
2826 			       d_stream->avail_out));
2827 		goto zlib_error;
2828 	}
2829 
2830 	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
2831 		page_zip_fail(("page_zip_decompress_clust:"
2832 			       " inflate(Z_FINISH)=%s\n",
2833 			       d_stream->msg));
2834 zlib_error:
2835 		inflateEnd(d_stream);
2836 		return(FALSE);
2837 	}
2838 
2839 	/* Note that d_stream->avail_out > 0 may hold here
2840 	if the modification log is nonempty. */
2841 
2842 zlib_done:
2843 	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
2844 		ut_error;
2845 	}
2846 
2847 	{
2848 		page_t*	page = page_align(d_stream->next_out);
2849 
2850 		/* Clear the unused heap space on the uncompressed page. */
2851 		memset(d_stream->next_out, 0,
2852 		       ulint(page_dir_get_nth_slot(page,
2853 						   page_dir_get_n_slots(page)
2854 						   - 1U)
2855 			     - d_stream->next_out));
2856 	}
2857 
2858 	ut_d(page_zip->m_start = uint16_t(PAGE_DATA + d_stream->total_in));
2859 
2860 	/* Apply the modification log. */
2861 	{
2862 		const byte*	mod_log_ptr;
2863 		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
2864 						 d_stream->avail_in + 1,
2865 						 recs, n_dense,
2866 						 index->n_fields,
2867 						 trx_id_col, heap_status,
2868 						 index, offsets);
2869 
2870 		if (UNIV_UNLIKELY(!mod_log_ptr)) {
2871 			return(FALSE);
2872 		}
2873 		page_zip->m_end = uint16_t(mod_log_ptr - page_zip->data);
2874 		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
2875 	}
2876 
2877 	if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE)
2878 			  + page_zip->m_end >= page_zip_get_size(page_zip))) {
2879 
2880 		page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
2881 			       (ulong) page_zip_get_trailer_len(
2882 				       page_zip, TRUE),
2883 			       (ulong) page_zip->m_end,
2884 			       (ulong) page_zip_get_size(page_zip)));
2885 		return(FALSE);
2886 	}
2887 
2888 	storage = page_zip_dir_start_low(page_zip, n_dense);
2889 
2890 	externs = storage - n_dense
2891 		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2892 
2893 	/* Restore the uncompressed columns in heap_no order. */
2894 
2895 	for (slot = 0; slot < n_dense; slot++) {
2896 		ulint	i;
2897 		ulint	len;
2898 		byte*	dst;
2899 		rec_t*	rec	= recs[slot];
2900 		bool	exists	= !page_zip_dir_find_free(
2901 			page_zip, page_offset(rec));
2902 		offsets = rec_get_offsets(rec, index, offsets, index->n_fields,
2903 					  ULINT_UNDEFINED, &heap);
2904 
2905 		dst = rec_get_nth_field(rec, offsets,
2906 					trx_id_col, &len);
2907 		ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2908 		storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
2909 		memcpy(dst, storage,
2910 		       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
2911 
2912 		/* Check if there are any externally stored
2913 		columns in this record.  For each externally
2914 		stored column, restore or clear the
2915 		BTR_EXTERN_FIELD_REF. */
2916 		if (!rec_offs_any_extern(offsets)) {
2917 			continue;
2918 		}
2919 
2920 		for (i = 0; i < rec_offs_n_fields(offsets); i++) {
2921 			if (!rec_offs_nth_extern(offsets, i)) {
2922 				continue;
2923 			}
2924 			dst = rec_get_nth_field(rec, offsets, i, &len);
2925 
2926 			if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
2927 				page_zip_fail(("page_zip_decompress_clust:"
2928 					       " %lu < 20\n",
2929 					       (ulong) len));
2930 				return(FALSE);
2931 			}
2932 
2933 			dst += len - BTR_EXTERN_FIELD_REF_SIZE;
2934 
2935 			if (UNIV_LIKELY(exists)) {
2936 				/* Existing record:
2937 				restore the BLOB pointer */
2938 				externs -= BTR_EXTERN_FIELD_REF_SIZE;
2939 
2940 				if (UNIV_UNLIKELY
2941 				    (externs < page_zip->data
2942 				     + page_zip->m_end)) {
2943 					page_zip_fail(("page_zip_"
2944 						       "decompress_clust:"
2945 						       " %p < %p + %lu\n",
2946 						       (const void*) externs,
2947 						       (const void*)
2948 						       page_zip->data,
2949 						       (ulong)
2950 						       page_zip->m_end));
2951 					return(FALSE);
2952 				}
2953 
2954 				memcpy(dst, externs,
2955 				       BTR_EXTERN_FIELD_REF_SIZE);
2956 
2957 				page_zip->n_blobs++;
2958 			} else {
2959 				/* Deleted record:
2960 				clear the BLOB pointer */
2961 				memset(dst, 0,
2962 				       BTR_EXTERN_FIELD_REF_SIZE);
2963 			}
2964 		}
2965 	}
2966 
2967 	return(TRUE);
2968 }
2969 
2970 /**********************************************************************//**
2971 Decompress a page.  This function should tolerate errors on the compressed
2972 page.  Instead of letting assertions fail, it will return FALSE if an
2973 inconsistency is detected.
2974 @return TRUE on success, FALSE on failure */
2975 static
2976 ibool
page_zip_decompress_low(page_zip_des_t * page_zip,page_t * page,ibool all)2977 page_zip_decompress_low(
2978 /*====================*/
2979 	page_zip_des_t*	page_zip,/*!< in: data, ssize;
2980 				out: m_start, m_end, m_nonempty, n_blobs */
2981 	page_t*		page,	/*!< out: uncompressed page, may be trashed */
2982 	ibool		all)	/*!< in: TRUE=decompress the whole page;
2983 				FALSE=verify but do not copy some
2984 				page header fields that should not change
2985 				after page creation */
2986 {
2987 	z_stream	d_stream;
2988 	dict_index_t*	index	= NULL;
2989 	rec_t**		recs;	/*!< dense page directory, sorted by address */
2990 	ulint		n_dense;/* number of user records on the page */
2991 	ulint		trx_id_col = ULINT_UNDEFINED;
2992 	mem_heap_t*	heap;
2993 	rec_offs*	offsets;
2994 
2995 	ut_ad(page_zip_simple_validate(page_zip));
2996 	MEM_CHECK_ADDRESSABLE(page, srv_page_size);
2997 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
2998 
2999 	/* The dense directory excludes the infimum and supremum records. */
3000 	n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
3001 	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
3002 			  >= page_zip_get_size(page_zip))) {
3003 		page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
3004 			       (ulong) n_dense,
3005 			       (ulong) page_zip_get_size(page_zip)));
3006 		return(FALSE);
3007 	}
3008 
3009 	heap = mem_heap_create(n_dense * (3 * sizeof *recs) + srv_page_size);
3010 
3011 	recs = static_cast<rec_t**>(
3012 		mem_heap_alloc(heap, n_dense * sizeof *recs));
3013 
3014 	if (all) {
3015 		/* Copy the page header. */
3016 		memcpy_aligned<2>(page, page_zip->data, PAGE_DATA);
3017 	} else {
3018 		/* Check that the bytes that we skip are identical. */
3019 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
3020 		ut_a(!memcmp(FIL_PAGE_TYPE + page,
3021 			     FIL_PAGE_TYPE + page_zip->data,
3022 			     PAGE_HEADER - FIL_PAGE_TYPE));
3023 		ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
3024 			     PAGE_HEADER + PAGE_LEVEL + page_zip->data,
3025 			     PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
3026 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
3027 
3028 		/* Copy the mutable parts of the page header. */
3029 		memcpy_aligned<8>(page, page_zip->data, FIL_PAGE_TYPE);
3030 		memcpy_aligned<2>(PAGE_HEADER + page,
3031 				  PAGE_HEADER + page_zip->data,
3032 				  PAGE_LEVEL - PAGE_N_DIR_SLOTS);
3033 
3034 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
3035 		/* Check that the page headers match after copying. */
3036 		ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
3037 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
3038 	}
3039 
3040 #ifdef UNIV_ZIP_DEBUG
3041 	/* Clear the uncompressed page, except the header. */
3042 	memset(PAGE_DATA + page, 0x55, srv_page_size - PAGE_DATA);
3043 #endif /* UNIV_ZIP_DEBUG */
3044 	MEM_UNDEFINED(PAGE_DATA + page, srv_page_size - PAGE_DATA);
3045 
3046 	/* Copy the page directory. */
3047 	if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
3048 					       n_dense))) {
3049 zlib_error:
3050 		mem_heap_free(heap);
3051 		return(FALSE);
3052 	}
3053 
3054 	/* Copy the infimum and supremum records. */
3055 	memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
3056 	       infimum_extra, sizeof infimum_extra);
3057 	if (page_is_empty(page)) {
3058 		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
3059 				      PAGE_NEW_SUPREMUM);
3060 	} else {
3061 		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
3062 				      page_zip_dir_get(page_zip, 0)
3063 				      & PAGE_ZIP_DIR_SLOT_MASK);
3064 	}
3065 	memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
3066 	memcpy_aligned<4>(PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1
3067 			  + page, supremum_extra_data,
3068 			  sizeof supremum_extra_data);
3069 
3070 	page_zip_set_alloc(&d_stream, heap);
3071 
3072 	d_stream.next_in = page_zip->data + PAGE_DATA;
3073 	/* Subtract the space reserved for
3074 	the page header and the end marker of the modification log. */
3075 	d_stream.avail_in = static_cast<uInt>(
3076 		page_zip_get_size(page_zip) - (PAGE_DATA + 1));
3077 	d_stream.next_out = page + PAGE_ZIP_START;
3078 	d_stream.avail_out = uInt(srv_page_size - PAGE_ZIP_START);
3079 
3080 	if (UNIV_UNLIKELY(inflateInit2(&d_stream, int(srv_page_size_shift))
3081 			  != Z_OK)) {
3082 		ut_error;
3083 	}
3084 
3085 	/* Decode the zlib header and the index information. */
3086 	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
3087 
3088 		page_zip_fail(("page_zip_decompress:"
3089 			       " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
3090 		goto zlib_error;
3091 	}
3092 
3093 	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
3094 
3095 		page_zip_fail(("page_zip_decompress:"
3096 			       " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
3097 		goto zlib_error;
3098 	}
3099 
3100 	index = page_zip_fields_decode(
3101 		page + PAGE_ZIP_START, d_stream.next_out,
3102 		page_is_leaf(page) ? &trx_id_col : NULL,
3103 		fil_page_get_type(page) == FIL_PAGE_RTREE);
3104 
3105 	if (UNIV_UNLIKELY(!index)) {
3106 
3107 		goto zlib_error;
3108 	}
3109 
3110 	/* Decompress the user records. */
3111 	page_zip->n_blobs = 0;
3112 	d_stream.next_out = page + PAGE_ZIP_START;
3113 
3114 	{
3115 		/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
3116 		ulint	n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
3117 			+ dict_index_get_n_fields(index);
3118 
3119 		offsets = static_cast<rec_offs*>(
3120 			mem_heap_alloc(heap, n * sizeof(ulint)));
3121 
3122 		rec_offs_set_n_alloc(offsets, n);
3123 	}
3124 
3125 	/* Decompress the records in heap_no order. */
3126 	if (!page_is_leaf(page)) {
3127 		/* This is a node pointer page. */
3128 		ulint	info_bits;
3129 
3130 		if (UNIV_UNLIKELY
3131 		    (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
3132 						    recs, n_dense, index,
3133 						    offsets, heap))) {
3134 			goto err_exit;
3135 		}
3136 
3137 		info_bits = page_has_prev(page) ? 0 : REC_INFO_MIN_REC_FLAG;
3138 
3139 		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
3140 							    info_bits))) {
3141 			goto err_exit;
3142 		}
3143 	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
3144 		/* This is a leaf page in a secondary index. */
3145 		if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
3146 							   recs, n_dense,
3147 							   index, offsets))) {
3148 			goto err_exit;
3149 		}
3150 
3151 		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
3152 							    page, 0))) {
3153 err_exit:
3154 			page_zip_fields_free(index);
3155 			mem_heap_free(heap);
3156 			return(FALSE);
3157 		}
3158 	} else {
3159 		/* This is a leaf page in a clustered index. */
3160 		if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
3161 							     &d_stream, recs,
3162 							     n_dense, index,
3163 							     trx_id_col,
3164 							     offsets, heap))) {
3165 			goto err_exit;
3166 		}
3167 
3168 		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
3169 							    page, 0))) {
3170 			goto err_exit;
3171 		}
3172 	}
3173 
3174 	ut_a(page_is_comp(page));
3175 	MEM_CHECK_DEFINED(page, srv_page_size);
3176 
3177 	page_zip_fields_free(index);
3178 	mem_heap_free(heap);
3179 
3180 	return(TRUE);
3181 }
3182 
3183 /**********************************************************************//**
3184 Decompress a page.  This function should tolerate errors on the compressed
3185 page.  Instead of letting assertions fail, it will return FALSE if an
3186 inconsistency is detected.
3187 @return TRUE on success, FALSE on failure */
3188 ibool
page_zip_decompress(page_zip_des_t * page_zip,page_t * page,ibool all)3189 page_zip_decompress(
3190 /*================*/
3191 	page_zip_des_t*	page_zip,/*!< in: data, ssize;
3192 				out: m_start, m_end, m_nonempty, n_blobs */
3193 	page_t*		page,	/*!< out: uncompressed page, may be trashed */
3194 	ibool		all)	/*!< in: TRUE=decompress the whole page;
3195 				FALSE=verify but do not copy some
3196 				page header fields that should not change
3197 				after page creation */
3198 {
3199 	const ulonglong ns = my_interval_timer();
3200 
3201 	if (!page_zip_decompress_low(page_zip, page, all)) {
3202 		return(FALSE);
3203 	}
3204 
3205 	const uint64_t time_diff = (my_interval_timer() - ns) / 1000;
3206 	page_zip_stat[page_zip->ssize - 1].decompressed++;
3207 	page_zip_stat[page_zip->ssize - 1].decompressed_usec += time_diff;
3208 
3209 	index_id_t	index_id = btr_page_get_index_id(page);
3210 
3211 	if (srv_cmp_per_index_enabled) {
3212 		mutex_enter(&page_zip_stat_per_index_mutex);
3213 		page_zip_stat_per_index[index_id].decompressed++;
3214 		page_zip_stat_per_index[index_id].decompressed_usec += time_diff;
3215 		mutex_exit(&page_zip_stat_per_index_mutex);
3216 	}
3217 
3218 	/* Update the stat counter for LRU policy. */
3219 	buf_LRU_stat_inc_unzip();
3220 
3221 	MONITOR_INC(MONITOR_PAGE_DECOMPRESS);
3222 
3223 	return(TRUE);
3224 }
3225 
3226 #ifdef UNIV_ZIP_DEBUG
3227 /**********************************************************************//**
3228 Dump a block of memory on the standard error stream. */
3229 static
3230 void
page_zip_hexdump_func(const char * name,const void * buf,ulint size)3231 page_zip_hexdump_func(
3232 /*==================*/
3233 	const char*	name,	/*!< in: name of the data structure */
3234 	const void*	buf,	/*!< in: data */
3235 	ulint		size)	/*!< in: length of the data, in bytes */
3236 {
3237 	const byte*	s	= static_cast<const byte*>(buf);
3238 	ulint		addr;
3239 	const ulint	width	= 32; /* bytes per line */
3240 
3241 	fprintf(stderr, "%s:\n", name);
3242 
3243 	for (addr = 0; addr < size; addr += width) {
3244 		ulint	i;
3245 
3246 		fprintf(stderr, "%04lx ", (ulong) addr);
3247 
3248 		i = ut_min(width, size - addr);
3249 
3250 		while (i--) {
3251 			fprintf(stderr, "%02x", *s++);
3252 		}
3253 
3254 		putc('\n', stderr);
3255 	}
3256 }
3257 
3258 /** Dump a block of memory on the standard error stream.
3259 @param buf in: data
3260 @param size in: length of the data, in bytes */
3261 #define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
3262 
3263 /** Flag: make page_zip_validate() compare page headers only */
3264 bool	page_zip_validate_header_only;
3265 
3266 /**********************************************************************//**
3267 Check that the compressed and decompressed pages match.
3268 @return TRUE if valid, FALSE if not */
3269 ibool
page_zip_validate_low(const page_zip_des_t * page_zip,const page_t * page,const dict_index_t * index,ibool sloppy)3270 page_zip_validate_low(
3271 /*==================*/
3272 	const page_zip_des_t*	page_zip,/*!< in: compressed page */
3273 	const page_t*		page,	/*!< in: uncompressed page */
3274 	const dict_index_t*	index,	/*!< in: index of the page, if known */
3275 	ibool			sloppy)	/*!< in: FALSE=strict,
3276 					TRUE=ignore the MIN_REC_FLAG */
3277 {
3278 	page_zip_des_t	temp_page_zip;
3279 	ibool		valid;
3280 
3281 	if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
3282 		   FIL_PAGE_LSN - FIL_PAGE_PREV)
3283 	    || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
3284 	    || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
3285 		      PAGE_ROOT_AUTO_INC)
3286 	    /* The PAGE_ROOT_AUTO_INC can be updated while holding an SX-latch
3287 	    on the clustered index root page (page number 3 in .ibd files).
3288 	    That allows concurrent readers (holding buf_block_t::lock S-latch).
3289 	    Because we do not know what type of a latch our caller is holding,
3290 	    we will ignore the field on clustered index root pages in order
3291 	    to avoid false positives. */
3292 	    || (page_get_page_no(page) != 3/* clustered index root page */
3293 		&& memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC],
3294 			  &page[FIL_PAGE_DATA + PAGE_ROOT_AUTO_INC], 8))
3295 	    || memcmp(&page_zip->data[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
3296 		      &page[FIL_PAGE_DATA + PAGE_HEADER_PRIV_END],
3297 		      PAGE_DATA - FIL_PAGE_DATA - PAGE_HEADER_PRIV_END)) {
3298 		page_zip_fail(("page_zip_validate: page header\n"));
3299 		page_zip_hexdump(page_zip, sizeof *page_zip);
3300 		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
3301 		page_zip_hexdump(page, srv_page_size);
3302 		return(FALSE);
3303 	}
3304 
3305 	ut_a(page_is_comp(page));
3306 
3307 	if (page_zip_validate_header_only) {
3308 		return(TRUE);
3309 	}
3310 
3311 	/* page_zip_decompress() expects the uncompressed page to be
3312 	srv_page_size aligned. */
3313 	page_t* temp_page = static_cast<byte*>(aligned_malloc(srv_page_size,
3314 							      srv_page_size));
3315 
3316 	MEM_CHECK_DEFINED(page, srv_page_size);
3317 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3318 
3319 	temp_page_zip = *page_zip;
3320 	valid = page_zip_decompress_low(&temp_page_zip, temp_page, TRUE);
3321 	if (!valid) {
3322 		fputs("page_zip_validate(): failed to decompress\n", stderr);
3323 		goto func_exit;
3324 	}
3325 	if (page_zip->n_blobs != temp_page_zip.n_blobs) {
3326 		page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
3327 			       page_zip->n_blobs, temp_page_zip.n_blobs));
3328 		valid = FALSE;
3329 	}
3330 #ifdef UNIV_DEBUG
3331 	if (page_zip->m_start != temp_page_zip.m_start) {
3332 		page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
3333 			       page_zip->m_start, temp_page_zip.m_start));
3334 		valid = FALSE;
3335 	}
3336 #endif /* UNIV_DEBUG */
3337 	if (page_zip->m_end != temp_page_zip.m_end) {
3338 		page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
3339 			       page_zip->m_end, temp_page_zip.m_end));
3340 		valid = FALSE;
3341 	}
3342 	if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
3343 		page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
3344 			       page_zip->m_nonempty,
3345 			       temp_page_zip.m_nonempty));
3346 		valid = FALSE;
3347 	}
3348 	if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
3349 		   srv_page_size - PAGE_HEADER - FIL_PAGE_DATA_END)) {
3350 
3351 		/* In crash recovery, the "minimum record" flag may be
3352 		set incorrectly until the mini-transaction is
3353 		committed.  Let us tolerate that difference when we
3354 		are performing a sloppy validation. */
3355 
3356 		rec_offs*	offsets;
3357 		mem_heap_t*	heap;
3358 		const rec_t*	rec;
3359 		const rec_t*	trec;
3360 		byte		info_bits_diff;
3361 		ulint		offset
3362 			= rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE);
3363 		ut_a(offset >= PAGE_NEW_SUPREMUM);
3364 		offset -= 5/*REC_NEW_INFO_BITS*/;
3365 
3366 		info_bits_diff = page[offset] ^ temp_page[offset];
3367 
3368 		if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
3369 			temp_page[offset] = page[offset];
3370 
3371 			if (!memcmp(page + PAGE_HEADER,
3372 				    temp_page + PAGE_HEADER,
3373 				    srv_page_size - PAGE_HEADER
3374 				    - FIL_PAGE_DATA_END)) {
3375 
3376 				/* Only the minimum record flag
3377 				differed.  Let us ignore it. */
3378 				page_zip_fail(("page_zip_validate:"
3379 					       " min_rec_flag"
3380 					       " (%s" ULINTPF "," ULINTPF
3381 					       ",0x%02x)\n",
3382 					       sloppy ? "ignored, " : "",
3383 					       page_get_space_id(page),
3384 					       page_get_page_no(page),
3385 					       page[offset]));
3386 				/* We don't check for spatial index, since
3387 				the "minimum record" could be deleted when
3388 				doing rtr_update_mbr_field.
3389 				GIS_FIXME: need to validate why
3390 				rtr_update_mbr_field.() could affect this */
3391 				if (index && dict_index_is_spatial(index)) {
3392 					valid = true;
3393 				} else {
3394 					valid = sloppy;
3395 				}
3396 				goto func_exit;
3397 			}
3398 		}
3399 
3400 		/* Compare the pointers in the PAGE_FREE list. */
3401 		rec = page_header_get_ptr(page, PAGE_FREE);
3402 		trec = page_header_get_ptr(temp_page, PAGE_FREE);
3403 
3404 		while (rec || trec) {
3405 			if (page_offset(rec) != page_offset(trec)) {
3406 				page_zip_fail(("page_zip_validate:"
3407 					       " PAGE_FREE list: %u!=%u\n",
3408 					       (unsigned) page_offset(rec),
3409 					       (unsigned) page_offset(trec)));
3410 				valid = FALSE;
3411 				goto func_exit;
3412 			}
3413 
3414 			rec = page_rec_get_next_low(rec, TRUE);
3415 			trec = page_rec_get_next_low(trec, TRUE);
3416 		}
3417 
3418 		/* Compare the records. */
3419 		heap = NULL;
3420 		offsets = NULL;
3421 		rec = page_rec_get_next_low(
3422 			page + PAGE_NEW_INFIMUM, TRUE);
3423 		trec = page_rec_get_next_low(
3424 			temp_page + PAGE_NEW_INFIMUM, TRUE);
3425 		const ulint n_core = page_is_leaf(page) ? index->n_fields : 0;
3426 
3427 		do {
3428 			if (page_offset(rec) != page_offset(trec)) {
3429 				page_zip_fail(("page_zip_validate:"
3430 					       " record list: 0x%02x!=0x%02x\n",
3431 					       (unsigned) page_offset(rec),
3432 					       (unsigned) page_offset(trec)));
3433 				valid = FALSE;
3434 				break;
3435 			}
3436 
3437 			if (index) {
3438 				/* Compare the data. */
3439 				offsets = rec_get_offsets(
3440 					rec, index, offsets, n_core,
3441 					ULINT_UNDEFINED, &heap);
3442 
3443 				if (memcmp(rec - rec_offs_extra_size(offsets),
3444 					   trec - rec_offs_extra_size(offsets),
3445 					   rec_offs_size(offsets))) {
3446 					page_zip_fail(
3447 						("page_zip_validate:"
3448 						 " record content: 0x%02x",
3449 						 (unsigned) page_offset(rec)));
3450 					valid = FALSE;
3451 					break;
3452 				}
3453 			}
3454 
3455 			rec = page_rec_get_next_low(rec, TRUE);
3456 			trec = page_rec_get_next_low(trec, TRUE);
3457 		} while (rec || trec);
3458 
3459 		if (heap) {
3460 			mem_heap_free(heap);
3461 		}
3462 	}
3463 
3464 func_exit:
3465 	if (!valid) {
3466 		page_zip_hexdump(page_zip, sizeof *page_zip);
3467 		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
3468 		page_zip_hexdump(page, srv_page_size);
3469 		page_zip_hexdump(temp_page, srv_page_size);
3470 	}
3471 	aligned_free(temp_page);
3472 	return(valid);
3473 }
3474 
3475 /**********************************************************************//**
3476 Check that the compressed and decompressed pages match.
3477 @return TRUE if valid, FALSE if not */
3478 ibool
page_zip_validate(const page_zip_des_t * page_zip,const page_t * page,const dict_index_t * index)3479 page_zip_validate(
3480 /*==============*/
3481 	const page_zip_des_t*	page_zip,/*!< in: compressed page */
3482 	const page_t*		page,	/*!< in: uncompressed page */
3483 	const dict_index_t*	index)	/*!< in: index of the page, if known */
3484 {
3485 	return(page_zip_validate_low(page_zip, page, index,
3486 				     recv_recovery_is_on()));
3487 }
3488 #endif /* UNIV_ZIP_DEBUG */
3489 
3490 #ifdef UNIV_DEBUG
3491 /**********************************************************************//**
3492 Assert that the compressed and decompressed page headers match.
3493 @return TRUE */
3494 static
3495 ibool
page_zip_header_cmp(const page_zip_des_t * page_zip,const byte * page)3496 page_zip_header_cmp(
3497 /*================*/
3498 	const page_zip_des_t*	page_zip,/*!< in: compressed page */
3499 	const byte*		page)	/*!< in: uncompressed page */
3500 {
3501 	ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
3502 		      FIL_PAGE_LSN - FIL_PAGE_PREV));
3503 	ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
3504 		      2));
3505 	ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
3506 		      PAGE_DATA - FIL_PAGE_DATA));
3507 
3508 	return(TRUE);
3509 }
3510 #endif /* UNIV_DEBUG */
3511 
3512 /**********************************************************************//**
3513 Write a record on the compressed page that contains externally stored
3514 columns.  The data must already have been written to the uncompressed page.
3515 @return end of modification log */
3516 static
3517 byte*
page_zip_write_rec_ext(buf_block_t * block,const byte * rec,const dict_index_t * index,const rec_offs * offsets,ulint create,ulint trx_id_col,ulint heap_no,byte * storage,byte * data,mtr_t * mtr)3518 page_zip_write_rec_ext(
3519 /*===================*/
3520 	buf_block_t*	block,		/*!< in/out: compressed page */
3521 	const byte*	rec,		/*!< in: record being written */
3522 	const dict_index_t*index,	/*!< in: record descriptor */
3523 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
3524 	ulint		create,		/*!< in: nonzero=insert, zero=update */
3525 	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
3526 	ulint		heap_no,	/*!< in: heap number of rec */
3527 	byte*		storage,	/*!< in: end of dense page directory */
3528 	byte*		data,		/*!< in: end of modification log */
3529 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
3530 {
3531 	const byte*	start	= rec;
3532 	ulint		i;
3533 	ulint		len;
3534 	byte*		externs	= storage;
3535 	ulint		n_ext	= rec_offs_n_extern(offsets);
3536 	const page_t* const page = block->frame;
3537 	page_zip_des_t* const page_zip = &block->page.zip;
3538 
3539 	ut_ad(rec_offs_validate(rec, index, offsets));
3540 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
3541 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
3542 			  rec_offs_extra_size(offsets));
3543 
3544 	externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
3545 		* (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
3546 
3547 	/* Note that this will not take into account
3548 	the BLOB columns of rec if create==TRUE. */
3549 	ut_ad(data + rec_offs_data_size(offsets)
3550 	      - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
3551 	      - n_ext * FIELD_REF_SIZE
3552 	      < externs - FIELD_REF_SIZE * page_zip->n_blobs);
3553 
3554 	if (n_ext) {
3555 		ulint	blob_no = page_zip_get_n_prev_extern(
3556 			page_zip, rec, index);
3557 		byte*	ext_end = externs - page_zip->n_blobs * FIELD_REF_SIZE;
3558 		ut_ad(blob_no <= page_zip->n_blobs);
3559 		externs -= blob_no * FIELD_REF_SIZE;
3560 
3561 		if (create) {
3562 			page_zip->n_blobs = (page_zip->n_blobs + n_ext)
3563 				& ((1U << 12) - 1);
3564 			ASSERT_ZERO_BLOB(ext_end - n_ext * FIELD_REF_SIZE);
3565 			if (ulint len = ulint(externs - ext_end)) {
3566 				byte* ext_start = ext_end
3567 					- n_ext * FIELD_REF_SIZE;
3568 				memmove(ext_start, ext_end, len);
3569 				mtr->memmove(*block,
3570 					     ext_start - page_zip->data,
3571 					     ext_end - page_zip->data, len);
3572 			}
3573 		}
3574 
3575 		ut_a(blob_no + n_ext <= page_zip->n_blobs);
3576 	}
3577 
3578 	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
3579 		const byte*	src;
3580 
3581 		if (UNIV_UNLIKELY(i == trx_id_col)) {
3582 			ut_ad(!rec_offs_nth_extern(offsets,
3583 						   i));
3584 			ut_ad(!rec_offs_nth_extern(offsets,
3585 						   i + 1));
3586 			/* Locate trx_id and roll_ptr. */
3587 			src = rec_get_nth_field(rec, offsets,
3588 						i, &len);
3589 			ut_ad(len == DATA_TRX_ID_LEN);
3590 			ut_ad(src + DATA_TRX_ID_LEN
3591 			      == rec_get_nth_field(
3592 				      rec, offsets,
3593 				      i + 1, &len));
3594 			ut_ad(len == DATA_ROLL_PTR_LEN);
3595 
3596 			/* Log the preceding fields. */
3597 			ASSERT_ZERO(data, src - start);
3598 			memcpy(data, start, ulint(src - start));
3599 			data += src - start;
3600 			start = src + (DATA_TRX_ID_LEN
3601 				       + DATA_ROLL_PTR_LEN);
3602 
3603 			/* Store trx_id and roll_ptr. */
3604 			constexpr ulint sys_len = DATA_TRX_ID_LEN
3605 				+ DATA_ROLL_PTR_LEN;
3606 			byte* sys = storage - sys_len * (heap_no - 1);
3607 			memcpy(sys, src, sys_len);
3608 			i++; /* skip also roll_ptr */
3609 			mtr->zmemcpy(*block, sys - page_zip->data, sys_len);
3610 		} else if (rec_offs_nth_extern(offsets, i)) {
3611 			src = rec_get_nth_field(rec, offsets,
3612 						i, &len);
3613 
3614 			ut_ad(dict_index_is_clust(index));
3615 			ut_ad(len >= FIELD_REF_SIZE);
3616 			src += len - FIELD_REF_SIZE;
3617 
3618 			ASSERT_ZERO(data, src - start);
3619 			memcpy(data, start, ulint(src - start));
3620 			data += src - start;
3621 			start = src + FIELD_REF_SIZE;
3622 
3623 			/* Store the BLOB pointer. */
3624 			externs -= FIELD_REF_SIZE;
3625 			ut_ad(data < externs);
3626 			memcpy(externs, src, FIELD_REF_SIZE);
3627 			mtr->zmemcpy(*block, externs - page_zip->data,
3628 				     FIELD_REF_SIZE);
3629 		}
3630 	}
3631 
3632 	/* Log the last bytes of the record. */
3633 	len = rec_offs_data_size(offsets) - ulint(start - rec);
3634 
3635 	ASSERT_ZERO(data, len);
3636 	memcpy(data, start, len);
3637 	data += len;
3638 
3639 	return(data);
3640 }
3641 
3642 /** Write an entire record to the ROW_FORMAT=COMPRESSED page.
3643 The data must already have been written to the uncompressed page.
3644 @param[in,out]	block		ROW_FORMAT=COMPRESSED page
3645 @param[in]	rec		record in the uncompressed page
3646 @param[in]	index		the index that the page belongs to
3647 @param[in]	offsets		rec_get_offsets(rec, index)
3648 @param[in]	create		nonzero=insert, zero=update
3649 @param[in,out]	mtr		mini-transaction */
page_zip_write_rec(buf_block_t * block,const byte * rec,const dict_index_t * index,const rec_offs * offsets,ulint create,mtr_t * mtr)3650 void page_zip_write_rec(buf_block_t *block, const byte *rec,
3651                         const dict_index_t *index, const rec_offs *offsets,
3652                         ulint create, mtr_t *mtr)
3653 {
3654 	const page_t* const page = block->frame;
3655 	page_zip_des_t* const page_zip = &block->page.zip;
3656 	byte*		data;
3657 	byte*		storage;
3658 	ulint		heap_no;
3659 	byte*		slot;
3660 
3661 	ut_ad(page_zip_simple_validate(page_zip));
3662 	ut_ad(page_zip_get_size(page_zip)
3663 	      > PAGE_DATA + page_zip_dir_size(page_zip));
3664 	ut_ad(rec_offs_comp(offsets));
3665 	ut_ad(rec_offs_validate(rec, index, offsets));
3666 
3667 	ut_ad(page_zip->m_start >= PAGE_DATA);
3668 
3669 	ut_ad(page_zip_header_cmp(page_zip, page));
3670 	ut_ad(page_simple_validate_new((page_t*) page));
3671 
3672 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3673 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
3674 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
3675 			  rec_offs_extra_size(offsets));
3676 
3677 	slot = page_zip_dir_find(page_zip, page_offset(rec));
3678 	ut_a(slot);
3679 	byte s = *slot;
3680 	/* Copy the delete mark. */
3681 	if (rec_get_deleted_flag(rec, TRUE)) {
3682 		/* In delete-marked records, DB_TRX_ID must
3683 		always refer to an existing undo log record.
3684 		On non-leaf pages, the delete-mark flag is garbage. */
3685 		ut_ad(!index->is_primary() || !page_is_leaf(page)
3686 		      || row_get_rec_trx_id(rec, index, offsets));
3687 		s |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
3688 	} else {
3689 		s &= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
3690 	}
3691 
3692 	if (s != *slot) {
3693 		*slot = s;
3694 		mtr->zmemcpy(*block, slot - page_zip->data, 1);
3695 	}
3696 
3697 	ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
3698 	ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + srv_page_size
3699 	      - PAGE_DIR - PAGE_DIR_SLOT_SIZE
3700 	      * page_dir_get_n_slots(page));
3701 
3702 	heap_no = rec_get_heap_no_new(rec);
3703 	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
3704 	ut_ad(heap_no < page_dir_get_n_heap(page));
3705 
3706 	/* Append to the modification log. */
3707 	data = page_zip->data + page_zip->m_end;
3708 	ut_ad(!*data);
3709 
3710 	/* Identify the record by writing its heap number - 1.
3711 	0 is reserved to indicate the end of the modification log. */
3712 
3713 	if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
3714 		*data++ = (byte) (0x80 | (heap_no - 1) >> 7);
3715 		ut_ad(!*data);
3716 	}
3717 	*data++ = (byte) ((heap_no - 1) << 1);
3718 	ut_ad(!*data);
3719 
3720 	{
3721 		const byte*	start	= rec - rec_offs_extra_size(offsets);
3722 		const byte*	b	= rec - REC_N_NEW_EXTRA_BYTES;
3723 
3724 		/* Write the extra bytes backwards, so that
3725 		rec_offs_extra_size() can be easily computed in
3726 		page_zip_apply_log() by invoking
3727 		rec_get_offsets_reverse(). */
3728 
3729 		while (b != start) {
3730 			*data++ = *--b;
3731 			ut_ad(!*data);
3732 		}
3733 	}
3734 
3735 	/* Write the data bytes.  Store the uncompressed bytes separately. */
3736 	storage = page_zip_dir_start(page_zip);
3737 
3738 	if (page_is_leaf(page)) {
3739 		if (dict_index_is_clust(index)) {
3740 			/* Store separately trx_id, roll_ptr and
3741 			the BTR_EXTERN_FIELD_REF of each BLOB column. */
3742 			if (rec_offs_any_extern(offsets)) {
3743 				data = page_zip_write_rec_ext(
3744 					block,
3745 					rec, index, offsets, create,
3746 					index->db_trx_id(), heap_no,
3747 					storage, data, mtr);
3748 			} else {
3749 				/* Locate trx_id and roll_ptr. */
3750 				ulint len;
3751 				const byte*	src
3752 					= rec_get_nth_field(rec, offsets,
3753 							    index->db_trx_id(),
3754 							    &len);
3755 				ut_ad(len == DATA_TRX_ID_LEN);
3756 				ut_ad(src + DATA_TRX_ID_LEN
3757 				      == rec_get_nth_field(
3758 					      rec, offsets,
3759 					      index->db_roll_ptr(), &len));
3760 				ut_ad(len == DATA_ROLL_PTR_LEN);
3761 
3762 				/* Log the preceding fields. */
3763 				ASSERT_ZERO(data, src - rec);
3764 				memcpy(data, rec, ulint(src - rec));
3765 				data += src - rec;
3766 
3767 				/* Store trx_id and roll_ptr. */
3768 				constexpr ulint sys_len
3769 					= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
3770 				byte* sys = storage - sys_len * (heap_no - 1);
3771 				memcpy(sys, src, sys_len);
3772 
3773 				src += sys_len;
3774 				mtr->zmemcpy(*block, sys - page_zip->data,
3775 					     sys_len);
3776 				/* Log the last bytes of the record. */
3777 				len = rec_offs_data_size(offsets)
3778 					- ulint(src - rec);
3779 
3780 				ASSERT_ZERO(data, len);
3781 				memcpy(data, src, len);
3782 				data += len;
3783 			}
3784 		} else {
3785 			/* Leaf page of a secondary index:
3786 			no externally stored columns */
3787 			ut_ad(!rec_offs_any_extern(offsets));
3788 
3789 			/* Log the entire record. */
3790 			ulint len = rec_offs_data_size(offsets);
3791 
3792 			ASSERT_ZERO(data, len);
3793 			memcpy(data, rec, len);
3794 			data += len;
3795 		}
3796 	} else {
3797 		/* This is a node pointer page. */
3798 		/* Non-leaf nodes should not have any externally
3799 		stored columns. */
3800 		ut_ad(!rec_offs_any_extern(offsets));
3801 
3802 		/* Copy the data bytes, except node_ptr. */
3803 		ulint len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
3804 		ut_ad(data + len < storage - REC_NODE_PTR_SIZE
3805 		      * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
3806 		ASSERT_ZERO(data, len);
3807 		memcpy(data, rec, len);
3808 		data += len;
3809 
3810 		/* Copy the node pointer to the uncompressed area. */
3811 		byte* node_ptr = storage - REC_NODE_PTR_SIZE * (heap_no - 1);
3812 		mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, node_ptr,
3813 					       rec + len, REC_NODE_PTR_SIZE);
3814 	}
3815 
3816 	ut_a(!*data);
3817 	ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
3818 	mtr->zmemcpy(*block, page_zip->m_end,
3819 		     data - page_zip->data - page_zip->m_end);
3820 	page_zip->m_end = uint16_t(data - page_zip->data);
3821 	page_zip->m_nonempty = TRUE;
3822 
3823 #ifdef UNIV_ZIP_DEBUG
3824 	ut_a(page_zip_validate(page_zip, page_align(rec), index));
3825 #endif /* UNIV_ZIP_DEBUG */
3826 }
3827 
3828 /**********************************************************************//**
3829 Write a BLOB pointer of a record on the leaf page of a clustered index.
3830 The information must already have been updated on the uncompressed page. */
3831 void
page_zip_write_blob_ptr(buf_block_t * block,const byte * rec,dict_index_t * index,const rec_offs * offsets,ulint n,mtr_t * mtr)3832 page_zip_write_blob_ptr(
3833 /*====================*/
3834 	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
3835 	const byte*	rec,	/*!< in/out: record whose data is being
3836 				written */
3837 	dict_index_t*	index,	/*!< in: index of the page */
3838 	const rec_offs*	offsets,/*!< in: rec_get_offsets(rec, index) */
3839 	ulint		n,	/*!< in: column index */
3840 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3841 {
3842 	const byte*	field;
3843 	byte*		externs;
3844 	const page_t* const page = block->frame;
3845 	page_zip_des_t* const page_zip = &block->page.zip;
3846 	ulint		blob_no;
3847 	ulint		len;
3848 
3849 	ut_ad(page_align(rec) == page);
3850 	ut_ad(index != NULL);
3851 	ut_ad(offsets != NULL);
3852 	ut_ad(page_simple_validate_new((page_t*) page));
3853 	ut_ad(page_zip_simple_validate(page_zip));
3854 	ut_ad(page_zip_get_size(page_zip)
3855 	      > PAGE_DATA + page_zip_dir_size(page_zip));
3856 	ut_ad(rec_offs_comp(offsets));
3857 	ut_ad(rec_offs_validate(rec, NULL, offsets));
3858 	ut_ad(rec_offs_any_extern(offsets));
3859 	ut_ad(rec_offs_nth_extern(offsets, n));
3860 
3861 	ut_ad(page_zip->m_start >= PAGE_DATA);
3862 	ut_ad(page_zip_header_cmp(page_zip, page));
3863 
3864 	ut_ad(page_is_leaf(page));
3865 	ut_ad(dict_index_is_clust(index));
3866 
3867 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3868 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
3869 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
3870 			  rec_offs_extra_size(offsets));
3871 
3872 	blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
3873 		+ rec_get_n_extern_new(rec, index, n);
3874 	ut_a(blob_no < page_zip->n_blobs);
3875 
3876 	externs = page_zip->data + page_zip_get_size(page_zip)
3877 		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
3878 		* PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
3879 
3880 	field = rec_get_nth_field(rec, offsets, n, &len);
3881 
3882 	externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
3883 	field += len - BTR_EXTERN_FIELD_REF_SIZE;
3884 
3885 	mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, externs, field,
3886 				       BTR_EXTERN_FIELD_REF_SIZE);
3887 
3888 #ifdef UNIV_ZIP_DEBUG
3889 	ut_a(page_zip_validate(page_zip, page, index));
3890 #endif /* UNIV_ZIP_DEBUG */
3891 }
3892 
3893 /**********************************************************************//**
3894 Write the node pointer of a record on a non-leaf compressed page. */
3895 void
page_zip_write_node_ptr(buf_block_t * block,byte * rec,ulint size,ulint ptr,mtr_t * mtr)3896 page_zip_write_node_ptr(
3897 /*====================*/
3898 	buf_block_t*	block,	/*!< in/out: compressed page */
3899 	byte*		rec,	/*!< in/out: record */
3900 	ulint		size,	/*!< in: data size of rec */
3901 	ulint		ptr,	/*!< in: node pointer */
3902 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
3903 {
3904 	byte*	field;
3905 	byte*	storage;
3906 	page_zip_des_t* const page_zip = &block->page.zip;
3907 
3908 	ut_d(const page_t* const page = block->frame);
3909 	ut_ad(page_simple_validate_new(page));
3910 	ut_ad(page_zip_simple_validate(page_zip));
3911 	ut_ad(page_zip_get_size(page_zip)
3912 	      > PAGE_DATA + page_zip_dir_size(page_zip));
3913 	ut_ad(page_rec_is_comp(rec));
3914 
3915 	ut_ad(page_zip->m_start >= PAGE_DATA);
3916 	ut_ad(page_zip_header_cmp(page_zip, page));
3917 
3918 	ut_ad(!page_is_leaf(page));
3919 
3920 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3921 	MEM_CHECK_DEFINED(rec, size);
3922 
3923 	storage = page_zip_dir_start(page_zip)
3924 		- (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
3925 	field = rec + size - REC_NODE_PTR_SIZE;
3926 
3927 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
3928 	ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
3929 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
3930 	compile_time_assert(REC_NODE_PTR_SIZE == 4);
3931 	mach_write_to_4(field, ptr);
3932 	mtr->zmemcpy(*block, storage, field, REC_NODE_PTR_SIZE);
3933 }
3934 
3935 /** Write the DB_TRX_ID,DB_ROLL_PTR into a clustered index leaf page record.
3936 @param[in,out]	block		ROW_FORMAT=COMPRESSED page
3937 @param[in,out]	rec		record
3938 @param[in]	offsets		rec_get_offsets(rec, index)
3939 @param[in]	trx_id_field	field number of DB_TRX_ID (number of PK fields)
3940 @param[in]	trx_id		DB_TRX_ID value (transaction identifier)
3941 @param[in]	roll_ptr	DB_ROLL_PTR value (undo log pointer)
3942 @param[in,out]	mtr		mini-transaction */
3943 void
page_zip_write_trx_id_and_roll_ptr(buf_block_t * block,byte * rec,const rec_offs * offsets,ulint trx_id_col,trx_id_t trx_id,roll_ptr_t roll_ptr,mtr_t * mtr)3944 page_zip_write_trx_id_and_roll_ptr(
3945 	buf_block_t*	block,
3946 	byte*		rec,
3947 	const rec_offs*	offsets,
3948 	ulint		trx_id_col,
3949 	trx_id_t	trx_id,
3950 	roll_ptr_t	roll_ptr,
3951 	mtr_t*		mtr)
3952 {
3953 	page_zip_des_t* const page_zip = &block->page.zip;
3954 
3955 	ut_d(const page_t* const page = block->frame);
3956 	ut_ad(page_align(rec) == page);
3957 	ut_ad(page_simple_validate_new(page));
3958 	ut_ad(page_zip_simple_validate(page_zip));
3959 	ut_ad(page_zip_get_size(page_zip)
3960 	      > PAGE_DATA + page_zip_dir_size(page_zip));
3961 	ut_ad(rec_offs_validate(rec, NULL, offsets));
3962 	ut_ad(rec_offs_comp(offsets));
3963 
3964 	ut_ad(page_zip->m_start >= PAGE_DATA);
3965 	ut_ad(page_zip_header_cmp(page_zip, page));
3966 
3967 	ut_ad(page_is_leaf(page));
3968 
3969 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
3970 
3971 	constexpr ulint sys_len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
3972 	const ulint heap_no = rec_get_heap_no_new(rec);
3973 	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
3974 	byte* storage = page_zip_dir_start(page_zip) - (heap_no - 1) * sys_len;
3975 
3976 	compile_time_assert(DATA_TRX_ID + 1 == DATA_ROLL_PTR);
3977 	ulint len;
3978 	byte* field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
3979 	ut_ad(len == DATA_TRX_ID_LEN);
3980 	ut_ad(field + DATA_TRX_ID_LEN
3981 	      == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
3982 	ut_ad(len == DATA_ROLL_PTR_LEN);
3983 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
3984 	ut_a(!memcmp(storage, field, sys_len));
3985 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
3986 	compile_time_assert(DATA_TRX_ID_LEN == 6);
3987 	mach_write_to_6(field, trx_id);
3988 	compile_time_assert(DATA_ROLL_PTR_LEN == 7);
3989 	mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
3990 	len = 0;
3991 	if (heap_no > PAGE_HEAP_NO_USER_LOW) {
3992 		byte* prev = storage + sys_len;
3993 		for (; len < sys_len && prev[len] == field[len]; len++);
3994 		if (len > 4) {
3995 			/* We save space by replacing a single record
3996 
3997 			WRITE,offset(storage),byte[13]
3998 
3999 			with up to two records:
4000 
4001 			MEMMOVE,offset(storage),len(1 byte),+13(1 byte),
4002 			WRITE|0x80,0,byte[13-len]
4003 
4004 			The single WRITE record would be x+13 bytes long (x>2).
4005 			The MEMMOVE record would be x+1+1 = x+2 bytes, and
4006 			the second WRITE would be 1+1+13-len = 15-len bytes.
4007 
4008 			The total size is: x+13 versus x+2+15-len = x+17-len.
4009 			To save space, we must have len>4. */
4010 			memcpy(storage, prev, len);
4011 			mtr->memmove(*block, ulint(storage - page_zip->data),
4012 				     ulint(storage - page_zip->data) + sys_len,
4013 				     len);
4014 			storage += len;
4015 			field += len;
4016 			if (UNIV_LIKELY(len < sys_len)) {
4017 				goto write;
4018 			}
4019 		} else {
4020 			len = 0;
4021 			goto write;
4022 		}
4023 	} else {
4024 write:
4025                 mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, storage, field,
4026 					       sys_len - len);
4027 	}
4028 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
4029 	ut_a(!memcmp(storage - len, field - len, sys_len));
4030 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
4031 
4032 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
4033 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
4034 			  rec_offs_extra_size(offsets));
4035 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4036 }
4037 
4038 /**********************************************************************//**
4039 Clear an area on the uncompressed and compressed page.
4040 Do not clear the data payload, as that would grow the modification log. */
4041 static
4042 void
page_zip_clear_rec(buf_block_t * block,byte * rec,const dict_index_t * index,const rec_offs * offsets,mtr_t * mtr)4043 page_zip_clear_rec(
4044 /*===============*/
4045 	buf_block_t*	block,		/*!< in/out: compressed page */
4046 	byte*		rec,		/*!< in: record to clear */
4047 	const dict_index_t*	index,	/*!< in: index of rec */
4048 	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index) */
4049 	mtr_t*		mtr)		/*!< in/out: mini-transaction */
4050 {
4051 	ulint	heap_no;
4052 	byte*	storage;
4053 	byte*	field;
4054 	ulint	len;
4055 
4056 	ut_ad(page_align(rec) == block->frame);
4057 	page_zip_des_t* const page_zip = &block->page.zip;
4058 
4059 	/* page_zip_validate() would fail here if a record
4060 	containing externally stored columns is being deleted. */
4061 	ut_ad(rec_offs_validate(rec, index, offsets));
4062 	ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
4063 	ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
4064 	ut_ad(page_zip_header_cmp(page_zip, block->frame));
4065 
4066 	heap_no = rec_get_heap_no_new(rec);
4067 	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
4068 
4069 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4070 	MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
4071 	MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
4072 			  rec_offs_extra_size(offsets));
4073 
4074 	if (!page_is_leaf(block->frame)) {
4075 		/* Clear node_ptr. On the compressed page,
4076 		there is an array of node_ptr immediately before the
4077 		dense page directory, at the very end of the page. */
4078 		storage	= page_zip_dir_start(page_zip);
4079 		ut_ad(dict_index_get_n_unique_in_tree_nonleaf(index) ==
4080 		      rec_offs_n_fields(offsets) - 1);
4081 		field	= rec_get_nth_field(rec, offsets,
4082 					    rec_offs_n_fields(offsets) - 1,
4083 					    &len);
4084 		ut_ad(len == REC_NODE_PTR_SIZE);
4085 		ut_ad(!rec_offs_any_extern(offsets));
4086 		memset(field, 0, REC_NODE_PTR_SIZE);
4087 		storage -= (heap_no - 1) * REC_NODE_PTR_SIZE;
4088 		len = REC_NODE_PTR_SIZE;
4089 clear_page_zip:
4090 		memset(storage, 0, len);
4091 		mtr->memset(*block, storage - page_zip->data, len, 0);
4092 	} else if (index->is_clust()) {
4093 		/* Clear trx_id and roll_ptr. On the compressed page,
4094 		there is an array of these fields immediately before the
4095 		dense page directory, at the very end of the page. */
4096 		const ulint	trx_id_pos
4097 			= dict_col_get_clust_pos(
4098 			dict_table_get_sys_col(
4099 				index->table, DATA_TRX_ID), index);
4100 		field	= rec_get_nth_field(rec, offsets, trx_id_pos, &len);
4101 		ut_ad(len == DATA_TRX_ID_LEN);
4102 		memset(field, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
4103 
4104 		if (rec_offs_any_extern(offsets)) {
4105 			ulint	i;
4106 
4107 			for (i = rec_offs_n_fields(offsets); i--; ) {
4108 				/* Clear all BLOB pointers in order to make
4109 				page_zip_validate() pass. */
4110 				if (rec_offs_nth_extern(offsets, i)) {
4111 					field = rec_get_nth_field(
4112 						rec, offsets, i, &len);
4113 					ut_ad(len
4114 					      == BTR_EXTERN_FIELD_REF_SIZE);
4115 					memset(field + len
4116 					       - BTR_EXTERN_FIELD_REF_SIZE,
4117 					       0, BTR_EXTERN_FIELD_REF_SIZE);
4118 				}
4119 			}
4120 		}
4121 
4122 		len = DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
4123 		storage = page_zip_dir_start(page_zip)
4124 			- (heap_no - 1)
4125 			* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
4126 		goto clear_page_zip;
4127 	} else {
4128 		ut_ad(!rec_offs_any_extern(offsets));
4129 	}
4130 }
4131 
4132 /** Modify the delete-mark flag of a ROW_FORMAT=COMPRESSED record.
4133 @param[in,out]  block   buffer block
4134 @param[in,out]  rec     record on a physical index page
4135 @param[in]      flag    the value of the delete-mark flag
4136 @param[in,out]  mtr     mini-transaction  */
page_zip_rec_set_deleted(buf_block_t * block,rec_t * rec,bool flag,mtr_t * mtr)4137 void page_zip_rec_set_deleted(buf_block_t *block, rec_t *rec, bool flag,
4138                               mtr_t *mtr)
4139 {
4140   ut_ad(page_align(rec) == block->frame);
4141   byte *slot= page_zip_dir_find(&block->page.zip, page_offset(rec));
4142   byte b= *slot;
4143   if (flag)
4144     b|= (PAGE_ZIP_DIR_SLOT_DEL >> 8);
4145   else
4146     b&= byte(~(PAGE_ZIP_DIR_SLOT_DEL >> 8));
4147   mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
4148 #ifdef UNIV_ZIP_DEBUG
4149   ut_a(page_zip_validate(&block->page.zip, block->frame, nullptr));
4150 #endif /* UNIV_ZIP_DEBUG */
4151 }
4152 
4153 /**********************************************************************//**
4154 Write the "owned" flag of a record on a compressed page.  The n_owned field
4155 must already have been written on the uncompressed page. */
4156 void
page_zip_rec_set_owned(buf_block_t * block,const byte * rec,ulint flag,mtr_t * mtr)4157 page_zip_rec_set_owned(
4158 /*===================*/
4159 	buf_block_t*	block,	/*!< in/out: ROW_FORMAT=COMPRESSED page */
4160 	const byte*	rec,	/*!< in: record on the uncompressed page */
4161 	ulint		flag,	/*!< in: the owned flag (nonzero=TRUE) */
4162 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4163 {
4164   ut_ad(page_align(rec) == block->frame);
4165   page_zip_des_t *const page_zip= &block->page.zip;
4166   byte *slot= page_zip_dir_find(page_zip, page_offset(rec));
4167   MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4168   byte b= *slot;
4169   if (flag)
4170     b|= (PAGE_ZIP_DIR_SLOT_OWNED >> 8);
4171   else
4172     b&= byte(~(PAGE_ZIP_DIR_SLOT_OWNED >> 8));
4173   mtr->zmemcpy<mtr_t::MAYBE_NOP>(*block, slot, &b, 1);
4174 }
4175 
4176 /**********************************************************************//**
4177 Insert a record to the dense page directory. */
4178 void
page_zip_dir_insert(page_cur_t * cursor,uint16_t free_rec,byte * rec,mtr_t * mtr)4179 page_zip_dir_insert(
4180 /*================*/
4181 	page_cur_t*	cursor,	/*!< in/out: page cursor */
4182 	uint16_t	free_rec,/*!< in: record from which rec was
4183 				allocated, or 0 */
4184 	byte*		rec,	/*!< in: record to insert */
4185 	mtr_t*		mtr)	/*!< in/out: mini-transaction */
4186 {
4187 	ut_ad(page_align(cursor->rec) == cursor->block->frame);
4188 	ut_ad(page_align(rec) == cursor->block->frame);
4189 	page_zip_des_t *const page_zip= &cursor->block->page.zip;
4190 
4191 	ulint	n_dense;
4192 	byte*	slot_rec;
4193 	byte*	slot_free;
4194 
4195 	ut_ad(cursor->rec != rec);
4196 	ut_ad(page_rec_get_next_const(cursor->rec) == rec);
4197 	ut_ad(page_zip_simple_validate(page_zip));
4198 
4199 	MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4200 
4201 	if (page_rec_is_infimum(cursor->rec)) {
4202 		/* Use the first slot. */
4203 		slot_rec = page_zip->data + page_zip_get_size(page_zip);
4204 	} else {
4205 		byte*	end	= page_zip->data + page_zip_get_size(page_zip);
4206 		byte*	start	= end - page_zip_dir_user_size(page_zip);
4207 
4208 		if (UNIV_LIKELY(!free_rec)) {
4209 			/* PAGE_N_RECS was already incremented
4210 			in page_cur_insert_rec_zip(), but the
4211 			dense directory slot at that position
4212 			contains garbage.  Skip it. */
4213 			start += PAGE_ZIP_DIR_SLOT_SIZE;
4214 		}
4215 
4216 		slot_rec = page_zip_dir_find_low(start, end,
4217 						 page_offset(cursor->rec));
4218 		ut_a(slot_rec);
4219 	}
4220 
4221 	/* Read the old n_dense (n_heap may have been incremented). */
4222 	n_dense = page_dir_get_n_heap(page_zip->data)
4223 		- (PAGE_HEAP_NO_USER_LOW + 1U);
4224 
4225 	if (UNIV_UNLIKELY(free_rec)) {
4226 		/* The record was allocated from the free list.
4227 		Shift the dense directory only up to that slot.
4228 		Note that in this case, n_dense is actually
4229 		off by one, because page_cur_insert_rec_zip()
4230 		did not increment n_heap. */
4231 		ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
4232 		      + PAGE_HEAP_NO_USER_LOW);
4233 		ut_ad(page_offset(rec) >= free_rec);
4234 		slot_free = page_zip_dir_find(page_zip, free_rec);
4235 		ut_ad(slot_free);
4236 		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
4237 	} else {
4238 		/* The record was allocated from the heap.
4239 		Shift the entire dense directory. */
4240 		ut_ad(rec_get_heap_no_new(rec) == n_dense
4241 		      + PAGE_HEAP_NO_USER_LOW);
4242 
4243 		/* Shift to the end of the dense page directory. */
4244 		slot_free = page_zip->data + page_zip_get_size(page_zip)
4245 			- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
4246 	}
4247 
4248 	if (const ulint slot_len = ulint(slot_rec - slot_free)) {
4249 		/* Shift the dense directory to allocate place for rec. */
4250 		memmove_aligned<2>(slot_free - PAGE_ZIP_DIR_SLOT_SIZE,
4251 				   slot_free, slot_len);
4252 		mtr->memmove(*cursor->block, (slot_free - page_zip->data)
4253 			     - PAGE_ZIP_DIR_SLOT_SIZE,
4254 			     slot_free - page_zip->data, slot_len);
4255 	}
4256 
4257 	/* Write the entry for the inserted record.
4258 	The "owned" flag must be zero. */
4259 	uint16_t offs = page_offset(rec);
4260 	if (rec_get_deleted_flag(rec, true)) {
4261 		offs |= PAGE_ZIP_DIR_SLOT_DEL;
4262 	}
4263 
4264 	mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, offs);
4265 	mtr->zmemcpy(*cursor->block, slot_rec - page_zip->data
4266 		     - PAGE_ZIP_DIR_SLOT_SIZE, PAGE_ZIP_DIR_SLOT_SIZE);
4267 }
4268 
4269 /** Shift the dense page directory and the array of BLOB pointers
4270 when a record is deleted.
4271 @param[in,out]  block   index page
4272 @param[in,out]  rec     record being deleted
4273 @param[in]      index   the index that the page belongs to
4274 @param[in]      offsets rec_get_offsets(rec, index)
4275 @param[in]      free    previous start of the free list
4276 @param[in,out]  mtr     mini-transaction */
page_zip_dir_delete(buf_block_t * block,byte * rec,const dict_index_t * index,const rec_offs * offsets,const byte * free,mtr_t * mtr)4277 void page_zip_dir_delete(buf_block_t *block, byte *rec,
4278                          const dict_index_t *index, const rec_offs *offsets,
4279                          const byte *free, mtr_t *mtr)
4280 {
4281   ut_ad(page_align(rec) == block->frame);
4282   page_zip_des_t *const page_zip= &block->page.zip;
4283 
4284   ut_ad(rec_offs_validate(rec, index, offsets));
4285   ut_ad(rec_offs_comp(offsets));
4286 
4287   MEM_CHECK_DEFINED(page_zip->data, page_zip_get_size(page_zip));
4288   MEM_CHECK_DEFINED(rec, rec_offs_data_size(offsets));
4289   MEM_CHECK_DEFINED(rec - rec_offs_extra_size(offsets),
4290 		    rec_offs_extra_size(offsets));
4291 
4292   mach_write_to_2(rec - REC_NEXT,
4293                   free ? static_cast<uint16_t>(free - rec) : 0);
4294   byte *page_free= my_assume_aligned<2>(PAGE_FREE + PAGE_HEADER +
4295                                         block->frame);
4296   mtr->write<2>(*block, page_free, page_offset(rec));
4297   byte *garbage= my_assume_aligned<2>(PAGE_GARBAGE + PAGE_HEADER +
4298                                       block->frame);
4299   mtr->write<2>(*block, garbage, rec_offs_size(offsets) +
4300                 mach_read_from_2(garbage));
4301   compile_time_assert(PAGE_GARBAGE == PAGE_FREE + 2);
4302   memcpy_aligned<4>(PAGE_FREE + PAGE_HEADER + page_zip->data, page_free, 4);
4303   byte *slot_rec= page_zip_dir_find(page_zip, page_offset(rec));
4304   ut_a(slot_rec);
4305   uint16_t n_recs= page_get_n_recs(block->frame);
4306   ut_ad(n_recs);
4307   ut_ad(n_recs > 1 || page_get_page_no(block->frame) == index->page);
4308   /* This could not be done before page_zip_dir_find(). */
4309   byte *page_n_recs= my_assume_aligned<2>(PAGE_N_RECS + PAGE_HEADER +
4310                                           block->frame);
4311   mtr->write<2>(*block, page_n_recs, n_recs - 1U);
4312   memcpy_aligned<2>(PAGE_N_RECS + PAGE_HEADER + page_zip->data, page_n_recs,
4313                     2);
4314 
4315   byte *slot_free;
4316 
4317   if (UNIV_UNLIKELY(!free))
4318     /* Make the last slot the start of the free list. */
4319     slot_free= page_zip->data + page_zip_get_size(page_zip) -
4320       PAGE_ZIP_DIR_SLOT_SIZE * (page_dir_get_n_heap(page_zip->data) -
4321                                 PAGE_HEAP_NO_USER_LOW);
4322   else
4323   {
4324     slot_free= page_zip_dir_find_free(page_zip, page_offset(free));
4325     ut_a(slot_free < slot_rec);
4326     /* Grow the free list by one slot by moving the start. */
4327     slot_free+= PAGE_ZIP_DIR_SLOT_SIZE;
4328   }
4329 
4330   const ulint slot_len= slot_rec > slot_free ? ulint(slot_rec - slot_free) : 0;
4331   if (slot_len)
4332   {
4333     memmove_aligned<2>(slot_free + PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
4334                        slot_len);
4335     mtr->memmove(*block, (slot_free - page_zip->data) + PAGE_ZIP_DIR_SLOT_SIZE,
4336                  slot_free - page_zip->data, slot_len);
4337   }
4338 
4339   /* Write the entry for the deleted record.
4340   The "owned" and "deleted" flags will be cleared. */
4341   mach_write_to_2(slot_free, page_offset(rec));
4342   mtr->zmemcpy(*block, slot_free - page_zip->data, 2);
4343 
4344   if (const ulint n_ext= rec_offs_n_extern(offsets))
4345   {
4346     ut_ad(index->is_primary());
4347     ut_ad(page_is_leaf(block->frame));
4348 
4349     /* Shift and zero fill the array of BLOB pointers. */
4350     ulint blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
4351     ut_a(blob_no + n_ext <= page_zip->n_blobs);
4352 
4353     byte *externs= page_zip->data + page_zip_get_size(page_zip) -
4354       (page_dir_get_n_heap(block->frame) - PAGE_HEAP_NO_USER_LOW) *
4355       PAGE_ZIP_CLUST_LEAF_SLOT_SIZE;
4356     byte *ext_end= externs - page_zip->n_blobs * FIELD_REF_SIZE;
4357 
4358     /* Shift and zero fill the array. */
4359     if (const ulint ext_len= ulint(page_zip->n_blobs - n_ext - blob_no) *
4360         BTR_EXTERN_FIELD_REF_SIZE)
4361     {
4362       memmove(ext_end + n_ext * FIELD_REF_SIZE, ext_end, ext_len);
4363       mtr->memmove(*block, (ext_end - page_zip->data) + n_ext * FIELD_REF_SIZE,
4364                    ext_end - page_zip->data, ext_len);
4365     }
4366     memset(ext_end, 0, n_ext * FIELD_REF_SIZE);
4367     mtr->memset(*block, ext_end - page_zip->data, n_ext * FIELD_REF_SIZE, 0);
4368     page_zip->n_blobs = (page_zip->n_blobs - n_ext) & ((1U << 12) - 1);
4369   }
4370 
4371   /* The compression algorithm expects info_bits and n_owned
4372   to be 0 for deleted records. */
4373   rec[-REC_N_NEW_EXTRA_BYTES]= 0; /* info_bits and n_owned */
4374 
4375   page_zip_clear_rec(block, rec, index, offsets, mtr);
4376 }
4377 
4378 /**********************************************************************//**
4379 Reorganize and compress a page.  This is a low-level operation for
4380 compressed pages, to be used when page_zip_compress() fails.
4381 On success, redo log will be written.
4382 The function btr_page_reorganize() should be preferred whenever possible.
4383 IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
4384 non-clustered index, the caller must update the insert buffer free
4385 bits in the same mini-transaction in such a way that the modification
4386 will be redo-logged.
4387 @retval true on success
4388 @retval false on failure; the block will be left intact */
4389 bool
page_zip_reorganize(buf_block_t * block,dict_index_t * index,ulint z_level,mtr_t * mtr,bool restore)4390 page_zip_reorganize(
4391 	buf_block_t*	block,	/*!< in/out: page with compressed page;
4392 				on the compressed page, in: size;
4393 				out: data, n_blobs,
4394 				m_start, m_end, m_nonempty */
4395 	dict_index_t*	index,	/*!< in: index of the B-tree node */
4396 	ulint		z_level,/*!< in: compression level */
4397 	mtr_t*		mtr,	/*!< in: mini-transaction */
4398 	bool		restore)/*!< whether to restore on failure */
4399 {
4400 	page_t*		page		= buf_block_get_frame(block);
4401 	buf_block_t*	temp_block;
4402 	page_t*		temp_page;
4403 
4404 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4405 	ut_ad(block->page.zip.data);
4406 	ut_ad(page_is_comp(page));
4407 	ut_ad(!dict_index_is_ibuf(index));
4408 	ut_ad(!index->table->is_temporary());
4409 	/* Note that page_zip_validate(page_zip, page, index) may fail here. */
4410 	MEM_CHECK_DEFINED(page, srv_page_size);
4411 	MEM_CHECK_DEFINED(buf_block_get_page_zip(block)->data,
4412 			  page_zip_get_size(buf_block_get_page_zip(block)));
4413 
4414 	/* Disable logging */
4415 	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
4416 
4417 	temp_block = buf_block_alloc();
4418 	btr_search_drop_page_hash_index(block);
4419 	temp_page = temp_block->frame;
4420 
4421 	/* Copy the old page to temporary space */
4422 	memcpy_aligned<UNIV_PAGE_SIZE_MIN>(temp_block->frame, block->frame,
4423 					   srv_page_size);
4424 
4425 	/* Recreate the page: note that global data on page (possible
4426 	segment headers, next page-field, etc.) is preserved intact */
4427 
4428 	page_create(block, mtr, true);
4429 	if (index->is_spatial()) {
4430 		mach_write_to_2(FIL_PAGE_TYPE + page, FIL_PAGE_RTREE);
4431 		memcpy_aligned<2>(block->page.zip.data + FIL_PAGE_TYPE,
4432 				  page + FIL_PAGE_TYPE, 2);
4433 		memset(FIL_RTREE_SPLIT_SEQ_NUM + page, 0, 8);
4434 		memset(FIL_RTREE_SPLIT_SEQ_NUM + block->page.zip.data, 0, 8);
4435 	}
4436 
4437 	/* Copy the records from the temporary space to the recreated page;
4438 	do not copy the lock bits yet */
4439 
4440 	page_copy_rec_list_end_no_locks(block, temp_block,
4441 					page_get_infimum_rec(temp_page),
4442 					index, mtr);
4443 
4444 	/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
4445 	memcpy_aligned<8>(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
4446 			  temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
4447 	/* PAGE_MAX_TRX_ID must be set on secondary index leaf pages. */
4448 	ut_ad(dict_index_is_clust(index) || !page_is_leaf(temp_page)
4449 	      || page_get_max_trx_id(page) != 0);
4450 	/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
4451 	clustered index root pages. */
4452 	ut_ad(page_get_max_trx_id(page) == 0
4453 	      || (dict_index_is_clust(index)
4454 		  ? !page_has_siblings(temp_page)
4455 		  : page_is_leaf(temp_page)));
4456 
4457 	/* Restore logging. */
4458 	mtr_set_log_mode(mtr, log_mode);
4459 
4460 	if (!page_zip_compress(block, index, z_level, mtr)) {
4461 		if (restore) {
4462 			/* Restore the old page and exit. */
4463 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
4464 			/* Check that the bytes that we skip are identical. */
4465 			ut_a(!memcmp(page, temp_page, PAGE_HEADER));
4466 			ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
4467 				     PAGE_HEADER + PAGE_N_RECS + temp_page,
4468 				     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
4469 			ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
4470 				     srv_page_size - FIL_PAGE_DATA_END
4471 				     + temp_page,
4472 				     FIL_PAGE_DATA_END));
4473 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
4474 
4475 			memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
4476 			       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
4477 			memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
4478 			       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);
4479 
4480 #if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
4481 			ut_a(!memcmp(page, temp_page, srv_page_size));
4482 #endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
4483 		}
4484 
4485 		buf_block_free(temp_block);
4486 		return false;
4487 	}
4488 
4489 	lock_move_reorganize_page(block, temp_block);
4490 
4491 	buf_block_free(temp_block);
4492 	return true;
4493 }
4494 
4495 /**********************************************************************//**
4496 Copy the records of a page byte for byte.  Do not copy the page header
4497 or trailer, except those B-tree header fields that are directly
4498 related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
4499 NOTE: The caller must update the lock table and the adaptive hash index. */
4500 void
page_zip_copy_recs(buf_block_t * block,const page_zip_des_t * src_zip,const page_t * src,dict_index_t * index,mtr_t * mtr)4501 page_zip_copy_recs(
4502 	buf_block_t*		block,		/*!< in/out: buffer block */
4503 	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
4504 	const page_t*		src,		/*!< in: page */
4505 	dict_index_t*		index,		/*!< in: index of the B-tree */
4506 	mtr_t*			mtr)		/*!< in: mini-transaction */
4507 {
4508 	page_t* page = block->frame;
4509 	page_zip_des_t* page_zip = &block->page.zip;
4510 
4511 	ut_ad(mtr->memo_contains_flagged(block, MTR_MEMO_PAGE_X_FIX));
4512 	ut_ad(mtr->memo_contains_page_flagged(src, MTR_MEMO_PAGE_X_FIX));
4513 	ut_ad(!dict_index_is_ibuf(index));
4514 	ut_ad(!index->table->is_temporary());
4515 #ifdef UNIV_ZIP_DEBUG
4516 	/* The B-tree operations that call this function may set
4517 	FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
4518 	mismatch.  A strict page_zip_validate() will be executed later
4519 	during the B-tree operations. */
4520 	ut_a(page_zip_validate_low(src_zip, src, index, TRUE));
4521 #endif /* UNIV_ZIP_DEBUG */
4522 	ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
4523 	if (UNIV_UNLIKELY(src_zip->n_blobs)) {
4524 		ut_a(page_is_leaf(src));
4525 		ut_a(dict_index_is_clust(index));
4526 	}
4527 
4528 	MEM_CHECK_ADDRESSABLE(page, srv_page_size);
4529 	MEM_CHECK_ADDRESSABLE(page_zip->data, page_zip_get_size(page_zip));
4530 	MEM_CHECK_DEFINED(src, srv_page_size);
4531 	MEM_CHECK_DEFINED(src_zip->data, page_zip_get_size(page_zip));
4532 
4533 	/* Copy those B-tree page header fields that are related to
4534 	the records stored in the page.  Also copy the field
4535 	PAGE_MAX_TRX_ID.  Skip the rest of the page header and
4536 	trailer.  On the compressed page, there is no trailer. */
4537 	compile_time_assert(PAGE_MAX_TRX_ID + 8 == PAGE_HEADER_PRIV_END);
4538 	memcpy_aligned<2>(PAGE_HEADER + page, PAGE_HEADER + src,
4539 			  PAGE_HEADER_PRIV_END);
4540 	memcpy_aligned<2>(PAGE_DATA + page, PAGE_DATA + src,
4541 			  srv_page_size - (PAGE_DATA + FIL_PAGE_DATA_END));
4542 	memcpy_aligned<2>(PAGE_HEADER + page_zip->data,
4543 			  PAGE_HEADER + src_zip->data,
4544 			  PAGE_HEADER_PRIV_END);
4545 	memcpy_aligned<2>(PAGE_DATA + page_zip->data,
4546 			  PAGE_DATA + src_zip->data,
4547 			  page_zip_get_size(page_zip) - PAGE_DATA);
4548 
4549 	if (dict_index_is_clust(index)) {
4550 		/* Reset the PAGE_ROOT_AUTO_INC field when copying
4551 		from a root page. */
4552 		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
4553 				  + page, 0, 8);
4554 		memset_aligned<8>(PAGE_HEADER + PAGE_ROOT_AUTO_INC
4555 				  + page_zip->data, 0, 8);
4556 	} else {
4557 		/* The PAGE_MAX_TRX_ID must be nonzero on leaf pages
4558 		of secondary indexes, and 0 on others. */
4559 		ut_ad(!page_is_leaf(src) == !page_get_max_trx_id(src));
4560 	}
4561 
4562 	/* Copy all fields of src_zip to page_zip, except the pointer
4563 	to the compressed data page. */
4564 	{
4565 		page_zip_t*	data = page_zip->data;
4566 		memcpy(page_zip, src_zip, sizeof *page_zip);
4567 		page_zip->data = data;
4568 	}
4569 	ut_ad(page_zip_get_trailer_len(page_zip, dict_index_is_clust(index))
4570 	      + page_zip->m_end < page_zip_get_size(page_zip));
4571 
4572 	if (!page_is_leaf(src)
4573 	    && UNIV_UNLIKELY(!page_has_prev(src))
4574 	    && UNIV_LIKELY(page_has_prev(page))) {
4575 		/* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
4576 		ulint	offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
4577 						 TRUE);
4578 		if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
4579 			rec_t*	rec = page + offs;
4580 			ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
4581 			     & REC_INFO_MIN_REC_FLAG);
4582 			rec[-REC_N_NEW_EXTRA_BYTES]
4583 				&= byte(~REC_INFO_MIN_REC_FLAG);
4584 		}
4585 	}
4586 
4587 #ifdef UNIV_ZIP_DEBUG
4588 	ut_a(page_zip_validate(page_zip, page, index));
4589 #endif /* UNIV_ZIP_DEBUG */
4590 	page_zip_compress_write_log(block, index, mtr);
4591 }
4592 #endif /* !UNIV_INNOCHECKSUM */
4593 
4594 /** Calculate the compressed page checksum.
4595 @param[in]	data			compressed page
4596 @param[in]	size			size of compressed page
4597 @param[in]	algo			algorithm to use
4598 @return page checksum */
4599 uint32_t
page_zip_calc_checksum(const void * data,ulint size,srv_checksum_algorithm_t algo)4600 page_zip_calc_checksum(
4601 	const void*			data,
4602 	ulint				size,
4603 	srv_checksum_algorithm_t	algo)
4604 {
4605 	uLong		adler;
4606 	const Bytef*	s = static_cast<const byte*>(data);
4607 
4608 	/* Exclude FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN,
4609 	and FIL_PAGE_FILE_FLUSH_LSN from the checksum. */
4610 
4611 	switch (algo) {
4612 	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
4613 	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
4614 	case SRV_CHECKSUM_ALGORITHM_CRC32:
4615 	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
4616 		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4617 		return ut_crc32(s + FIL_PAGE_OFFSET,
4618 				FIL_PAGE_LSN - FIL_PAGE_OFFSET)
4619 			^ ut_crc32(s + FIL_PAGE_TYPE, 2)
4620 			^ ut_crc32(s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
4621 				   size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4622 	case SRV_CHECKSUM_ALGORITHM_INNODB:
4623 	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
4624 		ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4625 
4626 		adler = adler32(0L, s + FIL_PAGE_OFFSET,
4627 				FIL_PAGE_LSN - FIL_PAGE_OFFSET);
4628 		adler = adler32(adler, s + FIL_PAGE_TYPE, 2);
4629 		adler = adler32(
4630 			adler, s + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
4631 			static_cast<uInt>(size)
4632 			- FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
4633 
4634 		return(uint32_t(adler));
4635 	case SRV_CHECKSUM_ALGORITHM_NONE:
4636 	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
4637 		return(BUF_NO_CHECKSUM_MAGIC);
4638 	/* no default so the compiler will emit a warning if new enum
4639 	is added and not handled here */
4640 	}
4641 
4642 	ut_error;
4643 	return(0);
4644 }
4645 
4646 /** Validate the checksum on a ROW_FORMAT=COMPRESSED page.
4647 @param data    ROW_FORMAT=COMPRESSED page
4648 @param size    size of the page, in bytes
4649 @return whether the stored checksum matches innodb_checksum_algorithm */
page_zip_verify_checksum(const byte * data,size_t size)4650 bool page_zip_verify_checksum(const byte *data, size_t size)
4651 {
4652 	const srv_checksum_algorithm_t	curr_algo =
4653 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
4654 
4655 	if (curr_algo == SRV_CHECKSUM_ALGORITHM_NONE) {
4656 		return true;
4657 	}
4658 
4659 	if (buf_is_zeroes(span<const byte>(data, size))) {
4660 		return true;
4661 	}
4662 
4663 	const uint32_t stored = mach_read_from_4(
4664 		data + FIL_PAGE_SPACE_OR_CHKSUM);
4665 
4666 	uint32_t calc = page_zip_calc_checksum(data, size, curr_algo);
4667 
4668 #ifdef UNIV_INNOCHECKSUM
4669 	if (log_file) {
4670 		fprintf(log_file, "page::" UINT32PF ";"
4671 			" %s checksum: calculated = " UINT32PF ";"
4672 			" recorded = " UINT32PF "\n", cur_page_num,
4673 			buf_checksum_algorithm_name(
4674 				static_cast<srv_checksum_algorithm_t>(
4675 				srv_checksum_algorithm)),
4676 			calc, stored);
4677 	}
4678 
4679 	if (!strict_verify) {
4680 		const uint32_t	crc32 = page_zip_calc_checksum(
4681 			data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
4682 
4683 		if (log_file) {
4684 			fprintf(log_file, "page::" UINT32PF ": crc32 checksum:"
4685 				" calculated = " UINT32PF "; recorded = " UINT32PF "\n",
4686 				cur_page_num, crc32, stored);
4687 			fprintf(log_file, "page::" UINT32PF ": none checksum:"
4688 				" calculated = %lu; recorded = " UINT32PF "\n",
4689 				cur_page_num, BUF_NO_CHECKSUM_MAGIC, stored);
4690 		}
4691 	}
4692 #endif /* UNIV_INNOCHECKSUM */
4693 
4694 	if (stored == calc) {
4695 		return(TRUE);
4696 	}
4697 
4698 	switch (curr_algo) {
4699 	case SRV_CHECKSUM_ALGORITHM_STRICT_FULL_CRC32:
4700 	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
4701 	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
4702 	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
4703 		return FALSE;
4704 	case SRV_CHECKSUM_ALGORITHM_FULL_CRC32:
4705 	case SRV_CHECKSUM_ALGORITHM_CRC32:
4706 		if (stored == BUF_NO_CHECKSUM_MAGIC) {
4707 			return(TRUE);
4708 		}
4709 
4710 		return stored == page_zip_calc_checksum(
4711 			data, size, SRV_CHECKSUM_ALGORITHM_INNODB);
4712 	case SRV_CHECKSUM_ALGORITHM_INNODB:
4713 		if (stored == BUF_NO_CHECKSUM_MAGIC) {
4714 			return TRUE;
4715 		}
4716 
4717 		return stored == page_zip_calc_checksum(
4718 			data, size, SRV_CHECKSUM_ALGORITHM_CRC32);
4719 	case SRV_CHECKSUM_ALGORITHM_NONE:
4720 		return TRUE;
4721 	}
4722 
4723 	return FALSE;
4724 }
4725