1/*****************************************************************************
2
3Copyright (c) 2005, 2018, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2012, Facebook Inc.
5
6This program is free software; you can redistribute it and/or modify
7it under the terms of the GNU General Public License, version 2.0,
8as published by the Free Software Foundation.
9
10This program is also distributed with certain software (including
11but not limited to OpenSSL) that is licensed under separate terms,
12as designated in a particular file or component or in included license
13documentation.  The authors of MySQL hereby grant you an additional
14permission to link the program and your derivative works with the
15separately licensed software that they have included with MySQL.
16
17This program is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20GNU General Public License, version 2.0, for more details.
21
22You should have received a copy of the GNU General Public License along with
23this program; if not, write to the Free Software Foundation, Inc.,
2451 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
25
26*****************************************************************************/
27
28/**************************************************//**
29@file include/page0zip.ic
30Compressed page interface
31
32Created June 2005 by Marko Makela
33*******************************************************/
34
35#ifdef UNIV_MATERIALIZE
36# undef UNIV_INLINE
37# define UNIV_INLINE
38#endif
39
40#include "page0zip.h"
41#include "mtr0log.h"
42#include "page0page.h"
43
44/* The format of compressed pages is as follows.
45
46The header and trailer of the uncompressed pages, excluding the page
47directory in the trailer, are copied as is to the header and trailer
48of the compressed page.
49
50At the end of the compressed page, there is a dense page directory
51pointing to every user record contained on the page, including deleted
52records on the free list.  The dense directory is indexed in the
53collation order, i.e., in the order in which the record list is
54linked on the uncompressed page.  The infimum and supremum records are
55excluded.  The two most significant bits of the entries are allocated
56for the delete-mark and an n_owned flag indicating the last record in
57a chain of records pointed to from the sparse page directory on the
58uncompressed page.
59
60The data between PAGE_ZIP_START and the last page directory entry will
61be written in compressed format, starting at offset PAGE_DATA.
62Infimum and supremum records are not stored.  We exclude the
63REC_N_NEW_EXTRA_BYTES in every record header.  These can be recovered
64from the dense page directory stored at the end of the compressed
65page.
66
67The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
68roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
69externally stored columns are stored separately, in ascending order of
70heap_no and column index, starting backwards from the dense page
71directory.
72
73The compressed data stream may be followed by a modification log
74covering the compressed portion of the page, as follows.
75
76MODIFICATION LOG ENTRY FORMAT
77- write record:
78  - (heap_no - 1) << 1 (1..2 bytes)
79  - extra bytes backwards
80  - data bytes
81- clear record:
82  - (heap_no - 1) << 1 | 1 (1..2 bytes)
83
84The integer values are stored in a variable-length format:
85- 0xxxxxxx: 0..127
86- 1xxxxxxx xxxxxxxx: 0..32767
87
88The end of the modification log is marked by a 0 byte.
89
90In summary, the compressed page looks like this:
91
92(1) Uncompressed page header (PAGE_DATA bytes)
93(2) Compressed index information
94(3) Compressed page data
95(4) Page modification log (page_zip->m_start..page_zip->m_end)
96(5) Empty zero-filled space
97(6) BLOB pointers (on leaf pages)
98  - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
99  - in descending collation order
100(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
101  - indexed by heap_no
102  - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
103  - REC_NODE_PTR_SIZE for non-leaf pages
104  - 0 otherwise
105(8) dense page directory, stored backwards
106  - n_dense = n_heap - 2
107  - existing records in ascending collation order
108  - deleted records (free list) in link order
109*/
110
111/** Start offset of the area that will be compressed */
112#define PAGE_ZIP_START		PAGE_NEW_SUPREMUM_END
113/** Size of an compressed page directory entry */
114#define PAGE_ZIP_DIR_SLOT_SIZE	2
115/** Mask of record offsets */
116#define PAGE_ZIP_DIR_SLOT_MASK	0x3fff
117/** 'owned' flag */
118#define PAGE_ZIP_DIR_SLOT_OWNED	0x4000
119/** 'deleted' flag */
120#define PAGE_ZIP_DIR_SLOT_DEL	0x8000
121
122/**********************************************************************//**
123Determine the size of a compressed page in bytes.
124@return	size in bytes */
125UNIV_INLINE
126ulint
127page_zip_get_size(
128/*==============*/
129	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
130{
131	ulint	size;
132
133	if (!page_zip->ssize) {
134		return(0);
135	}
136
137	size = (UNIV_ZIP_SIZE_MIN >> 1) << page_zip->ssize;
138
139	ut_ad(size >= UNIV_ZIP_SIZE_MIN);
140	ut_ad(size <= UNIV_PAGE_SIZE);
141
142	return(size);
143}
144/**********************************************************************//**
145Set the size of a compressed page in bytes. */
146UNIV_INLINE
147void
148page_zip_set_size(
149/*==============*/
150	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
151	ulint		size)		/*!< in: size in bytes */
152{
153	if (size) {
154		int	ssize;
155
156		ut_ad(ut_is_2pow(size));
157
158		for (ssize = 1; size > (ulint) (512 << ssize); ssize++) {
159		}
160
161		page_zip->ssize = ssize;
162	} else {
163		page_zip->ssize = 0;
164	}
165
166	ut_ad(page_zip_get_size(page_zip) == size);
167}
168
169#ifndef UNIV_HOTBACKUP
170/**********************************************************************//**
171Determine if a record is so big that it needs to be stored externally.
172@return	FALSE if the entire record can be stored locally on the page */
173UNIV_INLINE
174ibool
175page_zip_rec_needs_ext(
176/*===================*/
177	ulint	rec_size,	/*!< in: length of the record in bytes */
178	ulint	comp,		/*!< in: nonzero=compact format */
179	ulint	n_fields,	/*!< in: number of fields in the record;
180				ignored if zip_size == 0 */
181	ulint	zip_size)	/*!< in: compressed page size in bytes, or 0 */
182{
183	ut_ad(rec_size > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
184	ut_ad(ut_is_2pow(zip_size));
185	ut_ad(comp || !zip_size);
186
187#if UNIV_PAGE_SIZE_MAX > REC_MAX_DATA_SIZE
188	if (rec_size >= REC_MAX_DATA_SIZE) {
189		return(TRUE);
190	}
191#endif
192
193	if (zip_size) {
194		ut_ad(comp);
195		/* On a compressed page, there is a two-byte entry in
196		the dense page directory for every record.  But there
197		is no record header.  There should be enough room for
198		one record on an empty leaf page.  Subtract 1 byte for
199		the encoded heap number.  Check also the available space
200		on the uncompressed page. */
201		return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2 - 1)
202		       >= page_zip_empty_size(n_fields, zip_size)
203		       || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
204	}
205
206	return(rec_size >= page_get_free_space_of_empty(comp) / 2);
207}
208#endif /* !UNIV_HOTBACKUP */
209
210#ifdef UNIV_DEBUG
211/**********************************************************************//**
212Validate a compressed page descriptor.
213@return	TRUE if ok */
214UNIV_INLINE
215ibool
216page_zip_simple_validate(
217/*=====================*/
218	const page_zip_des_t*	page_zip)/*!< in: compressed page descriptor */
219{
220	ut_ad(page_zip);
221	ut_ad(page_zip->data);
222	ut_ad(page_zip->ssize <= PAGE_ZIP_SSIZE_MAX);
223	ut_ad(page_zip_get_size(page_zip)
224	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
225	ut_ad(page_zip->m_start <= page_zip->m_end);
226	ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
227	ut_ad(page_zip->n_blobs
228	      < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
229	return(TRUE);
230}
231#endif /* UNIV_DEBUG */
232
233/**********************************************************************//**
234Determine if the length of the page trailer.
235@return length of the page trailer, in bytes, not including the
236terminating zero byte of the modification log */
237UNIV_INLINE
238ibool
239page_zip_get_trailer_len(
240/*=====================*/
241	const page_zip_des_t*	page_zip,/*!< in: compressed page */
242	ibool			is_clust)/*!< in: TRUE if clustered index */
243{
244	ulint	uncompressed_size;
245
246	ut_ad(page_zip_simple_validate(page_zip));
247	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
248
249	if (!page_is_leaf(page_zip->data)) {
250		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
251			+ REC_NODE_PTR_SIZE;
252		ut_ad(!page_zip->n_blobs);
253	} else if (is_clust) {
254		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE
255			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
256	} else {
257		uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
258		ut_ad(!page_zip->n_blobs);
259	}
260
261	return((page_dir_get_n_heap(page_zip->data) - 2)
262	       * uncompressed_size
263	       + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
264}
265
266/**********************************************************************//**
267Determine how big record can be inserted without recompressing the page.
268@return a positive number indicating the maximum size of a record
269whose insertion is guaranteed to succeed, or zero or negative */
270UNIV_INLINE
271lint
272page_zip_max_ins_size(
273/*==================*/
274	const page_zip_des_t*	page_zip,/*!< in: compressed page */
275	ibool			is_clust)/*!< in: TRUE if clustered index */
276{
277	ulint	trailer_len;
278
279	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
280
281	/* When a record is created, a pointer may be added to
282	the dense directory.
283	Likewise, space for the columns that will not be
284	compressed will be allocated from the page trailer.
285	Also the BLOB pointers will be allocated from there, but
286	we may as well count them in the length of the record. */
287
288	trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
289
290	return((lint) page_zip_get_size(page_zip)
291	       - trailer_len - page_zip->m_end
292	       - (REC_N_NEW_EXTRA_BYTES - 2));
293}
294
295/**********************************************************************//**
296Determine if enough space is available in the modification log.
297@return	TRUE if enough space is available */
298UNIV_INLINE
299ibool
300page_zip_available(
301/*===============*/
302	const page_zip_des_t*	page_zip,/*!< in: compressed page */
303	ibool			is_clust,/*!< in: TRUE if clustered index */
304	ulint			length,	/*!< in: combined size of the record */
305	ulint			create)	/*!< in: nonzero=add the record to
306					the heap */
307{
308	ulint	trailer_len;
309
310	ut_ad(length > REC_N_NEW_EXTRA_BYTES);
311
312	trailer_len = page_zip_get_trailer_len(page_zip, is_clust);
313
314	/* Subtract the fixed extra bytes and add the maximum
315	space needed for identifying the record (encoded heap_no). */
316	length -= REC_N_NEW_EXTRA_BYTES - 2;
317
318	if (create > 0) {
319		/* When a record is created, a pointer may be added to
320		the dense directory.
321		Likewise, space for the columns that will not be
322		compressed will be allocated from the page trailer.
323		Also the BLOB pointers will be allocated from there, but
324		we may as well count them in the length of the record. */
325
326		trailer_len += PAGE_ZIP_DIR_SLOT_SIZE;
327	}
328
329	return(length + trailer_len + page_zip->m_end
330	       < page_zip_get_size(page_zip));
331}
332
333/**********************************************************************//**
334Initialize a compressed page descriptor. */
335UNIV_INLINE
336void
337page_zip_des_init(
338/*==============*/
339	page_zip_des_t*	page_zip)	/*!< in/out: compressed page
340					descriptor */
341{
342	memset(page_zip, 0, sizeof *page_zip);
343}
344
345/**********************************************************************//**
346Write a log record of writing to the uncompressed header portion of a page. */
347UNIV_INTERN
348void
349page_zip_write_header_log(
350/*======================*/
351	const byte*	data,/*!< in: data on the uncompressed page */
352	ulint		length,	/*!< in: length of the data */
353	mtr_t*		mtr);	/*!< in: mini-transaction */
354
355/**********************************************************************//**
356Write data to the uncompressed header portion of a page.  The data must
357already have been written to the uncompressed page.
358However, the data portion of the uncompressed page may differ from
359the compressed page when a record is being inserted in
360page_cur_insert_rec_zip(). */
361UNIV_INLINE
362void
363page_zip_write_header(
364/*==================*/
365	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
366	const byte*	str,	/*!< in: address on the uncompressed page */
367	ulint		length,	/*!< in: length of the data */
368	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
369{
370	ulint	pos;
371
372	ut_ad(PAGE_ZIP_MATCH(str, page_zip));
373	ut_ad(page_zip_simple_validate(page_zip));
374	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
375
376	pos = page_offset(str);
377
378	ut_ad(pos < PAGE_DATA);
379
380	memcpy(page_zip->data + pos, str, length);
381
382	/* The following would fail in page_cur_insert_rec_zip(). */
383	/* ut_ad(page_zip_validate(page_zip, str - pos)); */
384
385	if (mtr) {
386#ifndef UNIV_HOTBACKUP
387		page_zip_write_header_log(str, length, mtr);
388#endif /* !UNIV_HOTBACKUP */
389	}
390}
391
392/**********************************************************************//**
393Write a log record of compressing an index page without the data on the page. */
394UNIV_INLINE
395void
396page_zip_compress_write_log_no_data(
397/*================================*/
398	ulint		level,	/*!< in: compression level */
399	const page_t*	page,	/*!< in: page that is compressed */
400	dict_index_t*	index,	/*!< in: index */
401	mtr_t*		mtr)	/*!< in: mtr */
402{
403	byte* log_ptr = mlog_open_and_write_index(
404		mtr, page, index, MLOG_ZIP_PAGE_COMPRESS_NO_DATA, 1);
405
406	if (log_ptr) {
407		mach_write_to_1(log_ptr, level);
408		mlog_close(mtr, log_ptr + 1);
409	}
410}
411
412/**********************************************************************//**
413Parses a log record of compressing an index page without the data.
414@return	end of log record or NULL */
415UNIV_INLINE
416byte*
417page_zip_parse_compress_no_data(
418/*============================*/
419	byte*		ptr,		/*!< in: buffer */
420	byte*		end_ptr,	/*!< in: buffer end */
421	page_t*		page,		/*!< in: uncompressed page */
422	page_zip_des_t*	page_zip,	/*!< out: compressed page */
423	dict_index_t*	index)		/*!< in: index */
424{
425	ulint	level;
426	if (end_ptr == ptr) {
427		return(NULL);
428	}
429
430	level = mach_read_from_1(ptr);
431
432	/* If page compression fails then there must be something wrong
433	because a compress log record is logged only if the compression
434	was successful. Crash in this case. */
435
436	if (page
437	    && !page_zip_compress(page_zip, page, index, level, NULL)) {
438		ut_error;
439	}
440
441	return(ptr + 1);
442}
443
444/**********************************************************************//**
445Reset the counters used for filling
446INFORMATION_SCHEMA.innodb_cmp_per_index. */
447UNIV_INLINE
448void
449page_zip_reset_stat_per_index()
450/*===========================*/
451{
452	mutex_enter(&page_zip_stat_per_index_mutex);
453
454	page_zip_stat_per_index.erase(
455		page_zip_stat_per_index.begin(),
456		page_zip_stat_per_index.end());
457
458	mutex_exit(&page_zip_stat_per_index_mutex);
459}
460
461#ifdef UNIV_MATERIALIZE
462# undef UNIV_INLINE
463# define UNIV_INLINE	UNIV_INLINE_ORIGINAL
464#endif
465