1 /* Copyright (C) 2007-2008 Michael Widenius
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15
16 /*
17 Storage of records in block
18
19 Some clarifications about the abbrev used:
20
NULL fields -> Fields that may contain a NULL value.
22 Not null fields -> Fields that may not contain a NULL value.
23 Critical fields -> Fields that can't be null and can't be dropped without
24 causing a table reorganization.
25
26
27 Maria will have a LSN at start of each page (excluding the bitmap pages)
28
29 The different page types that are in a data file are:
30
31 Bitmap pages Map of free pages in the next extent (8192 page size
32 gives us 256M of mapped pages / bitmap)
33 Head page Start of rows are stored on this page.
34 A rowid always points to a head page
35 Blob page This page is totally filled with data from one blob or by
36 a set of long VARCHAR/CHAR fields
37 Tail page This contains the last part from different rows, blobs
38 or varchar fields.
39
40 The data file starts with a bitmap page, followed by as many data
41 pages as the bitmap can cover. After this there is a new bitmap page
42 and more data pages etc.
43
44 For information about the bitmap page, see ma_bitmap.c
45
46 Structure of data and tail page:
47
48 The page has a row directory at end of page to allow us to do deletes
49 without having to reorganize the page. It also allows us to later store
50 some more bytes after each row to allow them to grow without having to move
51 around other rows.
52
53 Page header:
54
55 LSN 7 bytes Log position for last page change
56 PAGE_TYPE 1 uchar 0 unalloced / 1 for head / 2 for tail / 3 for blob
57 DIR_COUNT 1 uchar Number of row/tail entries on page
58 FREE_DIR_LINK 1 uchar Pointer to first free director entry or 255 if no
59 empty space 2 bytes Bytes of empty space on page
60
61 The most significant bit in PAGE_TYPE is set to 1 if the data on the page
62 can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
63
64 Row data
65
66 Row directory of NO entries, that consist of the following for each row
67 (in reverse order; i.e., first record is stored last):
68
69 Position 2 bytes Position of row on page
70 Length 2 bytes Length of entry
71
72 For Position and Length, the 1 most significant bit of the position and
73 the 1 most significant bit of the length could be used for some states of
74 the row (in other words, we should try to keep these reserved)
75
76 Position is 0 if the entry is not used. In this case length[0] points
77 to a previous free entry (255 if no previous entry) and length[1]
78 to the next free entry (or 255 if last free entry). This works because
79 the directory entry 255 can never be marked free (if the first directory
entry is freed, the directory is shrunk).
81
82 checksum 4 bytes Reserved for full page read testing and live backup.
83
84 ----------------
85
86 Structure of blob pages:
87
88 LSN 7 bytes Log position for last page change
89 PAGE_TYPE 1 uchar 3
90
91 data
92
93 -----------------
94
95 Row data structure:
96
97 Flag 1 uchar Marker of which header field exists
98 TRANSID 6 bytes TRANSID of changing transaction
99 (optional, added on insert and first
100 update/delete)
101 VER_PTR 7 bytes Pointer to older version in log
102 (undo record)
103 (optional, added after first
104 update/delete)
105 DELETE_TRANSID 6 bytes (optional). TRANSID of original row.
106 Added on delete.
107 Nulls_extended 1 uchar To allow us to add new DEFAULT NULL
108 fields (optional, added after first
109 change of row after alter table)
110 Number of ROW_EXTENT's 1-3 uchar Length encoded, optional
111 This is the number of extents the
112 row is split into
113 First row_extent 7 uchar Pointer to first row extent (optional)
114
115 Total length of length array 1-3 uchar Only used if we have
116 char/varchar/blob fields.
117 Row checksum 1 uchar Only if table created with checksums
118 Null_bits .. One bit for each NULL field (a field that may
119 have the value NULL)
120 Empty_bits .. One bit for each field that may be 'empty'.
121 (Both for null and not null fields).
122 This bit is 1 if the value for the field is
123 0 or empty string.
124
125 field_offsets 2 byte/offset
126 For each 32'th field, there is one offset
127 that points to where the field information
128 starts in the block. This is to provide
129 fast access to later field in the row
130 when we only need to return a small
131 set of fields.
132 TODO: Implement this.
133
134 Things marked above as 'optional' will only be present if the
135 corresponding bit is set in 'Flag' field. Flag gives us a way to
136 get more space on a page when doing page compaction as we don't need
137 to store TRANSID that have committed before the smallest running
138 transaction we have in memory.
139
140 Data in the following order:
141 (Field order is precalculated when table is created)
142
143 Critical fixed length, not null, fields. (Note, these can't be dropped)
144 Fixed length, null fields
145
146 Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
147 Number of bytes used in length array per entry is depending on max length
148 for field.
149
150 ROW_EXTENT's
151 CHAR data (space stripped)
152 VARCHAR data
153 BLOB data
154
155 Fields marked in null_bits or empty_bits are not stored in data part or
156 length array.
157
158 If row doesn't fit into the given block, then the first EXTENT will be
159 stored last on the row. This is done so that we don't break any field
160 data in the middle.
161
162 We first try to store the full row into one block. If that's not possible
163 we move out each big blob into their own extents. If this is not enough we
164 move out a concatenation of all varchars to their own extent.
165
166 Each blob and the concatenated char/varchar fields are stored the following
167 way:
168 - Store the parts in as many full-contiguous pages as possible.
169 - The last part, that doesn't fill a full page, is stored in tail page.
170
171 When doing an insert of a new row, we don't have to have
172 VER_PTR in the row. This will make rows that are not changed stored
173 efficiently. On update and delete we would add TRANSID (if it was an old
174 committed row) and VER_PTR to
175 the row. On row page compaction we can easily detect rows where
176 TRANSID was committed before the longest running transaction
177 started and we can then delete TRANSID and VER_PTR from the row to
178 gain more space.
179
180 If a row is deleted in Maria, we change TRANSID to the deleting
181 transaction's id, change VER_PTR to point to the undo record for the delete,
182 and add DELETE_TRANSID (the id of the transaction which last
183 inserted/updated the row before its deletion). DELETE_TRANSID allows an old
184 transaction to avoid reading the log to know if it can see the last version
185 before delete (in other words it reduces the probability of having to follow
186 VER_PTR). TODO: depending on a compilation option, evaluate the performance
187 impact of not storing DELETE_TRANSID (which would make the row smaller).
188
189 Description of the different parts:
190
191 Flag is coded as:
192
193 Description bit
194 TRANS_ID_exists 0
195 VER_PTR_exists 1
196 Row is deleted 2 (Means that DELETE_TRANSID exists)
197 Nulls_extended_exists 3
198 Row is split 7 This means that 'Number_of_row_extents' exists
199
200 Nulls_extended is the number of new DEFAULT NULL fields in the row
201 compared to the number of DEFAULT NULL fields when the first version
202 of the table was created. If Nulls_extended doesn't exist in the row,
203 we know it's 0 as this must be one of the original rows from when the
table was created first time. This coding allows us to add 255*8 =
2040 new fields without requiring a full alter table.
206
207 Empty_bits is used to allow us to store 0, 0.0, empty string, empty
208 varstring and empty blob efficiently. (This is very good for data
209 warehousing where NULL's are often regarded as evil). Having this
210 bitmap also allows us to drop information of a field during a future
211 delete if field was deleted with ALTER TABLE DROP COLUMN. To be able
212 to handle DROP COLUMN, we must store in the index header the fields
213 that has been dropped. When unpacking a row we will ignore dropped
214 fields. When storing a row, we will mark a dropped field either with a
215 null in the null bit map or in the empty_bits and not store any data
216 for it.
217 TODO: Add code for handling dropped fields.
218
219
A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as:
221
222 START_PAGE 5 bytes
223 PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page.
Bit 15 is set if this is the start of a new
blob extent.
226
227 With 8K pages, we can cover 256M in one extent. This coding gives us a
228 maximum file size of 2^40*8192 = 8192 tera
229
230 As an example of ROW_EXTENT handling, assume a row with one integer
231 field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
232 big BLOB fields that we have updated.
233
234 The record format for storing this into an empty file would be:
235
236 Page 1:
237
238 00 00 00 00 00 00 00 LSN
239 01 Only one row in page
240 FF No free dir entry
241 xx xx Empty space on page
242
243 10 Flag: row split, VER_PTR exists
244 01 00 00 00 00 00 TRANSID 1
245 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1
246 5 Number of row extents
247 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4
248 0 No null fields
249 0 No empty fields
250 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0
251 06 00 00 00 00 80 00 First blob, stored at page 6-133
252 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5
253 86 00 00 00 00 80 00 Second blob, stored at page 134-262
254 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5
255 05 00 5 integer
256 FA Length of first varchar field (size 250)
257 00 60 Length of second varchar field (size 8192*3)
258 00 60 10 First medium BLOB, 1M
259 01 00 10 00 Second BLOB, 1M
260 xx xx xx xx xx xx Varchars are stored here until end of page
261
262 ..... until end of page
263
264 09 00 F4 1F Start position 9, length 8180
265 xx xx xx xx Checksum
266
267 A data page is allowed to have a wrong CRC and header as long as it is
268 marked empty in the bitmap and its directory's count is 0.
269 */
270
271 #include "maria_def.h"
272 #include "ma_blockrec.h"
273 #include "trnman.h"
274 #include "ma_trnman.h"
275 #include "ma_key_recover.h"
276 #include "ma_recovery_util.h"
277 #include <lf.h>
278
279 /*
280 Struct for having a cursor over a set of extent.
281 This is used to loop over all extents for a row when reading
282 the row data. It's also used to store the tail positions for
283 a read row to be used by a later update/delete command.
284 */
285
typedef struct st_maria_extent_cursor
{
  /*
    Pointer to packed uchar array of extents for the row.
    Format is described above in the header (7 bytes per ROW_EXTENT:
    5 bytes START_PAGE + 2 bytes PAGE_COUNT).
  */
  uchar *extent;
  /* Where data starts on page; Only for debugging */
  uchar *data_start;
  /* Position to all tails in the row. Updated when reading a row */
  MARIA_RECORD_POS *tail_positions;
  /* Current page number */
  pgcache_page_no_t page;
  /* How many pages are left in the current page region */
  uint page_count;
  /* What kind of lock to use for tail pages */
  enum pagecache_page_lock lock_for_tail_pages;
  /* Total number of extents (i.e., entries in the 'extent' slot) */
  uint extent_count;
  /* <> 0 if current extent is a tail page; Set while using cursor */
  uint tail;
  /* Position for tail on tail page */
  uint tail_row_nr;
  /*
    == 1 if we are working on the first extent (i.e., the one that is stored
    in the row header, not an extent that is stored as part of the row data).
  */
  my_bool first_extent;
} MARIA_EXTENT_CURSOR;
315
316
317 /**
318 @brief Structure for passing down info to write_hook_for_clr_end().
This hook needs to know the variation of the live checksum caused by the
320 current operation to update state.checksum under log's mutex,
321 needs to know the transaction's previous undo_lsn to set
322 trn->undo_lsn under log mutex, and needs to know the type of UNDO being
323 undone now to modify state.records under log mutex.
324 */
325
/**
  Store the row checksum into the record buffer, if the table uses checksums.

  S:share, D:checksum_delta, E:expression, P:pointer_into_record, L:length.
  D is first reset to 0; if the table maintains a live checksum
  ((S)->calc_checksum != NULL), D is set to the value of expression E,
  stored at P and L is incremented by HA_CHECKSUM_STORE_SIZE.
*/
#define store_checksum_in_rec(S,D,E,P,L) do \
{ \
  D= 0; \
  if ((S)->calc_checksum != NULL) \
  { \
    D= (E); \
    ha_checksum_store(P, D); \
    L+= HA_CHECKSUM_STORE_SIZE; \
  } \
} while (0)
337
338
/* Forward declarations of local helper functions, defined later in this file */

static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update);
#ifndef DBUG_OFF
static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size);
#endif
static uchar *store_page_range(MARIA_SHARE *share,
                               uchar *to, MARIA_BITMAP_BLOCK *block,
                               ulong length,
                               uint *tot_ranges);
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
358
359 /****************************************************************************
360 Initialization
361 ****************************************************************************/
362
363 /*
364 Initialize data needed for block structures
365 */
366
367
/*
  Size of the different optional header elements for a row.
  Index order matches the bits of the row 'Flag' byte (see file header).
*/

static uchar header_sizes[]=
{
  TRANSID_SIZE,                                 /* TRANSID */
  VERPTR_SIZE,                                  /* VER_PTR */
  TRANSID_SIZE,                                 /* Delete transid */
  1                                             /* Null extends */
};
377
378 /*
379 Calculate array of all used headers
380
381 Used to speed up:
382
383 size= 1;
384 if (flag & 1)
385 size+= TRANSID_SIZE;
386 if (flag & 2)
387 size+= VERPTR_SIZE;
388 if (flag & 4)
389 size+= TRANSID_SIZE
390 if (flag & 8)
391 size+= 1;
392
393 NOTES
394 This is called only once at startup of Maria
395 */
396
/*
  total_header_size[flag] is the row header length for that combination of
  the optional header parts in header_sizes[]; it is filled in once at
  startup by _ma_init_block_record_data().
*/
static uchar total_header_size[1 << array_elements(header_sizes)];
#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
399
_ma_init_block_record_data(void)400 void _ma_init_block_record_data(void)
401 {
402 uint i;
403 bzero(total_header_size, sizeof(total_header_size));
404 total_header_size[0]= FLAG_SIZE; /* Flag uchar */
405 for (i= 1; i < array_elements(total_header_size); i++)
406 {
407 uint size= FLAG_SIZE, j, bit;
408 for (j= 0; (bit= (1 << j)) <= i; j++)
409 {
410 if (i & bit)
411 size+= header_sizes[j];
412 }
413 total_header_size[i]= size;
414 }
415 }
416
417
_ma_once_init_block_record(MARIA_SHARE * share,File data_file)418 my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
419 {
420 my_bool res;
421 pgcache_page_no_t last_page;
422
423 /*
424 First calculate the max file length with can have with a pointer of size
425 rec_reflength.
426
427 The 'rec_reflength - 1' is because one byte is used for row
428 position withing the page.
429 The /2 comes from _ma_transaction_recpos_to_keypos() where we use
430 the lowest bit to mark if there is a transid following the rownr.
431 */
432 last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2;
433 if (!last_page) /* Overflow; set max size */
434 last_page= ~(pgcache_page_no_t) 0;
435
436 res= _ma_bitmap_init(share, data_file, &last_page);
437 share->base.max_data_file_length= _ma_safe_mul(last_page + 1,
438 share->block_size);
439 #if SIZEOF_OFF_T == 4
440 set_if_smaller(share->base.max_data_file_length, INT_MAX32);
441 #endif
442 return res;
443 }
444
445
_ma_once_end_block_record(MARIA_SHARE * share)446 my_bool _ma_once_end_block_record(MARIA_SHARE *share)
447 {
448 int res= _ma_bitmap_end(share);
449 if (share->bitmap.file.file >= 0)
450 {
451 if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
452 share->deleting ? FLUSH_IGNORE_CHANGED : FLUSH_RELEASE))
453 res= 1;
454 /*
455 File must be synced as it is going out of the maria_open_list and so
456 becoming unknown to Checkpoint.
457 */
458 if (share->now_transactional &&
459 mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
460 res= 1;
461 if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
462 res= 1;
463 /*
464 Trivial assignment to guard against multiple invocations
465 (May happen if file are closed but we want to keep the maria object
466 around a bit longer)
467 */
468 share->bitmap.file.file= -1;
469 }
470 if (share->id != 0)
471 {
472 /*
473 We de-assign the id even though index has not been flushed, this is ok
474 as close_lock serializes us with a Checkpoint looking at our share.
475 */
476 translog_deassign_id_from_share(share);
477 }
478 return res;
479 }
480
481
482 /* Init info->cur_row structure */
483
my_bool _ma_init_block_record(MARIA_HA *info)
{
  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
  MARIA_SHARE *share= info->s;
  uint default_extents;
  DBUG_ENTER("_ma_init_block_record");

  /*
    Allocate all per-row buffers (for both cur_row and new_row) in one
    chunk; freed via row->empty_bits in _ma_end_block_record().
  */
  if (!my_multi_malloc(MY_WME,
                       &row->empty_bits, share->base.pack_bytes,
                       &row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
                       &row->null_field_lengths, (sizeof(uint) *
                                                  (share->base.fields -
                                                   share->base.blobs +
                                                   EXTRA_LENGTH_FIELDS)),
                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
                                              (share->base.blobs + 2)),
                       &new_row->empty_bits, share->base.pack_bytes,
                       &new_row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &new_row->blob_lengths,
                       sizeof(ulong) * share->base.blobs,
                       &new_row->null_field_lengths, (sizeof(uint) *
                                                      (share->base.fields -
                                                       share->base.blobs +
                                                       EXTRA_LENGTH_FIELDS)),
                       &info->log_row_parts,
                       sizeof(*info->log_row_parts) *
                       (TRANSLOG_INTERNAL_PARTS + 3 +
                        share->base.fields + 3),
                       &info->update_field_data,
                       (share->base.fields * 4 +
                        share->base.max_field_lengths + 1 + 4),
                       NullS, 0))
    DBUG_RETURN(1);
  /* Skip over bytes used to store length of field length for logging */
  row->field_lengths+= 2;
  new_row->field_lengths+= 2;

  /* Reserve some initial space to avoid mallocs during execution */
  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
                    (AVERAGE_BLOB_SIZE /
                     FULL_PAGE_SIZE(share) /
                     BLOB_SEGMENT_MIN_SIZE));

  if (my_init_dynamic_array(&info->bitmap_blocks,
                            sizeof(MARIA_BITMAP_BLOCK), default_extents,
                            64, MYF(0)))
    goto err;
  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
                                         MYF(MY_WME))))
    goto err;

  info->row_base_length= share->base_length;
  info->row_flag= share->base.default_row_flag;

  /*
    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
  */
  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;

  DBUG_RETURN(0);

err:
  /* Frees whatever was allocated above; safe on partial initialization */
  _ma_end_block_record(info);
  DBUG_RETURN(1);
}
555
556
void _ma_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_end_block_record");
  /* empty_bits is the head of the my_multi_malloc() chunk; one free */
  my_free(info->cur_row.empty_bits);
  my_free(info->cur_row.extents);
  my_free(info->blob_buff);
  delete_dynamic(&info->bitmap_blocks);
  /*
    The data file is closed, when needed, in ma_once_end_block_record().
    The following protects us from doing an extra, not allowed, close
    in maria_close()
  */
  info->dfile.file= -1;
  DBUG_VOID_RETURN;
}
572
573
574 /****************************************************************************
575 Helper functions
576 ****************************************************************************/
577
578 /*
Return the next unused position on the page after a directory entry.
580
581 SYNOPSIS
582 start_of_next_entry()
dir        Directory entry to be used. This can not be
           the last entry on the page!
585
586 RETURN
587 # Position in page where next entry starts.
588 Everything between the '*dir' and this are free to be used.
589 */
590
start_of_next_entry(uchar * dir)591 static inline uint start_of_next_entry(uchar *dir)
592 {
593 uchar *prev;
594 /*
595 Find previous used entry. (There is always a previous entry as
596 the directory never starts with a deleted entry)
597 */
598 for (prev= dir - DIR_ENTRY_SIZE ;
599 prev[0] == 0 && prev[1] == 0 ;
600 prev-= DIR_ENTRY_SIZE)
601 {}
602 return (uint) uint2korr(prev);
603 }
604
605
606 /*
607 Return the offset where the previous entry ends (before on page)
608
609 SYNOPSIS
610 end_of_previous_entry()
611 dir Address for current directory entry
612 end Address to last directory entry
613
614 RETURN
615 # Position where previous entry ends (smallest address on page)
616 Everything between # and current entry are free to be used.
617 */
618
619
end_of_previous_entry(MARIA_SHARE * share,uchar * dir,uchar * end)620 static inline uint end_of_previous_entry(MARIA_SHARE *share,
621 uchar *dir, uchar *end)
622 {
623 uchar *pos;
624 for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
625 {
626 uint offset;
627 if ((offset= uint2korr(pos)))
628 return offset + uint2korr(pos+2);
629 }
630 return PAGE_HEADER_SIZE(share);
631 }
632
633
634 #ifndef DBUG_OFF
635
/* Dump the page directory (offset:length pairs) to 'file' for debugging */

static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size)
{
  uint entry_count= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint row;
  uchar *first, *entry;

  /* The directory grows backwards from the end of the page */
  first= dir_entry_pos(buff, block_size, entry_count-1);
  entry= dir_entry_pos(buff, block_size, 0);

  DBUG_LOCK_FILE; /* If using DBUG_FILE */
  fprintf(file,"Directory dump (pos:length):\n");

  for (row= 1; first <= entry ; entry-= DIR_ENTRY_SIZE, row++)
  {
    uint offset= uint2korr(entry);
    uint length= uint2korr(entry+2);
    fprintf(file, " %4u:%4u", offset, offset ? length : 0);
    if (!(row % (80/12)))                 /* Line break every few columns */
      fputc('\n', file);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      end_of_prev_row= offset + length;
    }
  }
  fputc('\n', file);
  fflush(file);
  DBUG_UNLOCK_FILE;
}
666
667
/*
  Debug check of a page directory: verifies that used entries are in
  increasing order without overlap, that the accounted empty space matches
  the page header (or 'real_empty_size' when given), and that the free-list
  links exactly the deleted entries.  Compiled away in non-debug builds.
*/

static void check_directory(MARIA_SHARE *share,
                            uchar *buff, uint block_size, uint min_row_length,
                            uint real_empty_size)
{
  uchar *entry, *last;
  uint n_entries= (uint) buff[DIR_COUNT_OFFSET];
  uint dir_start, n_deleted;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint expected_empty;
  uint computed_empty;
  uchar link, prev_link;

  entry= dir_entry_pos(buff, block_size, n_entries-1);
  dir_start= (uint) (entry - buff);
  last= dir_entry_pos(buff, block_size, 0);
  n_deleted= computed_empty= 0;

  /* Caller may pass the expected empty size; else trust the page header */
  expected_empty= (real_empty_size != (uint) -1 ? real_empty_size :
                   uint2korr(buff + EMPTY_SPACE_OFFSET));

  /* Ensure that all rows are in increasing order and no overlaps */
  for (; entry <= last ; last-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(last);
    uint length= uint2korr(last+2);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      DBUG_ASSERT(!length || length >= min_row_length);
      computed_empty+= offset - end_of_prev_row;
      end_of_prev_row= offset + length;
    }
    else
      n_deleted++;
  }
  computed_empty+= dir_start - end_of_prev_row;
  DBUG_ASSERT(end_of_prev_row <= dir_start);
  DBUG_ASSERT(computed_empty == expected_empty);

  /* Walk the free list; every deleted entry must be on it exactly once */
  link= buff[DIR_FREE_OFFSET];
  prev_link= END_OF_DIR_FREE_LIST;
  while (link != END_OF_DIR_FREE_LIST)
  {
    uchar *free_dir= dir_entry_pos(buff, block_size, link);
    DBUG_ASSERT(free_dir[0] == 0 && free_dir[1] == 0);
    DBUG_ASSERT(free_dir[2] == prev_link);
    prev_link= link;
    link= free_dir[3];
    n_deleted--;
  }
  DBUG_ASSERT(n_deleted == 0);
}
721 #else
722 #define check_directory(A,B,C,D,E)
723 #endif /* DBUG_OFF */
724
725
726 /**
@brief Calculate if there are enough free entries on the page
728 */
729
enough_free_entries(uchar * buff,uint block_size,uint wanted_entries)730 static my_bool enough_free_entries(uchar *buff, uint block_size,
731 uint wanted_entries)
732 {
733 uint entries= (uint) buff[DIR_COUNT_OFFSET];
734 uint needed_free_entries, free_entry;
735
736 if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
737 return 1;
738
739 /* Check if enough free entries in free list */
740 needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
741
742 free_entry= (uint) buff[DIR_FREE_OFFSET];
743 while (free_entry != END_OF_DIR_FREE_LIST)
744 {
745 uchar *dir;
746 if (!--needed_free_entries)
747 return 1;
748 dir= dir_entry_pos(buff, block_size, free_entry);
749 free_entry= dir[3];
750 }
751 return 0; /* Not enough entries */
752 }
753
754
755 /**
756 @brief Check if there is room for more rows on page
757
758 @fn enough_free_entries_on_page
759
760 @return 0 Directory is full
761 @return 1 There is room for more entries on the page
762 */
763
enough_free_entries_on_page(MARIA_SHARE * share,uchar * page_buff)764 my_bool enough_free_entries_on_page(MARIA_SHARE *share,
765 uchar *page_buff)
766 {
767 enum en_page_type page_type;
768 page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
769 ~(uchar) PAGE_CAN_BE_COMPACTED);
770
771 if (page_type == HEAD_PAGE)
772 {
773 uint row_count= (uint) page_buff[DIR_COUNT_OFFSET];
774 return !(row_count == MAX_ROWS_PER_PAGE &&
775 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
776 }
777 return enough_free_entries(page_buff, share->block_size,
778 1 + share->base.blobs);
779 }
780
781
782 /**
783 @brief Extend a record area to fit a given size block
784
785 @fn extend_area_on_page()
786 @param info Handler
787 @param buff Page buffer
788 @param dir Pointer to dir entry in buffer
789 @param rownr Row number we working on
790 @param block_size Block size of buffer
791 @param request_length How much data we want to put at [dir]
792 @param empty_space Total empty space in buffer
793 This is updated with length after dir
794 is allocated and current block freed
795 @param head_page 1 if head page, 0 for tail page
796
797 @implementation
798 The logic is as follows (same as in _ma_update_block_record())
799 - If new data fits in old block, use old block.
800 - Extend block with empty space before block. If enough, use it.
801 - Extend block with empty space after block. If enough, use it.
802 - Use _ma_compact_block_page() to get all empty space at dir.
803
804 @note
805 The given directory entry is set to rec length.
806 empty_space doesn't include the new directory entry
807
808
809 @return
810 @retval 0 ok
811 @retval ret_offset Pointer to store offset to found area
812 @retval ret_length Pointer to store length of found area
813 @retval [dir] rec_offset is store here too
814
815 @retval 1 error (wrong info in block)
816 */
817
static my_bool extend_area_on_page(MARIA_HA *info,
                                   uchar *buff, uchar *dir,
                                   uint rownr,
                                   uint request_length,
                                   uint *empty_space, uint *ret_offset,
                                   uint *ret_length,
                                   my_bool head_page)
{
  uint rec_offset, length, org_rec_length;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  DBUG_ENTER("extend_area_on_page");

  /*
    We can't check for min length here as we may have called
    extend_directory() to create a new (empty) entry just before
  */
  check_directory(share, buff, block_size, 0, *empty_space);

  rec_offset= uint2korr(dir);
  if (rec_offset)
  {
    /* Extending old row; Mark current space as 'free' */
    length= org_rec_length= uint2korr(dir + 2);
    DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u "
                        "empty_space: %u",
                        rec_offset, org_rec_length, request_length,
                        *empty_space));

    *empty_space+= org_rec_length;
  }
  else
  {
    /* Reusing free directory entry; Free it from the directory list */
    if (dir[2] == END_OF_DIR_FREE_LIST)
      buff[DIR_FREE_OFFSET]= dir[3];      /* Entry was head of the free list */
    else
    {
      /* Unlink this entry from its previous free-list neighbour */
      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
      prev_dir[3]= dir[3];
    }
    if (dir[3] != END_OF_DIR_FREE_LIST)
    {
      /* Unlink this entry from its next free-list neighbour */
      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
      next_dir[2]= dir[2];
    }
    rec_offset= start_of_next_entry(dir);
    length= 0;
  }
  if (length < request_length)
  {
    uint old_rec_offset;
    /*
      New data did not fit in old position.
      Find first possible position where to put new data.
    */
    old_rec_offset= rec_offset;
    rec_offset= end_of_previous_entry(share,
                                      dir, buff + block_size -
                                      PAGE_SUFFIX_SIZE);
    length+= (uint) (old_rec_offset - rec_offset);
    DBUG_ASSERT(old_rec_offset);
    /*
      'length' is 0 if we are doing an insert into a not allocated block.
      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
    */
    if (length < request_length)
    {
      /*
        Did not fit in current block + empty space. Extend with
        empty space after block.
      */
      if (rownr == max_entry - 1)
      {
        /* Last entry; Everything is free between this and directory */
        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
                 rec_offset);
      }
      else
        length= start_of_next_entry(dir) - rec_offset;
      DBUG_ASSERT((int) length >= 0);
      if (length < request_length)
      {
        /* Not enough continuous space, compact page to get more */
        int2store(dir, rec_offset);
        /* Reset length, as this may be a deleted block */
        int2store(dir+2, 0);
        _ma_compact_block_page(share,
                               buff, rownr, 1,
                               head_page ? info->trn->min_read_from: 0,
                               head_page ? share->base.min_block_length : 0);
        rec_offset= uint2korr(dir);
        length= uint2korr(dir+2);
        if (length < request_length)
        {
          /* Even compaction did not free enough space: page is corrupted */
          DBUG_PRINT("error", ("Not enough space: "
                               "length: %u request_length: %u",
                               length, request_length));
          _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
          DBUG_RETURN(1); /* Error in block */
        }
        *empty_space= length; /* All space is here */
      }
    }
  }
  int2store(dir, rec_offset);
  int2store(dir + 2, length);
  *ret_offset= rec_offset;
  *ret_length= length;

  /* Re-verify directory consistency after the move/compaction */
  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space - length);
  DBUG_RETURN(0);
}
937
938
939 /**
940 @brief Copy not changed fields from 'from' to 'to'
941
942 @notes
943 Assumption is that most fields are not changed!
944 (Which is why we don't test if all bits are set for some bytes in bitmap)
945 */
946
void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
                             uchar *to, uchar *from)
{
  MARIA_COLUMNDEF *col, *col_end;
  uchar *map_byte= (uchar*) changed_fields->bitmap;
  MARIA_SHARE *share= info->s;
  uint mask= 1;

  for (col= share->columndef, col_end= col + share->base.fields;
       col < col_end; col++)
  {
    /* Bit clear means the field was not changed; copy it over */
    if (!(*map_byte & mask))
    {
      uint copy_length= col->length;
      if (col->type == FIELD_VARCHAR)
      {
        /* Copy only the used part: length prefix + actual data */
        copy_length= (col->fill_length == 1 ?
                      (uint) from[col->offset] + 1 :
                      uint2korr(from + col->offset) + 2);
      }
      memcpy(to + col->offset, from + col->offset, copy_length);
    }
    /* Advance to next bit; step to next bitmap byte after 8 fields */
    if ((mask= (mask << 1)) == 256)
    {
      map_byte++;
      mask= 1;
    }
  }
}
977
978 #ifdef NOT_YET_NEEDED
979 /* Calculate empty space on a page */
980
empty_space_on_page(uchar * buff,uint block_size)981 static uint empty_space_on_page(uchar *buff, uint block_size)
982 {
983 enum en_page_type;
984 page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
985 ~(uchar) PAGE_CAN_BE_COMPACTED);
986 if (page_type == UNALLOCATED_PAGE)
987 return block_size;
988 if ((uint) page_type <= TAIL_PAGE)
989 return uint2korr(buff+EMPTY_SPACE_OFFSET);
990 return 0; /* Blob page */
991 }
992 #endif
993
994
995 /*
996 @brief Ensure we have space for new directory entries
997
998 @fn make_space_for_directory()
999 @param info Handler
1000 @param buff Page buffer
1001 @param max_entry Number of current entries in directory
1002 @param count Number of new entries to be added to directory
1003 @param first_dir First directory entry on page
1004 @param empty_space Total empty space in buffer. It's updated
1005 to reflect the new empty space
1006 @param first_pos Store position to last data byte on page here
1007 @param head_page 1 if head page, 0 for tail page.
1008
1009 @note
1010 This function is inline as the argument passing is the biggest
1011 part of the function
1012
1013 @return
1014 @retval 0 ok
1015 @retval 1 error (No data on page, fatal error)
1016 */
1017
static inline my_bool
make_space_for_directory(MARIA_HA *info,
                         uchar *buff, uint max_entry,
                         uint count, uchar *first_dir, uint *empty_space,
                         uint *first_pos,
                         my_bool head_page)
{
  uint length_needed= DIR_ENTRY_SIZE * count;
  MARIA_SHARE *share= info->s;

  /*
    The following is not true only in the case where an UNDO is used to
    reinsert a row on a previously not used page
  */
  if (likely(max_entry))
  {
    /* Check if there is place for the directory entry on the page */
    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);

    /* Does the last row (offset + length) overlap the new directory area? */
    if ((uint) (first_dir - buff) < *first_pos + length_needed)
    {
      /* Create place for directory by compacting all rows upwards */
      _ma_compact_block_page(share,
                             buff, max_entry - 1, 0,
                             head_page ? info->trn->min_read_from : 0,
                             head_page ? share->base.min_block_length : 0);
      /* Re-read end of data and empty space; compaction changed both */
      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (*empty_space < length_needed)
      {
        /*
          We should always have space, as we only come here for
          UNDO of DELETE (in which case we know the row was on the
          page before) or if the bitmap told us there was space on page
        */
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        return(1);
      }
    }
  }
  else
    *first_pos= PAGE_HEADER_SIZE(share);        /* Empty page; data starts after header */

  /* Reduce directory entry size from free space size */
  (*empty_space)-= length_needed;
  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
  return(0);
}
1066
1067
1068 /*
1069 Find free position in directory
1070
1071 SYNOPSIS
1072 find_free_position()
1073 info Handler
1074 buff Page
1075 block_size Size of page
1076 res_rownr Store index to free position here
1077 res_length Store length of found segment here
1078 empty_space Store length of empty space on disk here. This is
1079 all empty space, including the found block.
1080 @param head_page 1 if head page, 0 for tail page.
1081
1082 NOTES
1083 If there is a free directory entry (entry with position == 0),
1084 then use it and change it to be the size of the empty block
1085 after the previous entry. This guarantees that all row entries
1086 are stored on disk in inverse directory order, which makes life easier for
1087 '_ma_compact_block_page()' and to know if there is free space after any
1088 block.
1089
1090 If there is no free entry (entry with position == 0), then we create
1091 a new one. If there is not space for the directory entry (because
  the last block overlaps with the directory), we compact the page.
1093
1094 We will update the offset and the length of the found dir entry to
1095 match the position and empty space found.
1096
1097 buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
1098
  See start of file for description of how free directory entries are linked
1100
1101 RETURN
1102 0 Error (directory full or last block goes over directory)
1103 # Pointer to directory entry on page
1104 */
1105
static uchar *find_free_position(MARIA_HA *info,
                                 uchar *buff, uint block_size, uint *res_rownr,
                                 uint *res_length, uint *empty_space,
                                 my_bool head_page)
{
  uint max_entry, free_entry;
  uint length, first_pos;
  uchar *dir, *first_dir;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("find_free_position");

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  free_entry= (uint) buff[DIR_FREE_OFFSET];
  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

  DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry));

  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);

  /* Search after first free position */
  if (free_entry != END_OF_DIR_FREE_LIST)
  {
    if (free_entry >= max_entry)
      DBUG_RETURN(0);                           /* Consistency error */
    dir= dir_entry_pos(buff, block_size, free_entry);
    /* A free entry has offset 0 and must be the head of the free list */
    DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
    /* Relink free list: pop this entry and fix the next entry's backlink */
    if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
    {
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT((uint) next_entry[2] == free_entry &&
                  uint2korr(next_entry) == 0);
      next_entry[2]= END_OF_DIR_FREE_LIST;      /* Backlink */
    }

    /* The reused entry covers all empty space between its neighbours */
    first_pos= end_of_previous_entry(share,
                                     dir, buff + block_size -
                                     PAGE_SUFFIX_SIZE);
    length= start_of_next_entry(dir) - first_pos;
    int2store(dir, first_pos);                /* Update dir entry */
    int2store(dir + 2, 0);                    /* Length 0 until data is written */
    *res_rownr= free_entry;
    *res_length= length;

    check_directory(share, buff, block_size,
                    head_page ? share->base.min_block_length : 0, (uint) -1);
    DBUG_RETURN(dir);
  }
  /* No free places in dir; create a new one */

  /* Check if there is place for the directory entry */
  if (max_entry == MAX_ROWS_PER_PAGE)
    DBUG_RETURN(0);

  if (make_space_for_directory(info, buff, max_entry, 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(0);

  /* New entry goes directly below the previous last entry */
  dir= first_dir - DIR_ENTRY_SIZE;
  /* The new row may use everything from end of data to the directory */
  length= (uint) (dir - buff - first_pos);
  DBUG_ASSERT(length <= *empty_space);
  int2store(dir, first_pos);
  int2store(dir + 2, 0);                      /* Max length of region */
  *res_rownr= max_entry;
  *res_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space);
  DBUG_RETURN(dir);
}
1178
1179
1180 /**
1181 @brief Enlarge page directory to hold more entries
1182
1183 @fn extend_directory()
1184 @param info Handler
1185 @param buff Page buffer
1186 @param block_size Block size
1187 @param max_entry Number of directory entries on page
1188 @param new_entry Position for new entry
1189 @param empty_space Total empty space in buffer. It's updated
1190 to reflect the new empty space
1191 @param head_page 1 if head page, 0 for tail page.
1192
1193 @note
1194 This is only called on UNDO when we want to expand the directory
1195 to be able to re-insert row in a given position
1196
1197 The new directory entry will be set to cover the maximum possible space
1198
1199 @return
1200 @retval 0 ok
1201 @retval 1 error (No data on page, fatal error)
1202 */
1203
static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
                                uint max_entry, uint new_entry,
                                uint *empty_space, my_bool head_page)
{
  uint length, first_pos;
  uchar *dir, *first_dir;
  DBUG_ENTER("extend_directory");

  /*
    Note that if max_entry is 0, then first_dir will point to
    an illegal directory entry. This is ok, as in this case we will
    not access anything through first_dir.
  */
  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;

  /* Grow directory by (new_entry - max_entry + 1) entries */
  if (make_space_for_directory(info, buff, max_entry,
                               new_entry - max_entry + 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(1);

  /* Set the new directory entry to cover the max possible length */
  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
  length= (uint) (dir - buff - first_pos);
  int2store(dir, first_pos);
  int2store(dir+2, length);
  *empty_space-= length;

  if (new_entry-- > max_entry)
  {
    /* Link all row entries between new_entry and max_entry into free list */
    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
    uint prev_entry= END_OF_DIR_FREE_LIST;
    buff[DIR_FREE_OFFSET]= new_entry;
    do
    {
      dir+= DIR_ENTRY_SIZE;
      dir[0]= dir[1]= 0;                /* Offset 0 marks the entry as free */
      dir[2]= (uchar) prev_entry;       /* Backlink to previous free entry */
      /* NOTE: cast binds to new_entry before the -1; ok as entries < 256 */
      dir[3]= (uchar) new_entry-1;      /* Forward link to next free entry */
      prev_entry= new_entry;
    } while (new_entry-- > max_entry);
    /* Last created entry points at the old head of the free list */
    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= max_entry;
    }
  }

  check_directory(info->s,
                  buff, block_size,
                  head_page ? MY_MIN(info->s->base.min_block_length, length) :
                  0, *empty_space);
  DBUG_RETURN(0);
}
1261
1262
1263 /****************************************************************************
1264 Updating records
1265 ****************************************************************************/
1266
1267 /*
1268 Calculate length of all the different field parts
1269
1270 SYNOPSIS
1271 calc_record_size()
1272 info Maria handler
1273 record Row to store
1274 row Store statistics about row here
1275
1276 NOTES
1277 The statistics is used to find out how much space a row will need
1278 and also where we can split a row when we need to split it into several
1279 extents.
1280 */
1281
static void calc_record_size(MARIA_HA *info, const uchar *record,
                             MARIA_ROW *row)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data;
  MARIA_COLUMNDEF *column, *end_column;
  uint *null_field_lengths= row->null_field_lengths;
  ulong *blob_lengths= row->blob_lengths;
  DBUG_ENTER("calc_record_size");

  row->normal_length= row->char_length= row->varchar_length=
    row->blob_length= row->extents_count= 0;

  /* Create empty bitmap and calculate length of each varlength/char field */
  bzero(row->empty_bits, share->base.pack_bytes);
  field_length_data= row->field_lengths;
  /* Fixed not-null fields need no per-field statistics; skip over them */
  for (column= share->columndef + share->base.fixed_not_null_fields,
       end_column= share->columndef + share->base.fields;
       column < end_column; column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit))
    {
      /* NULL field takes no data space; blobs still consume a length slot */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
      DBUG_ASSERT(column->empty_bit == 0);
      /* fall through */
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      row->normal_length+= column->length;
      *null_field_lengths= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      if (memcmp(record+ column->offset, maria_zero_string,
                 column->length) == 0)
      {
        /* All-zero value is stored as only an empty bit */
        row->empty_bits[column->empty_pos] |= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        row->normal_length+= column->length;
        *null_field_lengths= column->length;
      }
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      const uchar *pos, *end;
      /* Strip trailing spaces; only the remainder is stored */
      for (pos= record + column->offset, end= pos + column->length;
           end > pos && end[-1] == ' '; end--)
        ;
      if (pos == end)                           /* If empty string */
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        uint length= (uint) (end - pos);
        /* Stripped length stored in 1 or 2 bytes depending on max length */
        if (column->length <= 255)
          *field_length_data++= (uchar) length;
        else
        {
          int2store(field_length_data, length);
          field_length_data+= 2;
        }
        row->char_length+= length;
        *null_field_lengths= length;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length, field_length_data_length;
      const uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      field_length_data[0]= field_pos[0];
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_pos;
        field_length_data_length= 1;
      }
      else
      {
        length= uint2korr(field_pos);
        field_length_data[1]= field_pos[1];
        field_length_data_length= 2;
      }
      *null_field_lengths= length;
      if (!length)
      {
        /* Empty value: only the empty bit is stored, no length bytes */
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        break;
      }
      row->varchar_length+= length;
      /* NOTE(review): redundant; *null_field_lengths was set above */
      *null_field_lengths= length;
      field_length_data+= field_length_data_length;
      break;
    }
    case FIELD_BLOB:
    {
      const uchar *field_pos= record + column->offset;
      /* Bytes used for the stored blob length (pointer part excluded) */
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

      *blob_lengths++= blob_length;
      if (!blob_length)
        row->empty_bits[column->empty_pos]|= column->empty_bit;
      else
      {
        row->blob_length+= blob_length;
        /* Save only the length bytes; blob data goes to blob pages */
        memcpy(field_length_data, field_pos, size_length);
        field_length_data+= size_length;
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
  /*
    - info->row_base_length is base information we must have on a page in first
      extent:
      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
        table_checksum (0 | 1)
    - row->min_length is minimum amount of data we must store on
      a page. bitmap code will ensure we get at least this much +
      total number of extents and one extent information
    - fixed_not_null_fields_length is length of fixed length fields that can't
      be compacted
    - head_length is the amount of data for the head page
      (ie, all fields except blobs)
  */
  row->min_length= (info->row_base_length +
                    (share->base.max_field_lengths ?
                     size_to_store_key_length(row->field_lengths_length) :
                     0));
  row->head_length= (row->min_length +
                     share->base.fixed_not_null_fields_length +
                     row->field_lengths_length +
                     row->normal_length +
                     row->char_length + row->varchar_length);
  row->total_length= (row->head_length + row->blob_length);
  if (row->total_length < share->base.min_block_length)
    row->total_length= share->base.min_block_length;
  DBUG_PRINT("exit", ("head_length: %lu total_length: %lu",
                      (ulong) row->head_length, (ulong) row->total_length));
  DBUG_VOID_RETURN;
}
1439
1440
1441 /**
1442 Compact page by removing all space between rows
1443
1444 Moves up all rows to start of page. Moves blocks that are directly after
1445 each other with one memmove.
1446
1447 @note if rownr is the last row in the page, and extend_block is false,
1448 caller has to make sure to update bitmap page afterwards to reflect freed
1449 space.
1450
1451 @param buff Page to compact
1452 @param block_size Size of page
1453 @param rownr Put empty data after this row
1454 @param extend_block If 1, extend the block at 'rownr' to cover the
1455 whole block.
1456 @param min_read_from If <> 0, remove all trid's that are less than this
1457 */
1458
void _ma_compact_block_page(MARIA_SHARE *share,
                            uchar *buff, uint rownr,
                            my_bool extend_block, TrID min_read_from,
                            uint min_row_length)
{
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
  uint freed_size= 0;
  uint block_size= share->block_size;
  uchar *dir, *end;
  DBUG_ENTER("_ma_compact_block_page");
  DBUG_PRINT("enter", ("rownr: %u min_read_from: %lu", rownr,
                       (ulong) min_read_from));
  DBUG_ASSERT(max_entry > 0 &&
              max_entry < (block_size - PAGE_HEADER_SIZE(share) -
                           PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);

  /* Move all entries before and including rownr up to start of page */
  dir= dir_entry_pos(buff, block_size, rownr);
  end= dir_entry_pos(buff, block_size, 0);
  page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share);
  diff= 0;
  /* First pass: walk entries 0..rownr (directory grows downwards in memory) */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);

    if (offset)                                 /* Skip free entries (offset 0) */
    {
      uint row_length= uint2korr(end + 2);
      DBUG_ASSERT(offset >= page_pos);
      DBUG_ASSERT(buff + offset + row_length <= dir);
      DBUG_ASSERT(row_length >= min_row_length || row_length == 0);

      /* Row length can be zero if row is to be deleted */
      if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff+offset+1);
        /* A transid no transaction can still see may be stripped from row */
        if (transid < min_read_from)
        {
          /* Remove transid from row by moving the start point of the row up */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          freed_size+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(end+2, row_length);
        }
      }

      if (offset != next_free_pos)
      {
        uint length= (next_free_pos - start_of_found_block);
        /*
          There was empty space before this and prev block
          Check if we have to move previous block up to page start
        */
        if (page_pos != start_of_found_block)
        {
          /* move up previous block */
          memmove(buff + page_pos, buff + start_of_found_block, length);
        }
        page_pos+= length;
        /* next continuous block starts here */
        start_of_found_block= offset;
        diff= offset - page_pos;
      }
      int2store(end, offset - diff);            /* correct current pos */
      next_free_pos= offset + row_length;

      if (unlikely(row_length < min_row_length) && row_length)
      {
        /*
          This can only happen in the case we compacted transid and
          the row become 'too short'

          Move the current row down to it's right place and extend it
          with 0.
        */
        uint row_diff= min_row_length - row_length;
        uint length= (next_free_pos - start_of_found_block);

        DBUG_ASSERT(page_pos != start_of_found_block);
        bmove(buff + page_pos, buff + start_of_found_block, length);
        bzero(buff+ page_pos + length, row_diff);
        page_pos+= min_row_length;
        int2store(end+2, min_row_length);
        freed_size-= row_diff;                  /* Padding eats into the gain */
        next_free_pos= start_of_found_block= page_pos;
        diff= 0;
      }
    }
  }
  /* Flush the last pending run of rows up to the start of the page */
  if (page_pos != start_of_found_block)
  {
    uint length= (next_free_pos - start_of_found_block);
    memmove(buff + page_pos, buff + start_of_found_block, length);
  }
  start_of_found_block= uint2korr(dir);

  if (rownr != max_entry - 1)
  {
    /* Move all entries after rownr to end of page */
    uint rownr_length;

    DBUG_ASSERT(extend_block);                  /* Should always be true */
    next_free_pos= end_of_found_block= page_pos=
      block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
    diff= 0;
    /* End points to entry before 'rownr' */
    /* Second pass: walk entries max_entry-1..rownr+1, pushing rows down */
    for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
    {
      uint offset= uint2korr(dir);
      uint row_length;
      uint row_end;
      if (!offset)                              /* Free entry; nothing to move */
        continue;
      row_length= uint2korr(dir + 2);
      row_end= offset + row_length;
      DBUG_ASSERT(offset >= start_of_found_block &&
                  row_end <= next_free_pos && row_length >= min_row_length);

      if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff + offset+1);
        if (transid < min_read_from)
        {
          /* Remove transid from row */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(dir+2, row_length);
        }
        if (unlikely(row_length < min_row_length))
        {
          /*
            This can only happen in the case we compacted transid and
            the row become 'too short'
          */
          uint row_diff= min_row_length - row_length;
          if (next_free_pos < row_end + row_diff)
          {
            /*
              Not enough space for extending next block with enough
              end 0's. Move current data down to get place for them
            */
            uint move_down= row_diff - (next_free_pos - row_end);
            bmove(buff + offset - move_down, buff + offset, row_length);
            offset-= move_down;
          }
          /*
            Extend the next block with 0, which will be part of current
            row when the blocks are joined together later
          */
          bzero(buff + next_free_pos - row_diff, row_diff);
          next_free_pos-= row_diff;
          int2store(dir+2, min_row_length);
        }
        row_end= offset + row_length;
      }

      if (row_end != next_free_pos)
      {
        uint length= (end_of_found_block - next_free_pos);
        if (page_pos != end_of_found_block)
        {
          /* move next block down */
          memmove(buff + page_pos - length, buff + next_free_pos, length);
        }
        page_pos-= length;
        /* next continuous block starts here */
        end_of_found_block= row_end;
        diff= page_pos - row_end;
      }
      int2store(dir, offset + diff);            /* correct current pos */
      next_free_pos= offset;
    }
    /* Flush the last pending run of rows down to the end of the page */
    if (page_pos != end_of_found_block)
    {
      uint length= (end_of_found_block - next_free_pos);
      memmove(buff + page_pos - length, buff + next_free_pos, length);
      next_free_pos= page_pos- length;
    }

    /* Extend rownr block to cover hole */
    rownr_length= next_free_pos - start_of_found_block;
    int2store(dir+2, rownr_length);
    DBUG_ASSERT(rownr_length >= min_row_length);
  }
  else
  {
    if (extend_block)
    {
      /* Extend last block to cover whole page */
      uint length= ((uint) (dir - buff) - start_of_found_block);
      int2store(dir+2, length);
      DBUG_ASSERT(length >= min_row_length);
    }
    else
    {
      /* Add length gained from freed transaction id's to this page */
      uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
      int2store(buff + EMPTY_SPACE_OFFSET, length);
    }
    /* Page is now compact; mark that another compact would be pointless */
    buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
  }
  check_directory(share, buff, block_size, min_row_length,
                  extend_block ? 0 : (uint) -1);
  DBUG_EXECUTE("directory", _ma_print_directory(share,
                                                DBUG_FILE, buff, block_size););
  DBUG_VOID_RETURN;
}
1669
1670
1671 /*
1672 Create an empty tail or head page
1673
1674 SYNOPSIS
1675 make_empty_page()
1676 buff Page buffer
1677 block_size Block size
1678 page_type HEAD_PAGE or TAIL_PAGE
1679 create_dir_entry TRUE of we should create a directory entry
1680
1681 NOTES
1682 EMPTY_SPACE is not updated
1683 */
1684
static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
                            my_bool create_dir_entry)
{
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint header_size= PAGE_HEADER_SIZE(share);
  DBUG_ENTER("make_empty_page");

  /* Clear page header (LSN etc.) */
  bzero(buff, header_size);

#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
  /*
    Zero the page body as well, so stale memory never reaches disk and
    archived files compress better. The code does not assume the block
    is zeroed.
  */
  if (page_type != BLOB_PAGE)
    bzero(buff + header_size, block_size - header_size);
#endif

  buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
  buff[DIR_COUNT_OFFSET]= (uchar) create_dir_entry;
  buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;

  if (create_dir_entry)
  {
    /* One directory entry pointing at start of page, with size 0 */
    uchar *dir= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
    int2store(dir, header_size);
    int2store(dir + 2, 0);
  }
  DBUG_VOID_RETURN;
}
1715
1716
1717 /*
1718 Read or initialize new head or tail page
1719
1720 SYNOPSIS
1721 get_head_or_tail_page()
1722 info Maria handler
1723 block Block to read
1724 buff Suggest this buffer to key cache
1725 length Minimum space needed
1726 page_type HEAD_PAGE || TAIL_PAGE
1727 res Store result position here
1728
1729 NOTES
    We don't decrement buff[EMPTY_SPACE_OFFSET] with the allocated data
1731 as we don't know how much data the caller will actually use.
1732
1733 res->empty_space is set to length of empty space
1734
1735 RETURN
1736 0 ok All slots in 'res' are updated
1737 1 error my_errno is set
1738 */
1739
/* Result of locating/allocating room for a row on a head or tail page */
struct st_row_pos_info
{
  uchar *buff;                          /* page buffer */
  uchar *data;                          /* Place for data */
  uchar *dir;                           /* Directory entry for the row */
  uint length;                          /* Length available for data */
  uint rownr;                           /* Offset in directory */
  uint empty_space;                     /* Space left on page */
};
1749
1750
static my_bool get_head_or_tail_page(MARIA_HA *info,
                                     const MARIA_BITMAP_BLOCK *block,
                                     uchar *buff, uint length, uint page_type,
                                     enum pagecache_page_lock lock,
                                     struct st_row_pos_info *res)
{
  uint block_size;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("get_head_or_tail_page");
  DBUG_PRINT("enter", ("page_type: %u length: %u", page_type, length));

  block_size= share->block_size;
  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; initialize in the caller-supplied buffer */
    make_empty_page(info, buff, page_type, 1);
    res->buff= buff;
    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share));
    res->data= (buff + PAGE_HEADER_SIZE(share));
    res->dir= res->data + res->length;
    res->rownr= 0;
    DBUG_ASSERT(length <= res->length);
  }
  else
  {
    uchar *dir;
    /* Read old page */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    res->buff= pagecache_read(share->pagecache, &info->dfile,
                              block->page, 0, 0, share->page_type,
                              lock, &page_link.link);
    /* Record pin state even on read failure so the page gets unpinned */
    page_link.changed= res->buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)
      goto crashed;

    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                page_type);
    if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr,
                                  &res->length, &res->empty_space,
                                  page_type == HEAD_PAGE)))
      goto crashed;

    if (res->length < length)
    {
      /* Found slot too small; compact if total free space is enough */
      if (res->empty_space + res->length >= length)
      {
        _ma_compact_block_page(share,
                               res->buff, res->rownr, 1,
                               (page_type == HEAD_PAGE ?
                                info->trn->min_read_from : 0),
                               (page_type == HEAD_PAGE ?
                                share->base.min_block_length :
                                0));
        /* All empty space are now after current position */
        dir= dir_entry_pos(res->buff, block_size, res->rownr);
        res->length= res->empty_space= uint2korr(dir+2);
      }
      if (res->length < length)
      {
        DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u",
                             length, res->length, res->empty_space));
        goto crashed;                           /* Wrong bitmap information */
      }
    }
    res->dir= dir;
    res->data= res->buff + uint2korr(dir);
  }
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); /* File crashed */
  DBUG_RETURN(1);
}
1827
1828
1829 /*
1830 @brief Create room for a head or tail row on a given page at given position
1831
1832 @fn get_rowpos_in_head_or_tail_page()
1833 @param info Maria handler
1834 @param block Block to read
1835 @param buff Suggest this buffer to key cache
1836 @param length Minimum space needed
1837 @param page_type HEAD_PAGE || TAIL_PAGE
1838 @param rownr Rownr to use
1839 @param res Store result position here
1840
1841 @note
    This is essentially the same as get_head_or_tail_page, with the difference
    that the caller specifies at what position the row should be put.
    This is used when restoring a row to its original position as
    part of UNDO DELETE or UNDO UPDATE
1846
1847 @return
1848 @retval 0 ok All slots in 'res' are updated
1849 @retval 1 error my_errno is set
1850 */
1851
static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
                                               const MARIA_BITMAP_BLOCK *block,
                                               uchar *buff, uint length,
                                               uint page_type,
                                               enum pagecache_page_lock lock,
                                               uint rownr,
                                               struct st_row_pos_info *res)
{
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *dir;
  uint block_size= share->block_size;
  uint max_entry, max_length, rec_offset;
  DBUG_ENTER("get_rowpos_in_head_or_tail_page");

  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; no dir entry yet, extend_directory() creates it below */
    make_empty_page(info, buff, page_type, 0);
    res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
  }
  else
  {
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         block->page, 0, 0, share->page_type,
                         lock, &page_link.link);
    /* Record pin state even on read failure so the page gets unpinned */
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)                     /* Read error */
      goto err;
    /* Assert in debug builds; treated as corruption in release builds */
    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                (uchar) page_type);
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
      goto err;
    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  }

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  if (max_entry <= rownr)
  {
    /* Directory must grow to include the requested row position */
    if (extend_directory(info, buff, block_size,
                         max_entry, rownr, &res->empty_space,
                         page_type == HEAD_PAGE))
      goto err;
  }

  /*
    The following dir entry is unused in case of insert / update but
    not in case of undo_update / undo_delete
  */
  dir= dir_entry_pos(buff, block_size, rownr);

  if (extend_area_on_page(info, buff, dir, rownr, length,
                          &res->empty_space, &rec_offset, &max_length,
                          page_type == HEAD_PAGE))
    goto err;

  res->buff= buff;
  res->rownr= rownr;
  res->dir= dir;
  res->data= buff + rec_offset;
  res->length= length;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD); /* File crashed */
  DBUG_RETURN(1);
}
1922
1923
1924 /*
1925 Write tail for head data or blob
1926
1927 SYNOPSIS
1928 write_tail()
1929 info Maria handler
1930 block Block to tail page
1931 row_part Data to write to page
1932 length Length of data
1933
1934 NOTES
1935 block->page_count is updated to the directory offset for the tail
1936 so that we can store the position in the row extent information
1937
1938 RETURN
1939 0 ok
1940 block->page_count is set to point (dir entry + TAIL_BIT)
1941
1942 1 error; In this case my_errno is set to the error
1943 */
1944
static my_bool write_tail(MARIA_HA *info,
                          MARIA_BITMAP_BLOCK *block,
                          uchar *row_part, uint org_length)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  uint block_size= share->block_size, empty_space, length= org_length;
  struct st_row_pos_info row_pos;
  my_off_t position;
  my_bool res, block_is_read;
  DBUG_ENTER("write_tail");
  DBUG_PRINT("enter", ("page: %lu length: %u",
                       (ulong) block->page, length));

  info->keyread_buff_used= 1;
  /*
    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
    some place to grow in the future)
  */
  if (length < MIN_TAIL_SIZE)
    length= MIN_TAIL_SIZE;

  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
  {
    /*
      Create new tail
      page will be pinned & locked by get_head_or_tail_page
    */
    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
                              &row_pos))
      DBUG_RETURN(1);
  }
  else
  {
    /* Write tail on predefined row position */
    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
                                        length, TAIL_PAGE,
                                        PAGECACHE_LOCK_WRITE,
                                        block->page_count & ~TAIL_BIT,
                                        &row_pos))
      DBUG_RETURN(1);
  }
  DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
                      (ulong) ma_recordpos(block->page, row_pos.rownr),
                      (ulong) block->page, row_pos.rownr));

  /* Page already existed on disk if the original bitmap value was != 0 */
  block_is_read= block->org_bitmap_value != 0;

  /* Copy the tail data onto the page at the reserved position */
  memcpy(row_pos.data, row_part, org_length);

  if (share->now_transactional)
  {
    /* Log changes in tail block */
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
    LSN lsn;

    /*
      Log REDO changes of tail page
      Note that we have to log length, not org_length, to be sure that
      REDO, which doesn't use write_tail, also creates a block of at least
      MIN_TAIL_SIZE
    */
    page_store(log_data + FILEID_STORE_SIZE, block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos.rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
    if (translog_write_record(&lsn,
                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
                               LOGREC_REDO_NEW_ROW_TAIL),
                              info->trn, info,
                              (translog_size_t) (sizeof(log_data) + length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      DBUG_RETURN(1);
  }

  /* Update directory entry length and the page's free-space counter */
  int2store(row_pos.dir + 2, length);
  empty_space= row_pos.empty_space - length;
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
  /* Return the directory position (+ TAIL_BIT) for the row extent info */
  block->page_count= row_pos.rownr + TAIL_BIT;
  /*
    If there is less directory entries free than number of possible tails
    we can write for a row, we mark the page full to ensure that we don't
    during _ma_bitmap_find_place() allocate more entries on the tail page
    than it can hold
  */
  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
                                           1 + share->base.blobs) ?
                       empty_space : 0);
  /* Keep BLOCKUSED_USE_ORG_BITMAP */
  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;

  if (block_is_read)
  {
    /* Current page link is last element in pinned_pages */
    MARIA_PINNED_PAGE *page_link;
    page_link= dynamic_element(&info->pinned_pages,
                               info->pinned_pages.elements-1,
                               MARIA_PINNED_PAGE*);
    /* Downgrade write lock to read lock; keep the page pinned */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    DBUG_ASSERT(page_link->changed);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    res= 0;
  }
  else
  {
    /* New page: write it into the page cache, read-locked and pinned */
    if (!(res= pagecache_write(share->pagecache,
                               &info->dfile, block->page, 0,
                               row_pos.buff,share->page_type,
                               PAGECACHE_LOCK_READ,
                               PAGECACHE_PIN,
                               PAGECACHE_WRITE_DELAY, &page_link.link,
                               LSN_IMPOSSIBLE)))
    {
      DBUG_ASSERT(page_link.link);
      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
      page_link.changed= 1;
      push_dynamic(&info->pinned_pages, (void*) &page_link);
    }

    /* Increase data file size, if extended */
    position= (my_off_t) block->page * block_size;
    if (share->state.state.data_file_length <= position)
    {
      /*
        We are modifying a state member before writing the UNDO; this is a WAL
        violation. But for data_file_length this is ok, as long as we change
        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
        collect_tables()).
      */
      _ma_set_share_data_file_length(share, position + block_size);
    }
  }
  DBUG_RETURN(res);
}
2088
2089
2090 /*
2091 Write full pages
2092
2093 SYNOPSIS
2094 write_full_pages()
2095 info Maria handler
2096 lsn LSN for the undo record
2097 block Where to write data
2098 data Data to write
2099 length Length of data
2100
2101 NOTES
2102 Logging of the changes to the full pages are done in the caller
2103 write_block_record().
2104
2105 RETURN
2106 0 ok
2107 1 error on write
2108 */
2109
static my_bool write_full_pages(MARIA_HA *info,
                                LSN lsn,
                                MARIA_BITMAP_BLOCK *block,
                                uchar *data, ulong length)
{
  pgcache_page_no_t page;
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint data_size= FULL_PAGE_SIZE(share);
  uchar *buff= info->keyread_buff;
  uint page_count, sub_blocks;
  my_off_t position, max_position;
  DBUG_ENTER("write_full_pages");
  DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu",
                       (ulong) length, (ulong) block->page,
                       (ulong) block->page_count));
  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);

  info->keyread_buff_used= 1;
  page= block->page;
  page_count= block->page_count;
  sub_blocks= block->sub_blocks;

  /* Highest file offset we will have written to; used to extend the file */
  max_position= (my_off_t) (page + page_count) * block_size;

  /* Write the data as a sequence of full BLOB pages */
  for (; length; data+= data_size)
  {
    uint copy_length;
    if (!page_count--)
    {
      /* Current extent exhausted; move on to the next one */
      if (!--sub_blocks)
      {
        /* More data left than allocated pages: table is corrupted */
        _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
        DBUG_RETURN(1);
      }

      block++;
      page= block->page;
      page_count= block->page_count - 1;
      DBUG_PRINT("info", ("page: %lu page_count: %lu",
                          (ulong) block->page, (ulong) block->page_count));

      position= (page + page_count + 1) * block_size;
      set_if_bigger(max_position, position);
    }
    /* Build the page header: LSN + page type; rest of header zeroed */
    lsn_store(buff, lsn);
    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
    bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
          FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));
    copy_length= MY_MIN(data_size, length);
    memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length);
    length-= copy_length;

    /*
      Zero out old information from the block. This removes possible
      sensitive information from the block and also makes the file
      easier to compress and easier to compare after recovery.
    */
    if (copy_length != data_size)
      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
            (data_size - copy_length) + PAGE_SUFFIX_SIZE);

    if (pagecache_write(share->pagecache,
                        &info->dfile, page, 0,
                        buff, share->page_type,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        PAGECACHE_PIN_LEFT_UNPINNED,
                        PAGECACHE_WRITE_DELAY,
                        0, info->trn->rec_lsn))
      DBUG_RETURN(1);
    page++;
    DBUG_ASSERT(block->used & BLOCKUSED_USED);
  }
  /* Increase data file size, if extended */
  if (share->state.state.data_file_length < max_position)
    _ma_set_share_data_file_length(share, max_position);
  DBUG_RETURN(0);
}
2189
2190
2191 /*
2192 Store ranges of full pages in compact format for logging
2193
2194 SYNOPSIS
2195 store_page_range()
2196 to Store data here
2197 block Where pages are to be written
2198 length Length of data to be written
2199 Normally this is full pages, except for the last
2200 tail block that may only partly fit the last page.
2201 tot_ranges Add here the number of ranges used
2202
2203 NOTES
2204 The format of one entry is:
2205
2206 Ranges SUB_RANGE_SIZE
2207 Empty bytes at end of last byte BLOCK_FILLER_SIZE
2208 For each range
2209 Page number PAGE_STORE_SIZE
2210 Number of pages PAGERANGE_STORE_SIZE
2211
2212 RETURN
2213 # end position for 'to'
2214 */
2215
store_page_range(MARIA_SHARE * share,uchar * to,MARIA_BITMAP_BLOCK * block,ulong length,uint * tot_ranges)2216 static uchar *store_page_range(MARIA_SHARE *share,
2217 uchar *to, MARIA_BITMAP_BLOCK *block,
2218 ulong length,
2219 uint *tot_ranges)
2220 {
2221 uint data_size= FULL_PAGE_SIZE(share);
2222 ulong pages_left= (length + data_size -1) / data_size;
2223 uint page_count, ranges, empty_space;
2224 uchar *to_start;
2225 DBUG_ENTER("store_page_range");
2226
2227 to_start= to;
2228 to+= SUB_RANGE_SIZE;
2229
2230 /* Store number of unused bytes at last page */
2231 empty_space= (uint) (pages_left * data_size - length);
2232 int2store(to, empty_space);
2233 to+= BLOCK_FILLER_SIZE;
2234
2235 ranges= 0;
2236 do
2237 {
2238 pgcache_page_no_t page;
2239 page= block->page;
2240 page_count= block->page_count;
2241 block++;
2242 if (page_count > pages_left)
2243 page_count= pages_left;
2244
2245 page_store(to, page);
2246 to+= PAGE_STORE_SIZE;
2247 pagerange_store(to, page_count);
2248 to+= PAGERANGE_STORE_SIZE;
2249 ranges++;
2250 } while ((pages_left-= page_count));
2251 /* Store number of ranges for this block */
2252 int2store(to_start, ranges);
2253 (*tot_ranges)+= ranges;
2254
2255 DBUG_RETURN(to);
2256 }
2257
2258
2259 /*
2260 Store packed extent data
2261
2262 SYNOPSIS
2263 store_extent_info()
2264 to Store first packed data here
2265 row_extents_second_part Store rest here
2266 first_block First block to store
2267 count Number of blocks
2268
2269 NOTES
2270 We don't have to store the position for the head block
2271
2272 We have to set the START_EXTENT_BIT for every extent where the
  blob will be stored on a page of its own. We need this in the
2274 UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
2275 undo-update.
2276 */
2277
static void store_extent_info(uchar *to,
                              uchar *row_extents_second_part,
                              MARIA_BITMAP_BLOCK *first_block,
                              uint count)
{
  MARIA_BITMAP_BLOCK *block;
  MARIA_BITMAP_BLOCK *last_block= first_block + count;
  my_bool stored_first= 0;
  uint copy_length;
  DBUG_ENTER("store_extent_info");
  DBUG_PRINT("enter", ("count: %u", count));

  for (block= first_block; block < last_block; block++)
  {
    uint page_count;
    /* Only marker (unused) blocks lack BLOCKUSED_USED */
    if (unlikely(!(block->used & BLOCKUSED_USED)))
      continue;

    page_count= block->page_count;
    DBUG_ASSERT(page_count != 0);
    page_store(to, block->page);
    if (block->sub_blocks)
    {
      /*
        Set a bit so that we later know that this was the first block
        for a blob
      */
      page_count|= START_EXTENT_BIT;
    }
    pagerange_store(to + PAGE_STORE_SIZE, page_count);
    DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
    to+= ROW_EXTENT_SIZE;
    if (!stored_first)
    {
      /* First extent went into the row header; rest go elsewhere */
      stored_first= 1;
      to= row_extents_second_part;
    }
  }
  copy_length= (count - 1) * ROW_EXTENT_SIZE;
  /*
    In some unlikely cases we have allocated too many blocks. Clear the
    unused trailing extent entries.
  */
  bzero(to, (size_t) (row_extents_second_part + copy_length - to));
  DBUG_VOID_RETURN;
}
2324
2325
2326 /**
2327 @brief
2328 Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
2329 for write_block_record
2330
2331 @note
2332 In case of blobs, this function marks all the blob pages in the bitmap
2333 as full pages. The bitmap bits for other pages will be marked
2334 when write_block_record() calls _ma_bitmap_release_unused().
2335
2336 This function will be removed in Maria 2.0 when we instead of delete rows
2337 mark them as deleted and only remove them after commit.
2338
2339 @return
2340 @retval 0 ok
2341 @retval 1 Error (out of memory or disk error changing bitmap) or
2342 wrong information in extent information
2343 */
2344
static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
                                       MARIA_BITMAP_BLOCKS *blocks,
                                       pgcache_page_no_t head_page,
                                       uint extent_count,
                                       const uchar *extent_info)
{
  MARIA_BITMAP_BLOCK *block, *start_block;
  MARIA_SHARE *share= info->s;
  uint i, tail_page;
  DBUG_ENTER("extent_to_bitmap_blocks");

  /* Room for head block + all extents (+2 leaves one extra entry) */
  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
    DBUG_RETURN(1);
  block= blocks->block= dynamic_element(&info->bitmap_blocks, 0,
                                        MARIA_BITMAP_BLOCK*);
  blocks->count= extent_count + 1;
  blocks->tail_page_skipped= blocks->page_skipped= 0;
  /* First block describes the head page itself */
  block->page= head_page;
  block->page_count= 1;
  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
  /* Impossible value, will force storage of real value */
  block->org_bitmap_value= 255;

  start_block= block++;
  for (i=0 ;
       i++ < extent_count ;
       block++, extent_info+= ROW_EXTENT_SIZE)
  {
    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
    if (page_count & START_EXTENT_BIT)
    {
      /* This extent starts a new blob; close the previous group */
      page_count&= ~START_EXTENT_BIT;
      start_block->sub_blocks= (uint) (block - start_block);
      start_block= block;
    }
    block->page= page_korr(extent_info);
    block->page_count= page_count;
    block->sub_blocks= 0;
    if (block->page_count == 0)
    {
      /* Extend allocated but not used by write_block_record() */
      DBUG_ASSERT(block->page == 0);
      /* This is the last block */
      blocks->count= i;
      break;
    }
    /* A tail extent occupies exactly one page */
    if ((tail_page= page_count & TAIL_BIT))
      page_count= 1;

    /* Check if wrong data */
    if (block->page == 0 || page_count == 0 ||
        (block->page + page_count) * share->block_size >
        share->state.state.data_file_length)
    {
      DBUG_PRINT("error", ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld",
                           (ulong) block->page,
                           (block->page_count & ~TAIL_BIT),
                           (uint) MY_TEST(block->page_count & TAIL_BIT),
                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
                                    share->block_size),
                           (ulong) share->state.state.data_file_length));
      DBUG_RETURN(1);
    }
    if (tail_page)
    {
      /* Tail page: remember current bitmap bits so they can be restored */
      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
                                                        block->page);
      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
                    BLOCKUSED_USE_ORG_BITMAP);
    }
    else
    {
      /* Full (blob) pages: mark them as full in the bitmap right away */
      my_bool res;
      mysql_mutex_lock(&share->bitmap.bitmap_lock);
      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
                                         block->page, page_count);
      mysql_mutex_unlock(&share->bitmap.bitmap_lock);
      if (res)
        DBUG_RETURN(1);
      block->used= BLOCKUSED_USED;
    }
  }
  /* Close the last group of sub blocks */
  start_block->sub_blocks= (uint) (block - start_block);
  DBUG_RETURN(0);
}
2430
2431
2432 /*
2433 Free regions of pages with logging
2434
2435 NOTES
2436 We are removing filler events and tail page events from
2437 row->extents to get smaller log.
2438
2439 RETURN
2440 0 ok
2441 1 error
2442 */
2443
static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
{
  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
  LSN lsn;
  size_t extents_length;
  uchar *extents= row->extents;
  DBUG_ENTER("free_full_pages");

  if (info->s->now_transactional)
  {
    /* Compact events by removing filler and tail events */
    uchar *new_block= 0;
    uchar *end, *to, *compact_extent_info;
    my_bool res;
    uint extents_count;

    if (!(compact_extent_info= my_alloca(row->extents_count *
                                         ROW_EXTENT_SIZE)))
      DBUG_RETURN(1);

    /*
      Copy only the "real" extents (full pages) to compact_extent_info;
      runs of consecutive kept extents are copied in one memcpy.
    */
    to= compact_extent_info;
    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
         extents < end ;
         extents+= ROW_EXTENT_SIZE)
    {
      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
      page_count&= ~START_EXTENT_BIT;
      if (! (page_count & TAIL_BIT) && page_count != 0)
      {
        /* Found correct extent */
        if (!new_block)
          new_block= extents; /* First extent in range */
        continue;
      }
      /* Found extent to remove, copy everything found so far */
      if (new_block)
      {
        size_t length= (size_t) (extents - new_block);
        memcpy(to, new_block, length);
        to+= length;
        new_block= 0;
      }
    }
    /* Copy any trailing run of kept extents */
    if (new_block)
    {
      size_t length= (size_t) (extents - new_block);
      memcpy(to, new_block, length);
      to+= length;
    }

    if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
    {
      /*
        No ranges. This happens in the rare case when we have allocated
        a place for a blob on a tail page but it did fit into the main page.
      */
      my_afree(compact_extent_info);
      DBUG_RETURN(0);
    }
    /* Log REDO record: number of ranges + the compacted extent list */
    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
                               info,
                               (translog_size_t) (sizeof(log_data) +
                                                  extents_length),
                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
                               log_data, NULL);
    my_afree(compact_extent_info);
    if (res)
      DBUG_RETURN(1);
  }

  /* Mark all full pages of the row as free in the bitmap */
  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
                                         row->extents_count));
}
2524
2525
2526 /*
2527 Free one page range
2528
2529 NOTES
2530 This is very similar to free_full_pages()
2531
2532 RETURN
2533 0 ok
2534 1 error
2535 */
2536
free_full_page_range(MARIA_HA * info,pgcache_page_no_t page,uint count)2537 static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
2538 uint count)
2539 {
2540 my_bool res= 0;
2541 uint delete_count;
2542 MARIA_SHARE *share= info->s;
2543 DBUG_ENTER("free_full_page_range");
2544
2545 delete_count= count;
2546 if (share->state.state.data_file_length ==
2547 (page + count) * share->block_size)
2548 {
2549 /*
2550 Don't delete last page from pagecache as this will make the file
2551 shorter than expected if the last operation extended the file
2552 */
2553 delete_count--;
2554 }
2555 if (delete_count &&
2556 pagecache_delete_pages(share->pagecache, &info->dfile,
2557 page, delete_count, PAGECACHE_LOCK_WRITE, 1))
2558 res= 1;
2559
2560 if (share->now_transactional)
2561 {
2562 LSN lsn;
2563 /** @todo unify log_data's shape with delete_head_or_tail() */
2564 uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2565 ROW_EXTENT_SIZE];
2566 LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
2567 DBUG_ASSERT(info->trn->rec_lsn);
2568 pagerange_store(log_data + FILEID_STORE_SIZE, 1);
2569 page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
2570 page);
2571 int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2572 PAGE_STORE_SIZE, count);
2573 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
2574 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2575
2576 if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
2577 info->trn, info,
2578 (translog_size_t) sizeof(log_data),
2579 TRANSLOG_INTERNAL_PARTS + 1, log_array,
2580 log_data, NULL))
2581 res= 1;
2582 }
2583 mysql_mutex_lock(&share->bitmap.bitmap_lock);
2584 if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
2585 res= 1;
2586 mysql_mutex_unlock(&share->bitmap.bitmap_lock);
2587 DBUG_RETURN(res);
2588 }
2589
2590
2591 /**
2592 @brief Write a record to a (set of) pages
2593
2594 @fn write_block_record()
2595 @param info Maria handler
2596 @param old_record Original record in case of update; NULL in case of
2597 insert
2598 @param record Record we should write
2599 @param row Statistics about record (calculated by
2600 calc_record_size())
2601 @param map_blocks On which pages the record should be stored
2602 @param row_pos Position on head page where to put head part of
2603 record
2604 @param undo_lsn <> LSN_ERROR if we are executing an UNDO
2605 @param old_record_checksum Checksum of old_record: ignored if table does
2606 not have live checksum; otherwise if
2607 old_record==NULL it must be 0.
2608
2609 @note
2610 On return all pinned pages are released.
2611
2612 [page_buff + EMPTY_SPACE_OFFSET] is set to
2613 row_pos->empty_space - head_length
2614
2615 @return Operation status
2616 @retval 0 OK
2617 @retval 1 Error
2618 */
2619
write_block_record(MARIA_HA * info,const uchar * old_record,const uchar * record,MARIA_ROW * row,MARIA_BITMAP_BLOCKS * bitmap_blocks,my_bool head_block_is_read,struct st_row_pos_info * row_pos,LSN undo_lsn,ha_checksum old_record_checksum)2620 static my_bool write_block_record(MARIA_HA *info,
2621 const uchar *old_record,
2622 const uchar *record,
2623 MARIA_ROW *row,
2624 MARIA_BITMAP_BLOCKS *bitmap_blocks,
2625 my_bool head_block_is_read,
2626 struct st_row_pos_info *row_pos,
2627 LSN undo_lsn,
2628 ha_checksum old_record_checksum)
2629 {
2630 uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
2631 uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part);
2632 uchar *field_length_data;
2633 uchar *page_buff;
2634 MARIA_BITMAP_BLOCK *block, *head_block;
2635 MARIA_SHARE *share= info->s;
2636 MARIA_COLUMNDEF *column, *end_column;
2637 MARIA_PINNED_PAGE page_link;
2638 uint block_size, flag, head_length;
2639 ulong *blob_lengths;
2640 my_bool row_extents_in_use, blob_full_pages_exists;
2641 LSN lsn;
2642 my_off_t position;
2643 uint save_my_errno;
2644 DBUG_ENTER("write_block_record");
2645
2646 head_block= bitmap_blocks->block;
2647 block_size= share->block_size;
2648
2649 page_buff= row_pos->buff;
2650 /* Position on head page where we should store the head part */
2651 data= row_pos->data;
2652 end_of_data= data + row_pos->length;
2653
2654 /* Write header */
2655 flag= info->row_flag;
2656 row_extents_in_use= 0;
2657 if (unlikely(row->total_length > row_pos->length))
2658 {
2659 /* Need extent */
2660 DBUG_ASSERT(bitmap_blocks->count > 1);
2661 if (bitmap_blocks->count <= 1)
2662 goto crashed; /* Wrong in bitmap */
2663 flag|= ROW_FLAG_EXTENTS;
2664 row_extents_in_use= 1;
2665 }
2666 /* For now we have only a minimum header */
2667 *data++= (uchar) flag;
2668 if (flag & ROW_FLAG_TRANSID)
2669 {
2670 transid_store(data, info->trn->trid);
2671 data+= TRANSID_SIZE;
2672 }
2673
2674 if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
2675 *data++= (uchar) (share->base.null_bytes -
2676 share->base.original_null_bytes);
2677 if (row_extents_in_use)
2678 {
2679 /* Store first extent in header */
2680 store_key_length_inc(data, bitmap_blocks->count - 1);
2681 row_extents_first_part= data;
2682 data+= ROW_EXTENT_SIZE;
2683 }
2684 if (share->base.max_field_lengths)
2685 store_key_length_inc(data, row->field_lengths_length);
2686 if (share->calc_checksum)
2687 {
2688 *(data++)= (uchar) (row->checksum); /* store least significant byte */
2689 DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
2690 }
2691 memcpy(data, record, share->base.null_bytes);
2692 data+= share->base.null_bytes;
2693 memcpy(data, row->empty_bits, share->base.pack_bytes);
2694 data+= share->base.pack_bytes;
2695
2696 DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2697 (uint) (data - row_pos->data) == row->min_length);
2698
2699 /*
2700 Allocate a buffer of rest of data (except blobs)
2701
2702 To avoid double copying of data, we copy as many columns that fits into
2703 the page. The rest goes into info->packed_row.
2704
2705 Using an extra buffer, instead of doing continuous writes to different
2706 pages, uses less code and we don't need to have to do a complex call
2707 for every data segment we want to store.
2708 */
2709 if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
2710 row->head_length))
2711 DBUG_RETURN(1);
2712
2713 tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */
2714 tmp_data= data;
2715
2716 if (row_extents_in_use)
2717 {
2718 uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
2719 if (!tmp_data_used && tmp_data + copy_length > end_of_data)
2720 {
2721 tmp_data_used= tmp_data;
2722 tmp_data= info->rec_buff;
2723 }
2724 row_extents_second_part= tmp_data;
2725 /*
2726 We will copy the extents here when we have figured out the tail
2727 positions.
2728 */
2729 tmp_data+= copy_length;
2730 }
2731
2732 /* Copy fields that has fixed lengths (primary key etc) */
2733 for (column= share->columndef,
2734 end_column= column + share->base.fixed_not_null_fields;
2735 column < end_column; column++)
2736 {
2737 if (!tmp_data_used && tmp_data + column->length > end_of_data)
2738 {
2739 tmp_data_used= tmp_data;
2740 tmp_data= info->rec_buff;
2741 }
2742 memcpy(tmp_data, record + column->offset, column->length);
2743 tmp_data+= column->length;
2744 }
2745
2746 /* Copy length of data for variable length fields */
2747 if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
2748 {
2749 tmp_data_used= tmp_data;
2750 tmp_data= info->rec_buff;
2751 }
2752 field_length_data= row->field_lengths;
2753 memcpy(tmp_data, field_length_data, row->field_lengths_length);
2754 tmp_data+= row->field_lengths_length;
2755
2756 DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2757 (uint) (tmp_data - row_pos->data) == row->min_length +
2758 share->base.fixed_not_null_fields_length +
2759 row->field_lengths_length);
2760
2761 /* Copy variable length fields and fields with null/zero */
2762 for (end_column= share->columndef + share->base.fields - share->base.blobs;
2763 column < end_column ;
2764 column++)
2765 {
2766 const uchar *field_pos;
2767 ulong length;
2768 if ((record[column->null_pos] & column->null_bit) ||
2769 (row->empty_bits[column->empty_pos] & column->empty_bit))
2770 continue;
2771
2772 field_pos= record + column->offset;
2773 switch (column->type) {
2774 case FIELD_NORMAL: /* Fixed length field */
2775 case FIELD_SKIP_PRESPACE:
2776 case FIELD_SKIP_ZERO: /* Fixed length field */
2777 length= column->length;
2778 break;
2779 case FIELD_SKIP_ENDSPACE: /* CHAR */
2780 /* Char that is space filled */
2781 if (column->length <= 255)
2782 length= (uint) (uchar) *field_length_data++;
2783 else
2784 {
2785 length= uint2korr(field_length_data);
2786 field_length_data+= 2;
2787 }
2788 break;
2789 case FIELD_VARCHAR:
2790 if (column->length <= 256)
2791 {
2792 length= (uint) (uchar) *field_length_data++;
2793 field_pos++; /* Skip length uchar */
2794 }
2795 else
2796 {
2797 length= uint2korr(field_length_data);
2798 field_length_data+= 2;
2799 field_pos+= 2;
2800 }
2801 DBUG_ASSERT(length <= column->length);
2802 break;
2803 default: /* Wrong data */
2804 DBUG_ASSERT(!maria_assert_if_crashed_table);
2805 length=0;
2806 break;
2807 }
2808 if (!tmp_data_used && tmp_data + length > end_of_data)
2809 {
2810 /* Data didn't fit in page; Change to use tmp buffer */
2811 tmp_data_used= tmp_data;
2812 tmp_data= info->rec_buff;
2813 }
2814 memcpy((char*) tmp_data, field_pos, length);
2815 tmp_data+= length;
2816 }
2817
2818 block= head_block + head_block->sub_blocks; /* Point to first blob data */
2819
2820 end_column= column + share->base.blobs;
2821 blob_lengths= row->blob_lengths;
2822 if (!tmp_data_used)
2823 {
2824 /* Still room on page; Copy as many blobs we can into this page */
2825 data= tmp_data;
2826 for (; column < end_column &&
2827 *blob_lengths <= (ulong)(end_of_data - data);
2828 column++, blob_lengths++)
2829 {
2830 uchar *tmp_pos;
2831 uint length;
2832 if (!*blob_lengths) /* Null or "" */
2833 continue;
2834 length= column->length - portable_sizeof_char_ptr;
2835 memcpy(&tmp_pos, record + column->offset + length, sizeof(char*));
2836 memcpy(data, tmp_pos, *blob_lengths);
2837 data+= *blob_lengths;
2838 /*
2839 The following is not true when we want to insert data into original
2840 place. In this case we don't have any extra blocks allocated
2841 */
2842 if (likely(undo_lsn == LSN_ERROR))
2843 {
2844 /* Skip over tail page that was prepared for storing blob */
2845 block++;
2846 bitmap_blocks->tail_page_skipped= 1;
2847 }
2848 }
2849 if (head_block->sub_blocks > 1)
2850 {
2851 /* We have allocated pages that where not used */
2852 bitmap_blocks->page_skipped= 1;
2853 }
2854 }
2855 else
2856 data= tmp_data_used; /* Get last used on page */
2857
2858 /* Update page directory */
2859 head_length= (uint) (data - row_pos->data);
2860 DBUG_PRINT("info", ("Used head length on page: %u header_length: %u",
2861 head_length,
2862 (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
2863 if (head_length < share->base.min_block_length)
2864 {
2865 /* Extend row to be of size min_block_length */
2866 uint diff_length= share->base.min_block_length - head_length;
2867 bzero(data, diff_length);
2868 data+= diff_length;
2869 head_length= share->base.min_block_length;
2870 }
2871 DBUG_ASSERT(data <= end_of_data);
2872 /*
2873 If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
2874 written exactly head_length bytes (same as original record).
2875 */
2876 DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
2877 int2store(row_pos->dir + 2, head_length);
2878 /* update empty space at start of block */
2879 row_pos->empty_space-= head_length;
2880 int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
2881 /* Mark in bitmaps how the current page was actually used */
2882 head_block->empty_space= row_pos->empty_space;
2883 if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
2884 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
2885 head_block->empty_space= 0; /* Page is full */
2886 head_block->used|= BLOCKUSED_USED;
2887
2888 check_directory(share,
2889 page_buff, share->block_size, share->base.min_block_length,
2890 (uint) -1);
2891
2892 /*
2893 Now we have to write tail pages, as we need to store the position
2894 to them in the row extent header.
2895
2896 We first write out all blob tails, to be able to store them in
2897 the current page or 'tmp_data'.
2898
2899 Then we write the tail of the non-blob fields (The position to the
2900 tail page is stored either in row header, the extents in the head
2901 page or in the first full page of the non-blob data. It's never in
2902 the tail page of the non-blob data)
2903 */
2904
2905 blob_full_pages_exists= 0;
2906 if (row_extents_in_use)
2907 {
2908 if (column != end_column) /* If blob fields */
2909 {
2910 MARIA_COLUMNDEF *save_column= column;
2911 MARIA_BITMAP_BLOCK *save_block= block;
2912 MARIA_BITMAP_BLOCK *end_block;
2913 ulong *save_blob_lengths= blob_lengths;
2914
2915 for (; column < end_column; column++, blob_lengths++)
2916 {
2917 uchar *blob_pos;
2918 if (!*blob_lengths) /* Null or "" */
2919 continue;
2920 if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
2921 {
2922 uint length;
2923 length= column->length - portable_sizeof_char_ptr;
2924 memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
2925 length= *blob_lengths % FULL_PAGE_SIZE(share); /* tail size */
2926 if (length != *blob_lengths)
2927 blob_full_pages_exists= 1;
2928 if (write_tail(info, block + block->sub_blocks-1,
2929 blob_pos + *blob_lengths - length,
2930 length))
2931 goto disk_err;
2932 }
2933 else
2934 blob_full_pages_exists= 1;
2935
2936 for (end_block= block + block->sub_blocks; block < end_block; block++)
2937 {
2938 /*
2939 Set only a bit, to not cause bitmap code to believe a block is full
2940 when there is still a lot of entries in it.
2941 */
2942 block->used|= BLOCKUSED_USED;
2943 }
2944 }
2945 DBUG_ASSERT((undo_lsn == LSN_ERROR ||
2946 block == bitmap_blocks->block + bitmap_blocks->count));
2947 column= save_column;
2948 block= save_block;
2949 blob_lengths= save_blob_lengths;
2950 }
2951
2952 if (tmp_data_used) /* non blob data overflows */
2953 {
2954 MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
2955 MARIA_BITMAP_BLOCK *head_tail_block= 0;
2956 ulong length;
2957 ulong data_length= (ulong) (tmp_data - info->rec_buff);
2958
2959 #ifdef SANITY_CHECKS
2960 DBUG_ASSERT(head_block->sub_blocks != 1);
2961 if (head_block->sub_blocks == 1)
2962 goto crashed; /* no reserved full or tails */
2963 #endif
2964 /*
2965 Find out where to write tail for non-blob fields.
2966
2967 Problem here is that the bitmap code may have allocated more
2968 space than we need. We have to handle the following cases:
2969
2970 - Bitmap code allocated a tail page we don't need.
2971 - The last full page allocated needs to be changed to a tail page
2972 (Because we where able to put more data on the head page than
2973 the bitmap allocation assumed)
2974
2975 The reserved pages in bitmap_blocks for the main page has one of
2976 the following allocations:
2977 - Full pages, with following blocks:
2978 # * full pages
2979 empty page ; To be used if we change last full to tail page. This
2980 has 'count' = 0.
2981 tail page (optional, if last full page was part full)
2982 - One tail page
2983 */
2984
2985 cur_block= head_block + 1;
2986 end_block= head_block + head_block->sub_blocks;
2987 /*
2988 Loop until we have find a block bigger than we need or
2989 we find the empty page block.
2990 */
2991 while (data_length >= (length= (cur_block->page_count *
2992 FULL_PAGE_SIZE(share))) &&
2993 cur_block->page_count)
2994 {
2995 #ifdef SANITY_CHECKS
2996 DBUG_ASSERT(!((cur_block == end_block) ||
2997 (cur_block->used & BLOCKUSED_USED)));
2998 if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
2999 goto crashed;
3000 #endif
3001 data_length-= length;
3002 (cur_block++)->used|= BLOCKUSED_USED;
3003 }
3004 last_head_block= cur_block;
3005 if (data_length)
3006 {
3007 if (cur_block->page_count == 0)
3008 {
3009 /* Skip empty filler block */
3010 cur_block++;
3011 }
3012 #ifdef SANITY_CHECKS
3013 DBUG_ASSERT(!(cur_block >= end_block));
3014 if ((cur_block >= end_block))
3015 goto crashed;
3016 #endif
3017 if (cur_block->used & BLOCKUSED_TAIL)
3018 {
3019 DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
3020 /* tail written to tail page */
3021 cur_block->used|= BLOCKUSED_USED;
3022 head_tail_block= cur_block;
3023 }
3024 else if (data_length > length - MAX_TAIL_SIZE(block_size))
3025 {
3026 /* tail written to full page */
3027 cur_block->used|= BLOCKUSED_USED;
3028 if ((cur_block != end_block - 1) &&
3029 (end_block[-1].used & BLOCKUSED_TAIL))
3030 bitmap_blocks->tail_page_skipped= 1;
3031 }
3032 else
3033 {
3034 /*
3035 cur_block is a full block, followed by an empty and optional
3036 tail block. Change cur_block to a tail block or split it
3037 into full blocks and tail blocks.
3038
3039 TODO:
3040 If there is enough space on the following tail block, use
3041 this instead of creating a new tail block.
3042 */
3043 DBUG_ASSERT(cur_block[1].page_count == 0);
3044 if (cur_block->page_count == 1)
3045 {
3046 /* convert full block to tail block */
3047 cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
3048 head_tail_block= cur_block;
3049 }
3050 else
3051 {
3052 DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share));
3053 DBUG_PRINT("info", ("Splitting blocks into full and tail"));
3054 cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
3055 cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */
3056 cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
3057 cur_block->page_count--;
3058 cur_block->used|= BLOCKUSED_USED;
3059 last_head_block= head_tail_block= cur_block+1;
3060 }
3061 if (end_block[-1].used & BLOCKUSED_TAIL)
3062 bitmap_blocks->tail_page_skipped= 1;
3063 }
3064 }
3065 else
3066 {
3067 /* Must be an empty or tail page */
3068 DBUG_ASSERT(cur_block->page_count == 0 ||
3069 cur_block->used & BLOCKUSED_TAIL);
3070 if (end_block[-1].used & BLOCKUSED_TAIL)
3071 bitmap_blocks->tail_page_skipped= 1;
3072 }
3073
3074 /*
3075 Write all extents into page or tmp_data
3076
3077 Note that we still don't have a correct position for the tail
3078 of the non-blob fields.
3079 */
3080 store_extent_info(row_extents_first_part,
3081 row_extents_second_part,
3082 head_block+1, bitmap_blocks->count - 1);
3083 if (head_tail_block)
3084 {
3085 ulong block_length= (ulong) (tmp_data - info->rec_buff);
3086 uchar *extent_data;
3087
3088 length= (uint) (block_length % FULL_PAGE_SIZE(share));
3089 if (write_tail(info, head_tail_block,
3090 info->rec_buff + block_length - length,
3091 length))
3092 goto disk_err;
3093 tmp_data-= length; /* Remove the tail */
3094 if (tmp_data == info->rec_buff)
3095 {
3096 /* We have no full blocks to write for the head part */
3097 tmp_data_used= 0;
3098 }
3099
3100 /* Store the tail position for the non-blob fields */
3101 if (head_tail_block == head_block + 1)
3102 {
3103 /*
3104 We had a head block + tail block, which means that the
3105 tail block is the first extent
3106 */
3107 extent_data= row_extents_first_part;
3108 }
3109 else
3110 {
3111 /*
3112 We have a head block + some full blocks + tail block
3113 last_head_block is pointing after the last used extent
3114 for the head block.
3115 */
3116 extent_data= row_extents_second_part +
3117 ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
3118 }
3119 /* Write information for tail block in the reserved space */
3120 page_store(extent_data, head_tail_block->page);
3121 pagerange_store(extent_data + PAGE_STORE_SIZE,
3122 head_tail_block->page_count);
3123 }
3124 }
3125 else
3126 store_extent_info(row_extents_first_part,
3127 row_extents_second_part,
3128 head_block+1, bitmap_blocks->count - 1);
3129 }
3130
3131 if (share->now_transactional)
3132 {
3133 uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
3134 LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
3135
3136 /* Log REDO changes of head page */
3137 page_store(log_data + FILEID_STORE_SIZE, head_block->page);
3138 dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
3139 row_pos->rownr);
3140 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3141 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
3142 log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data;
3143 log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
3144 if (translog_write_record(&lsn,
3145 head_block_is_read ?
3146 LOGREC_REDO_INSERT_ROW_HEAD :
3147 LOGREC_REDO_NEW_ROW_HEAD,
3148 info->trn,
3149 info,
3150 (translog_size_t) (sizeof(log_data) +
3151 head_length),
3152 TRANSLOG_INTERNAL_PARTS + 2, log_array,
3153 log_data, NULL))
3154 goto disk_err;
3155 }
3156
3157 #ifdef RECOVERY_EXTRA_DEBUG
3158 if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
3159 {
3160 /* Stop right after the REDO; testing incomplete log record groups */
3161 DBUG_EXECUTE_IF("maria_flush_whole_log",
3162 {
3163 DBUG_PRINT("maria_flush_whole_log", ("now"));
3164 translog_flush(translog_get_horizon());
3165 });
3166 DBUG_EXECUTE_IF("maria_crash",
3167 { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
3168 }
3169 #endif
3170
3171 if (head_block_is_read)
3172 {
3173 MARIA_PINNED_PAGE *page_link;
3174 /* Head page is always the first pinned page */
3175 page_link= dynamic_element(&info->pinned_pages, 0,
3176 MARIA_PINNED_PAGE*);
3177 pagecache_unlock_by_link(share->pagecache, page_link->link,
3178 PAGECACHE_LOCK_WRITE_TO_READ,
3179 PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
3180 LSN_IMPOSSIBLE, 1, FALSE);
3181 page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
3182 page_link->changed= 1;
3183 }
3184 else
3185 {
3186 if (pagecache_write(share->pagecache,
3187 &info->dfile, head_block->page, 0,
3188 page_buff, share->page_type,
3189 head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
3190 PAGECACHE_LOCK_READ,
3191 head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
3192 PAGECACHE_PIN,
3193 PAGECACHE_WRITE_DELAY, &page_link.link,
3194 LSN_IMPOSSIBLE))
3195 goto disk_err;
3196 DBUG_ASSERT(page_link.link);
3197 page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
3198 page_link.changed= 1;
3199 push_dynamic(&info->pinned_pages, (void*) &page_link);
3200
3201 /* Increase data file size, if extended */
3202 position= (my_off_t) head_block->page * block_size;
3203 if (share->state.state.data_file_length <= position)
3204 _ma_set_share_data_file_length(share, position + block_size);
3205 }
3206
3207 if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
3208 {
3209 /*
3210 Log REDO writes for all full pages (head part and all blobs)
3211 We write all here to be able to generate the UNDO record early
3212 so that we can write the LSN for the UNDO record to all full pages.
3213 */
3214 uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3215 (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
3216 ROW_EXTENTS_ON_STACK];
3217 uchar *log_data, *log_pos;
3218 LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
3219 ROW_EXTENTS_ON_STACK];
3220 LEX_CUSTRING *log_array_pos, *log_array;
3221 int error;
3222 translog_size_t log_entry_length= 0;
3223 uint ext_length, extents= 0, sub_extents= 0;
3224
3225 /* If few extents, then allocate things on stack to avoid a malloc call */
3226 if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
3227 {
3228 log_array= tmp_log_array;
3229 log_data= tmp_log_data;
3230 }
3231 else
3232 {
3233 if (!my_multi_malloc(MY_WME, &log_array,
3234 (uint) ((bitmap_blocks->count +
3235 TRANSLOG_INTERNAL_PARTS + 2) *
3236 sizeof(*log_array)),
3237 &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3238 bitmap_blocks->count * (ROW_EXTENT_SIZE +
3239 BLOCK_FILLER_SIZE +
3240 SUB_RANGE_SIZE),
3241 NullS))
3242 goto disk_err;
3243 }
3244 log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
3245 log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
3246
3247 if (tmp_data_used)
3248 {
3249 /* Full head page */
3250 translog_size_t block_length= (translog_size_t) (tmp_data -
3251 info->rec_buff);
3252 log_pos= store_page_range(share,
3253 log_pos, head_block+1,
3254 (ulong) block_length, &extents);
3255 log_array_pos->str= info->rec_buff;
3256 log_array_pos->length= block_length;
3257 log_entry_length+= block_length;
3258 log_array_pos++;
3259 sub_extents++;
3260 }
3261 if (blob_full_pages_exists)
3262 {
3263 MARIA_COLUMNDEF *tmp_column= column;
3264 ulong *tmp_blob_lengths= blob_lengths;
3265 MARIA_BITMAP_BLOCK *tmp_block= block;
3266
3267 /* Full blob pages */
3268 for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
3269 {
3270 ulong blob_length;
3271 uint length;
3272
3273 if (!*tmp_blob_lengths) /* Null or "" */
3274 continue;
3275 blob_length= *tmp_blob_lengths;
3276 length= tmp_column->length - portable_sizeof_char_ptr;
3277 /*
3278 If last part of blog was on tail page, change blob_length to
3279 reflect this
3280 */
3281 if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3282 blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3283 if (blob_length)
3284 {
3285 memcpy((void*) &log_array_pos->str,
3286 record + tmp_column->offset + length,
3287 sizeof(uchar*));
3288 log_array_pos->length= blob_length;
3289 log_entry_length+= blob_length;
3290 log_array_pos++;
3291 sub_extents++;
3292
3293 log_pos= store_page_range(share,
3294 log_pos, tmp_block,
3295 blob_length, &extents);
3296 }
3297 tmp_block+= tmp_block->sub_blocks;
3298 }
3299 }
3300
3301 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3302 ext_length= (uint) (log_pos - log_data);
3303 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
3304 pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
3305 pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
3306 sub_extents);
3307
3308 log_entry_length+= ext_length;
3309 /* trn->rec_lsn is already set earlier in this function */
3310 error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
3311 info->trn, info, log_entry_length,
3312 (uint) (log_array_pos - log_array),
3313 log_array, log_data, NULL);
3314 if (log_array != tmp_log_array)
3315 my_free(log_array);
3316 if (error)
3317 goto disk_err;
3318 }
3319
3320 /* Write UNDO or CLR record */
3321 lsn= LSN_IMPOSSIBLE;
3322 if (share->now_transactional)
3323 {
3324 LEX_CUSTRING *log_array= info->log_row_parts;
3325
3326 if (undo_lsn != LSN_ERROR)
3327 {
3328 /*
3329 Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
3330 in the first case, Recovery, when it sees the CLR_END in the
3331 REDO phase, may decrement the records' count.
3332 */
3333 if (_ma_write_clr(info, undo_lsn,
3334 old_record ? LOGREC_UNDO_ROW_UPDATE :
3335 LOGREC_UNDO_ROW_DELETE,
3336 share->calc_checksum != 0,
3337 row->checksum - old_record_checksum,
3338 &lsn, (void*) 0))
3339 goto disk_err;
3340 }
3341 else
3342 {
3343 uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
3344 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
3345 HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
3346 ROW_EXTENT_SIZE];
3347 uchar *log_pos;
3348 ha_checksum checksum_delta;
3349
3350 /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
3351 lsn_store(log_data, info->trn->undo_lsn);
3352 page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
3353 head_block->page);
3354 dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3355 PAGE_STORE_SIZE,
3356 row_pos->rownr);
3357 log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3358 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
3359 store_checksum_in_rec(share, checksum_delta,
3360 row->checksum - old_record_checksum,
3361 log_pos, log_pos);
3362 compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
3363
3364 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3365 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
3366 log_data);
3367
3368 if (!old_record)
3369 {
3370 /* Store undo_lsn in case we are aborting the insert */
3371 row->orig_undo_lsn= info->trn->undo_lsn;
3372 /* Write UNDO log record for the INSERT */
3373 if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
3374 info->trn, info,
3375 (translog_size_t)
3376 log_array[TRANSLOG_INTERNAL_PARTS +
3377 0].length,
3378 TRANSLOG_INTERNAL_PARTS + 1,
3379 log_array,
3380 log_data + LSN_STORE_SIZE, &checksum_delta))
3381 goto disk_err;
3382 }
3383 else
3384 {
3385 /* Write UNDO log record for the UPDATE */
3386 size_t row_length, extents_length;
3387 uint row_parts_count, cur_head_length;
3388
3389 /*
3390 Write head length and extents of the original row so that we
3391 during UNDO can put it back in the original position.
3392 We don't store size for TRANSID, as we don't write this during
3393 UNDO.
3394 */
3395 cur_head_length= (info->cur_row.head_length -
3396 info->cur_row.header_length);
3397 int2store(log_pos, cur_head_length);
3398 pagerange_store(log_pos + 2, info->cur_row.extents_count);
3399 log_pos+= 2 + PAGERANGE_STORE_SIZE;
3400 log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
3401 PAGERANGE_STORE_SIZE);
3402 info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
3403 info->cur_row.extents;
3404 info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
3405 extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
3406
3407 row_length= fill_update_undo_parts(info, old_record, record,
3408 log_array +
3409 TRANSLOG_INTERNAL_PARTS + 2,
3410 &row_parts_count);
3411 if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
3412 info,
3413 (translog_size_t)
3414 (log_array[TRANSLOG_INTERNAL_PARTS +
3415 0].length + extents_length +
3416 row_length),
3417 TRANSLOG_INTERNAL_PARTS + 2 +
3418 row_parts_count,
3419 log_array,
3420 log_data + LSN_STORE_SIZE,
3421 &checksum_delta))
3422 goto disk_err;
3423 }
3424 }
3425 }
3426 /* Release not used space in used pages */
3427 if (_ma_bitmap_release_unused(info, bitmap_blocks))
3428 goto disk_err;
3429 _ma_unpin_all_pages(info, lsn);
3430
3431 if (tmp_data_used)
3432 {
3433 /*
3434 Write data stored in info->rec_buff to pages
3435 This is the char/varchar data that didn't fit into the head page.
3436 */
3437 DBUG_ASSERT(bitmap_blocks->count != 0);
3438 if (write_full_pages(info, lsn, head_block + 1,
3439 info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
3440 goto disk_err;
3441 }
3442
3443 /* Write rest of blobs (data, but no tails as they are already written) */
3444 for (; column < end_column; column++, blob_lengths++)
3445 {
3446 uchar *blob_pos;
3447 uint length;
3448 ulong blob_length;
3449 if (!*blob_lengths) /* Null or "" */
3450 continue;
3451 length= column->length - portable_sizeof_char_ptr;
3452 memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
3453 /* remove tail part */
3454 blob_length= *blob_lengths;
3455 if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3456 blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3457
3458 if (blob_length && write_full_pages(info, lsn, block,
3459 blob_pos, blob_length))
3460 goto disk_err;
3461 block+= block->sub_blocks;
3462 }
3463
3464 _ma_finalize_row(info);
3465 DBUG_RETURN(0);
3466
3467 crashed:
3468 DBUG_ASSERT(!maria_assert_if_crashed_table);
3469 /* Something was wrong with data on page */
3470 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3471
3472 disk_err:
3473 /**
3474 @todo RECOVERY we are going to let dirty pages go to disk while we have
3475 logged UNDO, this violates WAL. We must mark the table corrupted!
3476
3477 @todo RECOVERY we have written some REDOs without a closing UNDO,
3478 it's possible that a next operation by this transaction succeeds and then
3479 Recovery would glue the "orphan REDOs" to the succeeded operation and
3480 execute the failed REDOs. We need some mark "abort this group" in the
3481 log, or mark the table corrupted (then user will repair it and thus REDOs
3482 will be skipped).
3483
3484 @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
3485 should take a MARIA_HA* in argument, and it it
3486 fails when flushing a page to disk it should call
3487 (*the_maria_ha->write_error_func)(the_maria_ha)
3488 and this hook will mark the table corrupted.
3489 Maybe hook should be stored in the pagecache's block structure, or in a
3490 hash "file->maria_ha*".
3491
3492 @todo RECOVERY we should distinguish below between log write error and
3493 table write error. The former should stop Maria immediately, the latter
3494 should mark the table corrupted.
3495 */
3496 /*
3497 Unpin all pinned pages to not cause problems for disk cache. This is
3498 safe to call even if we already called _ma_unpin_all_pages() above.
3499 */
3500 save_my_errno= my_errno;
3501 _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3502 my_errno= save_my_errno;
3503 DBUG_RETURN(1);
3504 }
3505
3506
3507 /*
3508 @brief Write a record
3509
3510 @fn allocate_and_write_block_record()
3511 @param info Maria handler
3512 @param record Record to write
3513 @param row Information about fields in 'record'
3514 @param undo_lsn <> LSN_ERROR if we are executing an UNDO
3515
3516 @return
3517 @retval 0 ok
3518 @retval 1 Error
3519 */
3520
allocate_and_write_block_record(MARIA_HA * info,const uchar * record,MARIA_ROW * row,LSN undo_lsn)3521 static my_bool allocate_and_write_block_record(MARIA_HA *info,
3522 const uchar *record,
3523 MARIA_ROW *row,
3524 LSN undo_lsn)
3525 {
3526 struct st_row_pos_info row_pos;
3527 MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
3528 int save_my_errno;
3529 DBUG_ENTER("allocate_and_write_block_record");
3530
3531 _ma_bitmap_flushable(info, 1);
3532 if (_ma_bitmap_find_place(info, row, blocks))
3533 goto err; /* Error reading bitmap */
3534
3535 /*
3536 Sleep; a checkpoint will happen and should not send this over-allocated
3537 bitmap to disk but rather wait.
3538 */
3539 DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););
3540
3541 /* page will be pinned & locked by get_head_or_tail_page */
3542 if (get_head_or_tail_page(info, blocks->block, info->buff,
3543 MY_MAX(row->space_on_head_page,
3544 info->s->base.min_block_length),
3545 HEAD_PAGE,
3546 PAGECACHE_LOCK_WRITE, &row_pos))
3547 goto err;
3548 row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
3549 if (info->s->calc_checksum)
3550 {
3551 if (undo_lsn == LSN_ERROR)
3552 row->checksum= (info->s->calc_checksum)(info, record);
3553 else
3554 {
3555 /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
3556 DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
3557 }
3558 }
3559 DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
3560 (ulong) ma_recordpos_to_page(row->lastpos),
3561 ma_recordpos_to_dir_entry(row->lastpos),
3562 row_pos.length));
3563 if (write_block_record(info, (uchar*) 0, record, row,
3564 blocks, blocks->block->org_bitmap_value != 0,
3565 &row_pos, undo_lsn, 0))
3566 goto err;
3567 /* Now let checkpoint happen but don't commit */
3568 DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
3569 DBUG_RETURN(0);
3570
3571 err:
3572 save_my_errno= my_errno;
3573 if (info->non_flushable_state)
3574 _ma_bitmap_flushable(info, -1);
3575 _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3576 my_errno= save_my_errno;
3577 DBUG_RETURN(1);
3578 }
3579
3580
3581 /*
3582 Write a record and return rowid for it
3583
3584 SYNOPSIS
3585 _ma_write_init_block_record()
3586 info Maria handler
3587 record Record to write
3588
3589 NOTES
3590 This is done BEFORE we write the keys to the row!
3591
3592 RETURN
3593 HA_OFFSET_ERROR Something went wrong
3594 # Rowid for row
3595 */
3596
_ma_write_init_block_record(MARIA_HA * info,const uchar * record)3597 MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
3598 const uchar *record)
3599 {
3600 DBUG_ENTER("_ma_write_init_block_record");
3601
3602 calc_record_size(info, record, &info->cur_row);
3603 if (allocate_and_write_block_record(info, record,
3604 &info->cur_row, LSN_ERROR))
3605 DBUG_RETURN(HA_OFFSET_ERROR);
3606 DBUG_RETURN(info->cur_row.lastpos);
3607 }
3608
3609
3610 /*
3611 Dummy function for (*info->s->write_record)()
3612
3613 Nothing to do here, as we already wrote the record in
3614 _ma_write_init_block_record()
3615 */
3616
my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
                               const uchar *record __attribute__ ((unused)))
{
  /* No-op: the row was already written by _ma_write_init_block_record() */
  return 0;					/* Row already written */
}
3622
3623
3624 /**
3625 @brief Remove row written by _ma_write_block_record() and log undo
3626
3627 @param info Maria handler
3628
3629 @note
3630 This is called in case we got a duplicate unique key while
3631 writing keys.
3632
3633 @return Operation status
3634 @retval 0 OK
3635 @retval 1 Error
3636 */
3637
_ma_write_abort_block_record(MARIA_HA * info)3638 my_bool _ma_write_abort_block_record(MARIA_HA *info)
3639 {
3640 my_bool res= 0;
3641 MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3642 MARIA_BITMAP_BLOCK *block, *end;
3643 LSN lsn= LSN_IMPOSSIBLE;
3644 MARIA_SHARE *share= info->s;
3645 DBUG_ENTER("_ma_write_abort_block_record");
3646
3647 _ma_bitmap_lock(share); /* Lock bitmap from other insert threads */
3648 if (delete_head_or_tail(info,
3649 ma_recordpos_to_page(info->cur_row.lastpos),
3650 ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
3651 0))
3652 res= 1;
3653 for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
3654 block++)
3655 {
3656 if (block->used & BLOCKUSED_USED)
3657 {
3658 if (block->used & BLOCKUSED_TAIL)
3659 {
3660 /*
3661 block->page_count is set to the tail directory entry number in
3662 write_block_record()
3663 */
3664 if (delete_head_or_tail(info, block->page,
3665 block->page_count & ~TAIL_BIT,
3666 0, 0))
3667 res= 1;
3668 }
3669 else
3670 {
3671 if (free_full_page_range(info, block->page, block->page_count))
3672 res= 1;
3673 }
3674 }
3675 }
3676 _ma_bitmap_unlock(share);
3677 if (share->now_transactional)
3678 {
3679 if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
3680 LOGREC_UNDO_ROW_INSERT,
3681 share->calc_checksum != 0,
3682 (ha_checksum) 0 - info->cur_row.checksum,
3683 &lsn, (void*) 0))
3684 res= 1;
3685 }
3686 _ma_unpin_all_pages_and_finalize_row(info, lsn);
3687 DBUG_RETURN(res);
3688 }
3689
3690
3691 /*
3692 Update a record
3693
3694 NOTES
    For the moment, we assume that info->cur_row.extents is always updated
3696 when a row is read. In the future we may decide to read this on demand
3697 for rows split into many extents.
3698 */
3699
static my_bool _ma_update_block_record2(MARIA_HA *info,
                                        MARIA_RECORD_POS record_pos,
                                        const uchar *oldrec,
                                        const uchar *record,
                                        LSN undo_lsn)
{
  /*
    Update the row at 'record_pos' with the content of 'record'.
    If the new row fits into the original head page it is rewritten in
    place; otherwise the old row is deleted and a new place is found via
    the bitmap.  On error all pinned pages are released and 1 is returned.
  */
  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
  uchar *buff;
  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
  MARIA_PINNED_PAGE page_link;
  uint rownr, org_empty_size, head_length;
  uint block_size= info->s->block_size;
  uint errpos __attribute__((unused)) = 0;   /* For DBUG error tracing only */
  uchar *dir;
  pgcache_page_no_t page;
  struct st_row_pos_info row_pos;
  my_bool res;
  ha_checksum old_checksum;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_update_block_record2");
  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));

#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
  DBUG_DUMP("newrec", record, share->base.reclength);
#endif

  /*
    Checksums of new and old rows were computed by callers already; new
    row's was put into cur_row, old row's was put into new_row.
  */
  old_checksum= new_row->checksum;
  new_row->checksum= cur_row->checksum;
  calc_record_size(info, record, new_row);
  page= ma_recordpos_to_page(record_pos);

  /* Read and pin the head page of the row with a write lock */
  _ma_bitmap_flushable(info, 1);
  buff= pagecache_read(share->pagecache,
                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
  rownr= ma_recordpos_to_dir_entry(record_pos);
  dir= dir_entry_pos(buff, block_size, rownr);

  /*
    We can't use cur_row->head_length as the block may have been compacted
    since we read it.
  */
  head_length= uint2korr(dir + 2);

  if ((org_empty_size + head_length) >= new_row->total_length)
  {
    uint rec_offset, length;
    MARIA_BITMAP_BLOCK block;

    DBUG_PRINT("info", ("org_empty_size: %u  org_length: %u  new_length: %lu",
                        org_empty_size, head_length,
                        new_row->total_length));

    /*
      We can fit the new row in the same page as the original head part
      of the row
    */
    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
                                                          org_empty_size);
    if (extend_area_on_page(info, buff, dir, rownr,
                            new_row->total_length, &org_empty_size,
                            &rec_offset, &length, 1))
    {
      errpos= 1;
      goto err;
    }

    /* Describe the reused head page as a single-block allocation */
    row_pos.buff= buff;
    row_pos.rownr= rownr;
    row_pos.empty_space= org_empty_size;
    row_pos.dir= dir;
    row_pos.data= buff + rec_offset;
    row_pos.length= length;
    blocks->block= &block;
    blocks->count= 1;
    block.page= page;
    block.sub_blocks= 1;
    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
    block.empty_space= row_pos.empty_space;

    /* Free the old row's tails and full-page extents before rewriting */
    if (*cur_row->tail_positions &&
        delete_tails(info, cur_row->tail_positions))
    {
      errpos= 2;
      goto err;
    }
    if (cur_row->extents_count && free_full_pages(info, cur_row))
    {
      errpos= 3;
      goto err;
    }
    res= write_block_record(info, oldrec, record, new_row, blocks,
                            1, &row_pos, undo_lsn, old_checksum);
    /* We can't update or delete this without re-reading it again */
    info->update&= ~HA_STATE_AKTIV;
    DBUG_RETURN(res);
  }
  /* Delete old row */
  if (*cur_row->tail_positions &&
      delete_tails(info, cur_row->tail_positions))
  {
    errpos= 4;
    goto err;
  }
  if (cur_row->extents_count && free_full_pages(info, cur_row))
  {
    errpos= 5;
    goto err;
  }

  /* Re-read head length; extend_area_on_page was not called on this path */
  head_length= uint2korr(dir + 2);
  if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
                                org_empty_size, blocks))
  {
    errpos= 6;
    goto err;
  }

  /*
    Allocate all size in block for record
    TODO:
    Need to improve this to do compact if we can fit one more blob into
    the head page
  */
  if ((head_length < new_row->space_on_head_page ||
       (new_row->total_length <= head_length &&
        org_empty_size + head_length >= new_row->total_length)))
  {
    /* Compact the page so the head part can use the whole free area */
    _ma_compact_block_page(share,
                           buff, rownr, 1,
                           info->trn->min_read_from,
                           share->base.min_block_length);
    org_empty_size= 0;
    head_length= uint2korr(dir + 2);
  }

  row_pos.buff= buff;
  row_pos.rownr= rownr;
  row_pos.empty_space= org_empty_size + head_length;
  row_pos.dir= dir;
  row_pos.data= buff + uint2korr(dir);
  row_pos.length= head_length;
  if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
                               &row_pos, undo_lsn, old_checksum)))
  {
    errpos= 7;
    goto err;
  }
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("errpos: %d", errpos));
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
3871
3872
3873 /*
  @brief Store new row on its original position
3875
3876 @note
    This is basically a copy of _ma_update_block_record2
3878 When we have a purge thread for deleted row, we can remove this function
3879 and use _ma_update_block_record2 instead.
3880
3881 This is the main reason we don't make a lot of subfunctions that are
3882 common between _ma_update_block_record2() and this function.
3883
3884 Note: If something goes wrong we mark the file crashed
3885 */
3886
static my_bool _ma_update_at_original_place(MARIA_HA *info,
                                            pgcache_page_no_t page,
                                            uint rownr,
                                            uint length_on_head_page,
                                            uint extent_count,
                                            const uchar *extent_info,
                                            const uchar *oldrec,
                                            const uchar *record,
                                            LSN undo_lsn)
{
  MARIA_BITMAP_BLOCKS *blocks;
  MARIA_BITMAP_BLOCK *block;
  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum old_checksum;
  uint org_empty_size, empty_size;
  uint block_size= info->s->block_size;
  uchar *dir, *buff;
  struct st_row_pos_info row_pos;
  my_bool res;
  uint rec_offset, length;
  DBUG_ENTER("_ma_update_at_original_place");

#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
  DBUG_DUMP("newrec", record, share->base.reclength);
#endif

  /*
    Checksums of new and old rows were computed by callers already; new
    row's was put into cur_row, old row's was put into new_row.
  */
  old_checksum= new_row->checksum;
  new_row->checksum= cur_row->checksum;
  calc_record_size(info, record, new_row);

  /* Mark bitmap as being changed; undone (-1) in the err: path below */
  _ma_bitmap_flushable(info, 1);
  /*
    Read and write-lock the head page; register it in pinned_pages so
    _ma_unpin_all_pages_and_finalize_row() can release it later.
  */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, rownr);

  /*
    Sanity check: the old head entry plus the page's free space must be
    able to hold length_on_head_page bytes; if not the file is corrupt.
  */
  if ((org_empty_size + cur_row->head_length) < length_on_head_page)
  {
    DBUG_PRINT("error",
               ("org_empty_size: %u head_length: %u length_on_page: %u",
                org_empty_size, (uint) cur_row->head_length,
                length_on_head_page));
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /*
    We can fit the new row in the same page as the original head part
    of the row
  */
  empty_size= org_empty_size;
  if (extend_area_on_page(info, buff, dir, rownr,
                          length_on_head_page, &empty_size,
                          &rec_offset, &length, 1))
    goto err;

  /* Position info passed to write_block_record() below */
  row_pos.buff= buff;
  row_pos.rownr= rownr;
  row_pos.empty_space= empty_size;
  row_pos.dir= dir;
  row_pos.data= buff + rec_offset;

  /* Delete old row */
  if (*cur_row->tail_positions &&
      delete_tails(info, cur_row->tail_positions))
    goto err;
  if (cur_row->extents_count && free_full_pages(info, cur_row))
    goto err;

  /* Change extent information to be usable by write_block_record() */
  blocks= &cur_row->insert_blocks;
  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
    goto err;
  block= blocks->block;
  block->empty_space= row_pos.empty_space;
  /*
    Remember the head page's original bitmap pattern; if the page has too
    few free directory entries it is treated as having no free space.
  */
  block->org_bitmap_value=
    _ma_free_size_to_head_pattern(&share->bitmap,
                                  (enough_free_entries_on_page(share, buff) ?
                                   org_empty_size : 0));

  DBUG_ASSERT(block->org_bitmap_value ==
              _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
  block->used|= BLOCKUSED_USE_ORG_BITMAP;

  /*
    We have to use <= below as the new_row may be smaller than the original
    row as the new row doesn't have transaction id
  */

  DBUG_ASSERT(blocks->count > 1 ||
              MY_MAX(new_row->total_length, share->base.min_block_length) <=
              length_on_head_page);

  /* Store same amount of data on head page as on original page */
  row_pos.length= (length_on_head_page -
                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
  set_if_bigger(row_pos.length, share->base.min_block_length);
  if ((res= write_block_record(info, oldrec, record, new_row, blocks,
                               1, &row_pos, undo_lsn, old_checksum)))
    goto err;
  DBUG_RETURN(0);

err:
  /* Note: the function header says the file is marked crashed on error */
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_mark_file_crashed(share);
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4012
4013
4014 /* Wrapper for _ma_update_block_record2() used by ma_update() */
4015
_ma_update_block_record(MARIA_HA * info,MARIA_RECORD_POS record_pos,const uchar * orig_rec,const uchar * new_rec)4016 my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
4017 const uchar *orig_rec, const uchar *new_rec)
4018 {
4019 return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
4020 LSN_ERROR);
4021 }
4022
4023
4024 /*
4025 Delete a directory entry
4026
4027 SYNOPSIS
4028 delete_dir_entry()
4029 buff Page buffer
4030 record_number Record number to delete
4031 empty_space Empty space on page after delete
4032
4033 RETURN
4034 -1 Error on page
4035 0 ok
4036 1 Page is now empty
4037 */
4038
static int delete_dir_entry(MARIA_SHARE *share,
                            uchar *buff, uint record_number,
                            uint *empty_space_res)
{
  uint block_size= share->block_size;
  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
  uint length, empty_space;
  uchar *dir;
  DBUG_ENTER("delete_dir_entry");
  DBUG_PRINT("enter", ("record_number: %u number_of_records: %u",
                       record_number, number_of_records));

#ifdef SANITY_CHECKS
  /* Reject record numbers outside the directory or beyond what can fit */
  if (record_number >= number_of_records ||
      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
  {
    DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
                         record_number, number_of_records));

    DBUG_RETURN(-1);
  }
#endif

  check_directory(share, buff, block_size, 0, (uint) -1);
  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, record_number);
  length= uint2korr(dir + 2);  /* Length of entry we just deleted */
  DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size);

  if (record_number == number_of_records - 1)
  {
    /* Delete this entry and all following free directory entries */
    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
    number_of_records--;
    dir+= DIR_ENTRY_SIZE;
    empty_space+= DIR_ENTRY_SIZE;

    /* Unlink and free the next empty ones */
    while (dir < end && dir[0] == 0 && dir[1] == 0)
    {
      /*
        A free entry has offset 0 (dir[0..1]); dir[2] is the previous
        free entry's number (or END_OF_DIR_FREE_LIST) and dir[3] the next.
      */
      number_of_records--;
      if (dir[2] == END_OF_DIR_FREE_LIST)
        buff[DIR_FREE_OFFSET]= dir[3];
      else
      {
        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
                    number_of_records);
        prev_entry[3]= dir[3];
      }
      if (dir[3] != END_OF_DIR_FREE_LIST)
      {
        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
                    number_of_records);
        next_entry[2]= dir[2];
      }
      dir+= DIR_ENTRY_SIZE;
      empty_space+= DIR_ENTRY_SIZE;
    }

    if (number_of_records == 0)
    {
      /* All entries on page deleted */
      DBUG_PRINT("info", ("Page marked as unallocated"));
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
      {
        /* Clear the removed directory entries for byte-identical pages */
        dir= dir_entry_pos(buff, block_size, record_number);
        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
      }
#endif
      *empty_space_res= block_size;
      DBUG_RETURN(1);
    }
    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
  }
  else
  {
    /* Update directory: put the entry first on the page's free list */
    dir[0]= dir[1]= 0;
    dir[2]= END_OF_DIR_FREE_LIST;
    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= record_number;
    }
    buff[DIR_FREE_OFFSET]= record_number;
  }
  empty_space+= length;

  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
  /* Page now has a hole, so it is a candidate for compaction */
  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;

  *empty_space_res= empty_space;

  check_directory(share, buff, block_size, 0, empty_space);
  DBUG_RETURN(0);
}
4142
4143
4144 /*
 4145   Delete a head or tail part
4146
4147 SYNOPSIS
4148 delete_head_or_tail()
4149 info Maria handler
4150 page Page (not file offset!) on which the row is
4151 head 1 if this is a head page
4152 from_update 1 if we are called from update. In this case we
4153 leave the page as write locked as we may put
4154 the new row into the old position.
4155
4156 RETURN
4157 0 ok
4158 1 error
4159 */
4160
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update)
{
  MARIA_SHARE *share= info->s;
  uint empty_space;
  int res;
  my_bool page_is_empty;
  uchar *buff;
  LSN lsn;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_at_write, lock_at_unpin;
  DBUG_ENTER("delete_head_or_tail");
  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
                       (ulong) ma_recordpos(page, record_number),
                       (ulong) page, record_number));

  /* Read and write-lock the page; its pinned-page entry is updated below */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    DBUG_RETURN(1);
  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              (head ? HEAD_PAGE : TAIL_PAGE));

  if (from_update)
  {
    /* Keep the write lock; caller may put the new row in the old place */
    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
  }
  else
  {
    /* Downgrade the write lock to a read lock after the change is done */
    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
  }

  res= delete_dir_entry(share, buff, record_number, &empty_space);
  if (res < 0)
    DBUG_RETURN(1);
  if (res == 0) /* after our deletion, page is still not empty */
  {
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    page_is_empty= 0;
    if (share->now_transactional)
    {
      /* Log REDO data */
      page_store(log_data + FILEID_STORE_SIZE, page);
      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                   record_number);

      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
                                        LOGREC_REDO_PURGE_ROW_TAIL),
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
  }
  else /* page is now empty */
  {
    page_is_empty= 1;
    if (share->now_transactional)
    {
      /* Log REDO that the whole head/tail page was freed */
      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
      page_store(log_data + FILEID_STORE_SIZE, page);
      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
    /*
      Mark that this page must be written to disk by page cache, even
      if we could call pagecache_delete() on it.
      This is needed to ensure that repair finds the empty page on disk
      and not old data.
    */
    pagecache_set_write_on_delete_by_link(page_link.link);
    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
  }

  /* Release/downgrade the write lock per the decision made above */
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           lock_at_write,
                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 1, FALSE);
  page_link.unlock= lock_at_unpin;
  set_dynamic(&info->pinned_pages, (void*) &page_link,
              info->pinned_pages.elements-1);

  DBUG_PRINT("info", ("empty_space: %u", empty_space));

  /*
    If there is not enough space for all possible tails, mark the
    page full
  */
  if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size,
                                                      1 + share->base.blobs))
    empty_space= 0;

  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
}
4274
4275
4276 /*
4277 delete all tails
4278
4279 SYNOPSIS
4280 delete_tails()
4281 info Handler
4282 tails Pointer to vector of tail positions, ending with 0
4283
4284 RETURN
4285 0 ok
4286 1 error
4287 */
4288
delete_tails(MARIA_HA * info,MARIA_RECORD_POS * tails)4289 static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
4290 {
4291 my_bool res= 0;
4292 DBUG_ENTER("delete_tails");
4293 for (; *tails; tails++)
4294 {
4295 if (delete_head_or_tail(info,
4296 ma_recordpos_to_page(*tails),
4297 ma_recordpos_to_dir_entry(*tails), 0, 1))
4298 res= 1;
4299 }
4300 DBUG_RETURN(res);
4301 }
4302
4303
4304 /*
4305 Delete a record
4306
4307 NOTES
4308 For the moment, we assume that info->cur_row.extents is always updated
4309 when a row is read. In the future we may decide to read this on demand
4310 for rows with many splits.
4311 */
4312
my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
{
  pgcache_page_no_t page;
  uint record_number;
  MARIA_SHARE *share= info->s;
  LSN lsn= LSN_IMPOSSIBLE;
  DBUG_ENTER("_ma_delete_block_record");

  /* Decompose the rowid of the current row into page + directory entry */
  page= ma_recordpos_to_page(info->cur_row.lastpos);
  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

  /* Mark bitmap as being changed; undone (-1) before returning */
  _ma_bitmap_flushable(info, 1);
  /* Delete the head entry and then all of the row's tail entries */
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  if (share->now_transactional)
  {
    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                   HA_CHECKSUM_STORE_SIZE];
    uchar *log_pos;
    size_t row_length;
    uint row_parts_count, extents_length;
    ha_checksum checksum_delta;

    /* Write UNDO record */
    lsn_store(log_data, info->trn->undo_lsn);
    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
    dirpos_store(log_pos, record_number);
    log_pos+= DIRPOS_STORE_SIZE;
    /* Head length excluding the row header */
    int2store(log_pos, info->cur_row.head_length -
              info->cur_row.header_length);
    log_pos+= 2;
    pagerange_store(log_pos, info->cur_row.extents_count);
    log_pos+= PAGERANGE_STORE_SIZE;

    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
    /* Checksum delta is the negated row checksum (row is going away) */
    store_checksum_in_rec(share, checksum_delta,
                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                              0].length);
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
      info->cur_row.extents;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

    /* Add the row data itself so the delete can be undone */
    row_length= fill_insert_undo_parts(info, record,
                                       (info->log_row_parts +
                                        TRANSLOG_INTERNAL_PARTS + 2),
                                       &row_parts_count);

    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
                              info,
                              (translog_size_t)
                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                                   0].length + row_length +
                               extents_length),
                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
                              info->log_row_parts,
                              log_data + LSN_STORE_SIZE,
                              &checksum_delta))
      goto err;
  }

  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4396
4397
4398 /****************************************************************************
4399 Reading of records
4400 ****************************************************************************/
4401
4402 /*
4403 Read position to record from record directory at end of page
4404
4405 SYNOPSIS
4406 get_record_position()
4407 buff page buffer
4408 block_size block size for page
4409 record_number Record number in index
4410 end_of_data pointer to end of data for record
4411
4412 RETURN
4413 0 Error in data
4414 # Pointer to start of record.
4415 In this case *end_of_data is set.
4416 */
4417
get_record_position(MARIA_SHARE * share,uchar * buff,uint record_number,uchar ** end_of_data)4418 static uchar *get_record_position(MARIA_SHARE *share, uchar *buff,
4419 uint record_number, uchar **end_of_data)
4420 {
4421 uint block_size= share->block_size;
4422 uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
4423 uchar *dir;
4424 uchar *data;
4425 uint offset, length;
4426
4427 #ifdef SANITY_CHECKS
4428 if (record_number >= number_of_records ||
4429 record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE)
4430 / DIR_ENTRY_SIZE))
4431 {
4432 DBUG_PRINT("error",
4433 ("Wrong row number: record_number: %u number_of_records: %u",
4434 record_number, number_of_records));
4435 return 0;
4436 }
4437 #endif
4438
4439 dir= dir_entry_pos(buff, block_size, record_number);
4440 offset= uint2korr(dir);
4441 length= uint2korr(dir + 2);
4442 #ifdef SANITY_CHECKS
4443 if (offset < PAGE_HEADER_SIZE(share) ||
4444 offset + length > (block_size -
4445 number_of_records * DIR_ENTRY_SIZE -
4446 PAGE_SUFFIX_SIZE))
4447 {
4448 DBUG_PRINT("error",
4449 ("Wrong row position: record_number: %u offset: %u "
4450 "length: %u number_of_records: %u",
4451 record_number, offset, length, number_of_records));
4452 return 0;
4453 }
4454 #endif
4455 data= buff + offset;
4456 *end_of_data= data + length;
4457 return data;
4458 }
4459
4460
4461 /*
4462 Init extent
4463
4464 NOTES
4465 extent is a cursor over which pages to read
4466 */
4467
/*
  Initialize an extent cursor from the first packed extent in extent_info.
  The cursor is used to iterate over the pages a row is split over.
*/

static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
                        uint extents, MARIA_RECORD_POS *tail_positions)
{
  uint count_field;

  extent->extent= extent_info;
  extent->extent_count= extents;
  extent->tail_positions= tail_positions;
  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;

  /* Unpack first extent: page number + page count (with flag bits) */
  extent->page= page_korr(extent_info);
  count_field= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
                ~START_EXTENT_BIT);
  extent->tail= count_field & TAIL_BIT;
  if (!extent->tail)
    extent->page_count= count_field;
  else
  {
    /* A tail extent covers one page; low bits hold the row number */
    extent->page_count= 1;
    extent->tail_row_nr= count_field & ~TAIL_BIT;
  }
}
4488
4489
4490 /*
4491 Read next extent
4492
4493 SYNOPSIS
4494 read_next_extent()
4495 info Maria handler
4496 extent Pointer to current extent (this is updated to point
4497 to next)
4498 end_of_data Pointer to end of data in read block (out)
4499
4500 NOTES
4501 New block is read into info->buff
4502
4503 RETURN
4504 0 Error; my_errno is set
4505 # Pointer to start of data in read block
4506 In this case end_of_data is updated to point to end of data.
4507 */
4508
static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                               uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent");

  if (!extent->page_count)
  {
    /* Current extent exhausted; advance to the next packed extent */
    uint page_count;
    if (!--extent->extent_count)
      goto crashed;
    extent->extent+= ROW_EXTENT_SIZE;
    extent->page= page_korr(extent->extent);
    page_count= (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                 ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;
    extent->tail= page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info",("New extent.  Page: %lu  page_count: %u  tail_flag: %d",
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  /* Tail pages may need a real lock (set by caller for UNDO reads) */
  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);
  }

  if (!extent->tail)
  {
    /* Full data page */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                      /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;     /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Record the tail's rowid so delete/update can find it later */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;            /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                 /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error", ("wrong extent information"));
  DBUG_RETURN(0);
}
4598
4599
4600 /*
4601 Read data that may be split over many blocks
4602
4603 SYNOPSIS
4604 read_long_data()
4605 info Maria handler
4606 to Store result string here (this is allocated)
4607 extent Pointer to current extent position
4608 data Current position in buffer
4609 end_of_data End of data in buffer
4610
4611 NOTES
4612 When we have to read a new buffer, it's read into info->buff
4613
4614 This loop is implemented by goto's instead of a for() loop as
 4615     the code is notably smaller and faster this way (and it's not nice
4616 to jump into a for loop() or into a 'then' clause)
4617
4618 RETURN
4619 0 ok
4620 1 error
4621 */
4622
static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
                               MARIA_EXTENT_CURSOR *extent,
                               uchar **data, uchar **end_of_data)
{
  uint left_length= (uint) (*end_of_data - *data);
  DBUG_ENTER("read_long_data2");
  DBUG_PRINT("enter", ("length: %lu left_length: %u",
                       length, left_length));
  DBUG_ASSERT(*data <= *end_of_data);

  /*
    Fields are never split in middle. This means that if length > rest-of-data
    we should start reading from the next extent.  The reason we may have
    data left on the page is that if the fixed part of the row was less than
    min_block_length the head block was extended to min_block_length.

    This may change in the future, which is why we have the loop written
    the way it's written.
  */
  if (extent->first_extent && length > left_length)
  {
    /* Skip the rest of the head block; field continues in next extent */
    *end_of_data= *data;
    left_length= 0;
  }

  for(;;)
  {
    if (unlikely(left_length >= length))
    {
      /* Remainder fits in the current block; copy and we are done */
      memcpy(to, *data, length);
      (*data)+= length;
      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
      DBUG_RETURN(0);
    }
    /* Copy what the current block holds, then move to the next extent */
    memcpy(to, *data, left_length);
    to+= left_length;
    length-= left_length;
    if (!(*data= read_next_extent(info, extent, end_of_data)))
      break;
    left_length= (uint) (*end_of_data - *data);
  }
  DBUG_RETURN(1);
}
4666
read_long_data(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4667 static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
4668 MARIA_EXTENT_CURSOR *extent,
4669 uchar **data, uchar **end_of_data)
4670 {
4671 uint left_length= (uint) (*end_of_data - *data);
4672 if (likely(left_length >= length))
4673 {
4674 memcpy(to, *data, length);
4675 (*data)+= length;
4676 return 0;
4677 }
4678 return read_long_data2(info, to, length, extent, data, end_of_data);
4679 }
4680
4681
4682 /*
4683 Read a record from page (helper function for _ma_read_block_record())
4684
4685 SYNOPSIS
4686 _ma_read_block_record2()
4687 info Maria handler
4688 record Store record here
4689 data Start of head data for row
4690 end_of_data End of data for row
4691
4692 NOTES
4693 The head page is already read by caller
4694 Following data is update in info->cur_row:
4695
4696 cur_row.head_length is set to size of entry in head block
4697 cur_row.tail_positions is set to point to all tail blocks
4698 cur_row.extents points to extents data
4699 cur_row.extents_counts contains number of extents
4700 cur_row.empty_bits is set to empty bits
4701 cur_row.field_lengths contains packed length of all fields
4702 cur_row.blob_length contains total length of all blobs
4703 cur_row.checksum contains checksum of read record.
4704
4705 RETURN
4706 0 ok
4707 # Error code
4708 */
4709
_ma_read_block_record2(MARIA_HA * info,uchar * record,uchar * data,uchar * end_of_data)4710 int _ma_read_block_record2(MARIA_HA *info, uchar *record,
4711 uchar *data, uchar *end_of_data)
4712 {
4713 MARIA_SHARE *share= info->s;
4714 uchar *UNINIT_VAR(field_length_data), *UNINIT_VAR(blob_buffer), *start_of_data;
4715 uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
4716 my_bool found_blob= 0;
4717 MARIA_EXTENT_CURSOR extent;
4718 MARIA_COLUMNDEF *column, *end_column;
4719 MARIA_ROW *cur_row= &info->cur_row;
4720 DBUG_ENTER("_ma_read_block_record2");
4721
4722 start_of_data= data;
4723 flag= (uint) (uchar) data[0];
4724 cur_null_bytes= share->base.original_null_bytes;
4725 null_bytes= share->base.null_bytes;
4726 cur_row->head_length= (uint) (end_of_data - data);
4727 cur_row->full_page_count= cur_row->tail_count= 0;
4728 cur_row->blob_length= 0;
4729 /* Number of bytes in header that we don't need to write during undo */
4730 cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
4731
4732 if (flag & ROW_FLAG_TRANSID)
4733 {
4734 cur_row->trid= transid_korr(data+1);
4735 if (!info->trn)
4736 {
4737 /* File crashed */
4738 DBUG_ASSERT(!maria_assert_if_crashed_table);
4739 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
4740 DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
4741 }
4742 if (!trnman_can_read_from(info->trn, cur_row->trid))
4743 DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
4744 }
4745
4746 /* Skip trans header (for now, until we have MVCC csupport) */
4747 data+= cur_row->header_length + 1 ;
4748 if (flag & ROW_FLAG_NULLS_EXTENDED)
4749 cur_null_bytes+= data[-1];
4750
4751 row_extents= 0;
4752 if (flag & ROW_FLAG_EXTENTS)
4753 {
4754 uint row_extent_size;
4755 /*
4756 Record is split over many data pages.
4757 Get number of extents and first extent
4758 */
4759 get_key_length(row_extents, data);
4760 cur_row->extents_count= row_extents;
4761 row_extent_size= row_extents * ROW_EXTENT_SIZE;
4762 if (cur_row->extents_buffer_length < row_extent_size &&
4763 _ma_alloc_buffer(&cur_row->extents,
4764 &cur_row->extents_buffer_length,
4765 row_extent_size))
4766 DBUG_RETURN(my_errno);
4767 memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
4768 data+= ROW_EXTENT_SIZE;
4769 init_extent(&extent, cur_row->extents, row_extents,
4770 cur_row->tail_positions);
4771 }
4772 else
4773 {
4774 cur_row->extents_count= 0;
4775 (*cur_row->tail_positions)= 0;
4776 extent.page_count= 0;
4777 extent.extent_count= 1;
4778 }
4779 extent.first_extent= 1;
4780
4781 field_lengths= 0;
4782 if (share->base.max_field_lengths)
4783 {
4784 get_key_length(field_lengths, data);
4785 cur_row->field_lengths_length= field_lengths;
4786 #ifdef SANITY_CHECKS
4787 if (field_lengths > share->base.max_field_lengths)
4788 goto err;
4789 #endif
4790 }
4791
4792 if (share->calc_checksum)
4793 cur_row->checksum= (uint) (uchar) *data++;
4794 /* data now points on null bits */
4795 memcpy(record, data, cur_null_bytes);
4796 if (unlikely(cur_null_bytes != null_bytes))
4797 {
4798 /*
4799 This only happens if we have added more NULL columns with
4800 ALTER TABLE and are fetching an old, not yet modified old row
4801 */
4802 bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
4803 }
4804 data+= null_bytes;
4805 /* We copy the empty bits to be able to use them for delete/update */
4806 memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
4807 data+= share->base.pack_bytes;
4808
4809 /* TODO: Use field offsets, instead of just skipping them */
4810 data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4811
4812 /*
4813 Read row extents (note that first extent was already read into
4814 cur_row->extents above)
4815 */
4816 if (row_extents > 1)
4817 {
4818 if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
4819 (row_extents - 1) * ROW_EXTENT_SIZE,
4820 &extent, &data, &end_of_data))
4821 DBUG_RETURN(my_errno);
4822 }
4823
4824 /*
4825 Data now points to start of fixed length field data that can't be null
4826 or 'empty'. Note that these fields can't be split over blocks.
4827 */
4828 for (column= share->columndef,
4829 end_column= column + share->base.fixed_not_null_fields;
4830 column < end_column; column++)
4831 {
4832 uint column_length= column->length;
4833 if (data + column_length > end_of_data &&
4834 !(data= read_next_extent(info, &extent, &end_of_data)))
4835 goto err;
4836 memcpy(record + column->offset, data, column_length);
4837 data+= column_length;
4838 }
4839
4840 /* Read array of field lengths. This may be stored in several extents */
4841 if (field_lengths)
4842 {
4843 field_length_data= cur_row->field_lengths;
4844 if (read_long_data(info, field_length_data, field_lengths, &extent,
4845 &data, &end_of_data))
4846 DBUG_RETURN(my_errno);
4847 }
4848
4849 /* Read variable length data. Each of these may be split over many extents */
4850 for (end_column= share->columndef + share->base.fields;
4851 column < end_column; column++)
4852 {
4853 enum en_fieldtype type= column->type;
4854 uchar *field_pos= record + column->offset;
4855 /* First check if field is present in record */
4856 if ((record[column->null_pos] & column->null_bit) ||
4857 (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
4858 {
4859 bfill(record + column->offset, column->fill_length,
4860 type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
4861 continue;
4862 }
4863 switch (type) {
4864 case FIELD_NORMAL: /* Fixed length field */
4865 case FIELD_SKIP_PRESPACE:
4866 case FIELD_SKIP_ZERO: /* Fixed length field */
4867 if (data + column->length > end_of_data &&
4868 !(data= read_next_extent(info, &extent, &end_of_data)))
4869 goto err;
4870 memcpy(field_pos, data, column->length);
4871 data+= column->length;
4872 break;
4873 case FIELD_SKIP_ENDSPACE: /* CHAR */
4874 {
4875 /* Char that is space filled */
4876 uint length;
4877 if (column->length <= 255)
4878 length= (uint) (uchar) *field_length_data++;
4879 else
4880 {
4881 length= uint2korr(field_length_data);
4882 field_length_data+= 2;
4883 }
4884 #ifdef SANITY_CHECKS
4885 if (length > column->length)
4886 goto err;
4887 #endif
4888 if (read_long_data(info, field_pos, length, &extent, &data,
4889 &end_of_data))
4890 DBUG_RETURN(my_errno);
4891 bfill(field_pos + length, column->length - length, ' ');
4892 break;
4893 }
4894 case FIELD_VARCHAR:
4895 {
4896 ulong length;
4897 if (column->length <= 256)
4898 {
4899 length= (uint) (uchar) (*field_pos++= *field_length_data++);
4900 }
4901 else
4902 {
4903 length= uint2korr(field_length_data);
4904 field_pos[0]= field_length_data[0];
4905 field_pos[1]= field_length_data[1];
4906 field_pos+= 2;
4907 field_length_data+= 2;
4908 }
4909 #ifdef SANITY_CHECKS
4910 if (length > column->length)
4911 goto err;
4912 #endif
4913 if (read_long_data(info, field_pos, length, &extent, &data,
4914 &end_of_data))
4915 DBUG_RETURN(my_errno);
4916 break;
4917 }
4918 case FIELD_BLOB:
4919 {
4920 uint column_size_length= column->length - portable_sizeof_char_ptr;
4921 ulong blob_length= _ma_calc_blob_length(column_size_length,
4922 field_length_data);
4923
4924 if (!found_blob)
4925 {
4926 /* Calculate total length for all blobs */
4927 ulong blob_lengths= 0;
4928 uchar *length_data= field_length_data;
4929 MARIA_COLUMNDEF *blob_field= column;
4930
4931 found_blob= 1;
4932 for (; blob_field < end_column; blob_field++)
4933 {
4934 uint size_length;
4935 if ((record[blob_field->null_pos] & blob_field->null_bit) ||
4936 (cur_row->empty_bits[blob_field->empty_pos] &
4937 blob_field->empty_bit))
4938 continue;
4939 size_length= blob_field->length - portable_sizeof_char_ptr;
4940 blob_lengths+= _ma_calc_blob_length(size_length, length_data);
4941 length_data+= size_length;
4942 }
4943 cur_row->blob_length= blob_lengths;
4944 DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
4945 if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
4946 blob_lengths))
4947 DBUG_RETURN(my_errno);
4948 blob_buffer= info->blob_buff;
4949 }
4950
4951 memcpy(field_pos, field_length_data, column_size_length);
4952 memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
4953 sizeof(char*));
4954 field_length_data+= column_size_length;
4955
4956 /*
4957 After we have read one extent, then each blob is in it's own extent
4958 */
4959 if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
4960 end_of_data= data; /* Force read of next extent */
4961
4962 if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
4963 &end_of_data))
4964 DBUG_RETURN(my_errno);
4965 blob_buffer+= blob_length;
4966 break;
4967 }
4968 default:
4969 #ifdef EXTRA_DEBUG
4970 DBUG_ASSERT(0); /* purecov: deadcode */
4971 #endif
4972 goto err;
4973 }
4974 continue;
4975 }
4976
4977 if (row_extents)
4978 {
4979 DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
4980 extent.page_count, extent.extent_count));
4981 *extent.tail_positions= 0; /* End marker */
4982 if (extent.page_count)
4983 goto err;
4984 if (extent.extent_count > 1)
4985 {
4986 if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
4987 (extent.extent_count-1) * ROW_EXTENT_SIZE))
4988 {
4989 DBUG_PRINT("error", ("Data in extent is not zero"));
4990 DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
4991 (extent.extent_count-1) * ROW_EXTENT_SIZE);
4992 goto err;
4993 }
4994 }
4995 }
4996 else
4997 {
4998 DBUG_PRINT("info", ("Row read"));
4999 /*
      data should normally point to end_of_data. The only exception is if
      the row is very short, in which case we allocated 'min_block_length'
      bytes of data to allow the row to expand.
5003 */
5004 if (data != end_of_data && (uint) (end_of_data - start_of_data) >
5005 share->base.min_block_length)
5006 goto err;
5007 }
5008 #ifdef EXTRA_DEBUG
5009 if (share->calc_checksum && !info->in_check_table)
5010 {
    /* Ensure that row checksum is correct */
5012 DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
5013 cur_row->checksum);
5014 }
5015 #endif
5016 info->update|= HA_STATE_AKTIV; /* We have an active record */
5017 DBUG_RETURN(0);
5018
5019 err:
5020 DBUG_ASSERT(!maria_assert_if_crashed_table);
5021 /* Something was wrong with data on record */
5022 DBUG_PRINT("error", ("Found record with wrong data"));
5023 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
5024 DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
5025 }
5026
5027
5028 /** @brief Read positions to tail blocks and full blocks
5029
5030 @fn read_row_extent_info()
5031 @param info Handler
5032
5033 @notes
5034 This function is a simpler version of _ma_read_block_record2()
5035 The data about the used pages is stored in info->cur_row.
5036
5037 @return Status
5038 @retval 0 ok
5039 @retval 1 Error. my_errno contains error number
5040 */
5041
static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  /* Only read to advance 'data' past the length array; value itself unused */
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  DBUG_ENTER("read_row_extent_info");

  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size))
      DBUG_RETURN(1);
    /* Copy first extent; remaining extents are read with read_long_data() */
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths looks unused, but get_key_length() increments 'data',
    which is required as 'data' is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, pack bits and field offsets to reach the extents */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    pgcache_page_no_t page= uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    /* Tail extents are flagged with TAIL_BIT in the stored page count */
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                                 /* End marker */
  DBUG_RETURN(0);
}
5127
5128
5129 /*
5130 Read a record based on record position
5131
5132 @fn _ma_read_block_record()
5133 @param info Maria handler
5134 @param record Store record here
5135 @param record_pos Record position
5136
5137 @return Status
5138 @retval 0 ok
5139 @retval # Error number
5140 */
5141
int _ma_read_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos)
{
  MARIA_SHARE *share= info->s;
  uchar *page_buff, *data, *end_of_data;
  uint dir_entry;
  int res;
  DBUG_ENTER("_ma_read_block_record");
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) record_pos,
                       (ulong) ma_recordpos_to_page(record_pos),
                       ma_recordpos_to_dir_entry(record_pos)));

  dir_entry= ma_recordpos_to_dir_entry(record_pos);

  /* Fetch the head page that holds this row */
  page_buff= pagecache_read(share->pagecache,
                            &info->dfile,
                            ma_recordpos_to_page(record_pos), 0,
                            info->buff, share->page_type,
                            PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
  if (!page_buff)
    DBUG_RETURN(my_errno);

  /*
    Unallocated page access can happen if this is an access to a page where
    all rows where deleted as part of this statement.
  */
  DBUG_ASSERT((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE ||
              (page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              UNALLOCATED_PAGE);

  if (((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == UNALLOCATED_PAGE) ||
      !(data= get_record_position(share, page_buff, dir_entry, &end_of_data)))
  {
    DBUG_ASSERT(!maria_assert_if_crashed_table);
    DBUG_PRINT("warning", ("Wrong directory entry in data block"));
    my_errno= HA_ERR_RECORD_DELETED;            /* File crashed */
    DBUG_RETURN(HA_ERR_RECORD_DELETED);
  }
  res= _ma_read_block_record2(info, record, data, end_of_data);
  DBUG_RETURN(res);
}
5181
5182
5183 /* compare unique constraint between stored rows */
5184
_ma_cmp_block_unique(MARIA_HA * info,MARIA_UNIQUEDEF * def,const uchar * record,MARIA_RECORD_POS pos)5185 my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
5186 const uchar *record, MARIA_RECORD_POS pos)
5187 {
5188 uchar *org_rec_buff, *old_record;
5189 size_t org_rec_buff_size;
5190 int error;
5191 DBUG_ENTER("_ma_cmp_block_unique");
5192
5193 /*
5194 Don't allocate more than 16K on the stack to ensure we don't get
5195 stack overflow.
5196 */
5197 if (!(old_record= my_safe_alloca(info->s->base.reclength)))
5198 DBUG_RETURN(1);
5199
5200 /* Don't let the compare destroy blobs that may be in use */
5201 org_rec_buff= info->rec_buff;
5202 org_rec_buff_size= info->rec_buff_size;
5203 if (info->s->base.blobs)
5204 {
5205 /* Force realloc of record buffer*/
5206 info->rec_buff= 0;
5207 info->rec_buff_size= 0;
5208 }
5209 error= _ma_read_block_record(info, old_record, pos);
5210 if (!error)
5211 error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
5212 if (info->s->base.blobs)
5213 {
5214 my_free(info->rec_buff);
5215 info->rec_buff= org_rec_buff;
5216 info->rec_buff_size= org_rec_buff_size;
5217 }
5218 DBUG_PRINT("exit", ("result: %d", error));
5219 my_safe_afree(old_record, info->s->base.reclength);
5220 DBUG_RETURN(error != 0);
5221 }
5222
5223
5224 /****************************************************************************
5225 Table scan
5226 ****************************************************************************/
5227
5228 /*
5229 Allocate buffers for table scan
5230
5231 SYNOPSIS
5232 _ma_scan_init_block_record(MARIA_HA *info)
5233
5234 IMPLEMENTATION
5235 We allocate one buffer for the current bitmap and one buffer for the
5236 current page
5237
5238 RETURN
5239 0 ok
5240 1 error (couldn't allocate memory or disk error)
5241 */
5242
_ma_scan_init_block_record(MARIA_HA * info)5243 my_bool _ma_scan_init_block_record(MARIA_HA *info)
5244 {
5245 MARIA_SHARE *share= info->s;
5246 DBUG_ENTER("_ma_scan_init_block_record");
5247 DBUG_ASSERT(info->dfile.file == share->bitmap.file.file);
5248
5249 /*
5250 bitmap_buff may already be allocated if this is the second call to
5251 rnd_init() without a rnd_end() in between, see sql/handler.h
5252 */
5253 if (!(info->scan.bitmap_buff ||
5254 ((info->scan.bitmap_buff=
5255 (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME))))))
5256 DBUG_RETURN(1);
5257 info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
5258 info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size;
5259
5260 /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
5261 info->scan.number_of_rows= 0;
5262 info->scan.bitmap_pos= info->scan.bitmap_end;
5263 info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
5264 info->scan.max_page= share->state.state.data_file_length / share->block_size;
5265 /*
5266 We need to flush what's in memory (bitmap.map) to page cache otherwise, as
5267 we are going to read bitmaps from page cache in table scan (see
5268 _ma_scan_block_record()), we may miss recently inserted rows (bitmap page
5269 in page cache would be too old).
5270 */
5271 DBUG_RETURN(_ma_bitmap_flush(info->s));
5272 }
5273
5274
/* Free buffers allocated by _ma_scan_init_block_record() */
5276
void _ma_scan_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_end_block_record");
  /*
    my_free() is a no-op on NULL pointers, so no guards are needed; the
    pointers are reset so a later _ma_scan_init_block_record()/
    _ma_scan_remember_block_record() will reallocate them.
  */
  my_free(info->scan.bitmap_buff);
  info->scan.bitmap_buff= 0;
  my_free(info->scan_save);
  info->scan_save= 0;
  DBUG_VOID_RETURN;
}
5289
5290
5291 /**
5292 @brief Save current scan position
5293
5294 @note
5295 For the moment we can only remember one position, but this is
5296 good enough for MySQL usage
5297
5298 @return
5299 @retval 0 ok
5300 @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position
5301 */
5302
int _ma_scan_remember_block_record(MARIA_HA *info,
                                   MARIA_RECORD_POS *lastpos)
{
  uchar *bitmap_buff;
  DBUG_ENTER("_ma_scan_remember_block_record");
  /* Allocate save area once: header plus copies of bitmap and page buffers */
  if (!(info->scan_save))
  {
    if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) +
                                     info->s->block_size * 2,
                                     MYF(MY_WME))))
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    /* Buffer copies live directly after the aligned header in same alloc */
    info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
                                   ALIGN_SIZE(sizeof(*info->scan_save)));
  }
  /* For checking if pages have changed since we last read it */
  info->scan.row_changes= info->row_changes;

  /* Remember used bitmap and used head page */
  bitmap_buff= info->scan_save->bitmap_buff;
  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
  /* The memcpy above overwrote the saved buffer pointer; put it back */
  info->scan_save->bitmap_buff= bitmap_buff;
  memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2);

  /* Point to the last read row */
  *lastpos= info->cur_row.nextpos - 1;
  /*
    Step the saved directory pointer back one entry (the directory grows
    downwards, so += moves to the previous row) so that the restored scan
    continues from the row at 'lastpos'.
  */
  info->scan_save->dir+= DIR_ENTRY_SIZE;
  DBUG_RETURN(0);
}
5331
5332
5333 /**
  @brief restore scan block to its original values
5335
5336 @return
5337 0 ok
5338 # error
5339
5340 @note
5341 In theory we could swap bitmap buffers instead of copy them.
5342 For the moment we don't do that because there are variables pointing
5343 inside the buffers and it's a bit of hassle to either make them relative
5344 or repoint them.
5345
5346 If the data file has changed, we will re-read the new block record
5347 to ensure that when we continue scanning we can ignore any deleted rows.
5348 */
5349
int _ma_scan_restore_block_record(MARIA_HA *info,
                                  MARIA_RECORD_POS lastpos)
{
  uchar *bitmap_buff;
  DBUG_ENTER("_ma_scan_restore_block_record");

  info->cur_row.nextpos= lastpos;
  /* Restore scan state; keep our own bitmap buffer, copy saved contents */
  bitmap_buff= info->scan.bitmap_buff;
  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
  info->scan.bitmap_buff= bitmap_buff;
  memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);

  if (info->scan.row_changes != info->row_changes)
  {
    /*
      Table has been changed. We have to re-read the current page block as
      data may have changed on it that we have to see.
    */
    if (!(pagecache_read(info->s->pagecache,
                         &info->dfile,
                         ma_recordpos_to_page(info->scan.row_base_page),
                         0, info->scan.page_buff,
                         info->s->page_type,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
      DBUG_RETURN(my_errno);
    /* Recompute row count and directory end from the freshly read page */
    info->scan.number_of_rows=
      (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET];
    info->scan.dir_end= (info->scan.page_buff + info->s->block_size -
                         PAGE_SUFFIX_SIZE -
                         info->scan.number_of_rows * DIR_ENTRY_SIZE);
  }
  DBUG_RETURN(0);
}
5383
5384
5385 /*
5386 Read next record while scanning table
5387
5388 SYNOPSIS
5389 _ma_scan_block_record()
5390 info Maria handler
5391 record Store found here
5392 record_pos Value stored in info->cur_row.next_pos after last call
5393 This is offset inside the current pagebuff
5394 skip_deleted
5395
5396 NOTES
5397 - One must have called mi_scan() before this
    - In this version, we don't actually need record_pos; we could as easily
      use a variable in info->scan
5400
5401 IMPLEMENTATION
5402 Current code uses a lot of goto's to separate the different kind of
5403 states we may be in. This gives us a minimum of executed if's for
5404 the normal cases. I tried several different ways to code this, but
5405 the current one was in the end the most readable and fastest.
5406
5407 RETURN
5408 0 ok
5409 # Error code (Normally HA_ERR_END_OF_FILE)
5410 */
5411
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record");

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /* Ensure that scan.dir and record_pos are in sync */
    DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff,
                                                share->block_size,
                                                (uint) record_pos));

    /* Search for a valid directory entry (not 0) */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      /* Deleted slot (offset 0); step to the next directory entry */
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        goto err;
      }
#endif
    }
    /*
      This should always be true as the directory should always start with
      a valid entry.
    */
    DBUG_ASSERT(info->scan.dir >= info->scan.dir_end);

    /* found row */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to next row to process */
#ifdef SANITY_CHECKS
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE(share) ||
        length < share->base.min_block_length)
    {
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share)));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* Rows not visible to this scan are silently skipped */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data= info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos= info->scan.bit_pos;

    do
    {
      while (likely(bits))
      {
        /* Each page is described by a 3 bit pattern in the bitmap */
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /* 6 bytes (16 patterns of 3 bits) of bitmap data cover 16 pages */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info", ("Found end of file"));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning",
                       ("Found page of type %d when expecting head page",
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            continue;
          }
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error", ("Wrong page header"));
            _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
            DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
          }
          DBUG_PRINT("info", ("Page %lu has %u rows",
                              (ulong) page, info->scan.number_of_rows));
          /* Directory entries sit at end of page and grow downwards */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != 07777777777777777LL)
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  if (unlikely(info->scan.bitmap_page >= info->scan.max_page))
  {
    DBUG_PRINT("info", ("Found end of file"));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info", ("Reading bitmap at %lu",
                      (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /* Skip scanning 'bits' in bitmap scan code */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("Wrong data on page"));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5582
5583
5584 /*
5585 Compare a row against a stored one
5586
5587 NOTES
5588 Not implemented, as block record is not supposed to be used in a shared
5589 global environment
5590 */
5591
my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
                                 const uchar *record __attribute__ ((unused)))
{
  /* Intentionally not implemented (see function comment); always succeeds */
  return 0;
}
5597
5598
5599 /*
5600 Store an integer with simple packing
5601
5602 SYNOPSIS
    ma_store_length()
5604 to Store the packed integer here
5605 nr Integer to store
5606
5607 NOTES
5608 This is mostly used to store field numbers and lengths of strings.
    We have to cast the result for the LL() because of a bug in Forte CC
    compiler.
5611
5612 Packing used is:
5613 nr < 251 is stored as is (in 1 byte)
5614 Numbers that require 1-4 bytes are stored as char(250+byte_length), data
5615 Bigger numbers are stored as 255, data as ulonglong (not yet done).
5616
5617 RETURN
5618 Position in 'to' after the packed length
5619 */
5620
uchar *ma_store_length(uchar *to, ulong nr)
{
  /* Most common case: small numbers are stored as-is in a single byte */
  if (nr < 251)
  {
    to[0]= (uchar) nr;
    return to + 1;
  }
  if (nr <= 255)
  {
    /* One data byte, prefixed with length marker 251 */
    to[0]= (uchar) 251;
    to[1]= (uchar) nr;
    return to + 2;
  }
  if (nr < 65536)
  {
    /* Two data bytes, prefixed with length marker 252 */
    to[0]= (uchar) 252;
    int2store(to + 1, nr);
    return to + 3;
  }
  if (nr < 16777216)
  {
    /* Three data bytes, prefixed with length marker 253 */
    to[0]= (uchar) 253;
    int3store(to + 1, nr);
    return to + 4;
  }
  /* Four data bytes, prefixed with length marker 254 */
  to[0]= (uchar) 254;
  int4store(to + 1, nr);
  return to + 5;
}
5650
5651
5652 /* Calculate how many bytes needed to store a number */
5653
ma_calc_length_for_store_length(ulong nr)5654 uint ma_calc_length_for_store_length(ulong nr)
5655 {
5656 if (nr < 251)
5657 return 1;
5658 if (nr < 65536)
5659 {
5660 if (nr <= 255)
5661 return 2;
5662 return 3;
5663 }
5664 if (nr < 16777216)
5665 return 4;
5666 return 5;
5667 }
5668
5669
/* Retrieve a stored number */
5671
ma_get_length(const uchar ** packet)5672 static ulong ma_get_length(const uchar **packet)
5673 {
5674 reg1 const uchar *pos= *packet;
5675 if (*pos < 251)
5676 {
5677 (*packet)++;
5678 return (ulong) *pos;
5679 }
5680 if (*pos == 251)
5681 {
5682 (*packet)+= 2;
5683 return (ulong) pos[1];
5684 }
5685 if (*pos == 252)
5686 {
5687 (*packet)+= 3;
5688 return (ulong) uint2korr(pos+1);
5689 }
5690 if (*pos == 253)
5691 {
5692 (*packet)+= 4;
5693 return (ulong) uint3korr(pos+1);
5694 }
5695 DBUG_ASSERT(*pos == 254);
5696 (*packet)+= 5;
5697 return (ulong) uint4korr(pos+1);
5698 }
5699
5700
5701 /*
5702 Fill array with pointers to field parts to be stored in log for insert
5703
5704 SYNOPSIS
5705 fill_insert_undo_parts()
5706 info Maria handler
5707 record Inserted row
5708 log_parts Store pointers to changed memory areas here
5709 log_parts_count See RETURN
5710
5711 NOTES
5712 We have information in info->cur_row about the read row.
5713
5714 RETURN
5715 length of data in log_parts.
5716 log_parts_count contains number of used log_parts
5717 */
5718
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  uchar *field_lengths= info->cur_row.field_lengths;
  size_t row_length;
  MARIA_ROW *cur_row= &info->cur_row;
  LEX_CUSTRING *start_log_parts;
  DBUG_ENTER("fill_insert_undo_parts");

  start_log_parts= log_parts;

  /* Store null bits */
  log_parts->str= record;
  log_parts->length= share->base.null_bytes;
  row_length= log_parts->length;
  log_parts++;

  /* Store bitmap over packed (zero length or all-zero) fields */
  log_parts->str= info->cur_row.empty_bits;
  log_parts->length= share->base.pack_bytes;
  row_length+= log_parts->length;
  log_parts++;

  if (share->base.max_field_lengths)
  {
    /*
      Store length of all not empty char, varchar and blob fields.
      The two bytes before field_lengths hold the size of the length array
      itself, written here with int2store().
    */
    log_parts->str= field_lengths - 2;
    log_parts->length= info->cur_row.field_lengths_length+2;
    int2store(log_parts->str, info->cur_row.field_lengths_length);
    row_length+= log_parts->length;
    log_parts++;
  }

  if (share->base.blobs)
  {
    /*
      Store total blob length to make buffer allocation easier during UNDO
    */
    log_parts->str= info->length_buff;
    log_parts->length= (uint) (ma_store_length(info->length_buff,
                                               info->cur_row.blob_length) -
                               (uchar*) log_parts->str);
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    log_parts->str= record + column->offset;
    log_parts->length= column->length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column;
       column++)
  {
    const uchar *column_pos;
    size_t column_length;
    /* Null or empty fields are not logged */
    if ((record[column->null_pos] & column->null_bit) ||
        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
      continue;

    column_pos= record+ column->offset;
    column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Real (stripped) length comes from the field-length array */
      if (column->length <= 255)
        column_length= *field_lengths++;
      else
      {
        column_length= uint2korr(field_lengths);
        field_lengths+= 2;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      if (column->fill_length == 1)
        column_length= *field_lengths;
      else
        column_length= uint2korr(field_lengths);
      field_lengths+= column->fill_length;
      /* Point past the in-record length prefix; only data is logged */
      column_pos+= column->fill_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
    log_parts->str= column_pos;
    log_parts->length= column_length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Add blobs */
  for (end_column+= share->base.blobs; column < end_column; column++)
  {
    const uchar *field_pos= record + column->offset;
    uint size_length= column->length - portable_sizeof_char_ptr;
    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

    /*
      We don't have to check for null, as blob_length is guaranteed to be 0
      if the blob is null
    */
    if (blob_length)
    {
      uchar *blob_pos;
      /* Blob data pointer is stored after the length bytes in the record */
      memcpy(&blob_pos, record + column->offset + size_length,
             sizeof(blob_pos));
      log_parts->str= blob_pos;
      log_parts->length= blob_length;
      row_length+= log_parts->length;
      log_parts++;
    }
  }
  *log_parts_count= (uint) (log_parts - start_log_parts);
  DBUG_RETURN(row_length);
}
5856
5857
5858 /*
5859 Fill array with pointers to field parts to be stored in log for update
5860
5861 SYNOPSIS
5862 fill_update_undo_parts()
5863 info Maria handler
5864 oldrec Original row
5865 newrec New row
5866 log_parts Store pointers to changed memory areas here
5867 log_parts_count See RETURN
5868
5869 IMPLEMENTATION
5870 Format of undo record:
5871
5872 Fields are stored in same order as the field array.
5873
5874 Offset to changed field data (packed)
5875
5876 For each changed field
5877 Fieldnumber (packed)
5878 Length, if variable length field (packed)
5879
5880 For each changed field
5881 Data
5882
    Packing is done using ma_store_length()
5884
5885 The reason we store field numbers & length separated from data (ie, not
5886 after each other) is to get better cpu caching when we loop over
5887 fields (as we probably don't have to access data for each field when we
5888 want to read and old row through the undo log record).
5889
5890 As a special case, we use '255' for the field number of the null bitmap.
5891
5892 RETURN
5893 length of data in log_parts.
5894 log_parts_count contains number of used log_parts
5895 */
5896
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
  uchar *field_data, *start_field_data, *length_str;
  uchar *old_field_lengths= old_row->field_lengths;
  uchar *new_field_lengths= new_row->field_lengths;
  size_t row_length= 0;
  uint field_lengths;
  LEX_CUSTRING *start_log_parts;
  my_bool new_column_is_empty;
  DBUG_ENTER("fill_update_undo_parts");

  start_log_parts= log_parts;

  /*
    First log part is for number of fields, field numbers and lengths.
    The +4 is to reserve place for the number of changed fields, which is
    stored just before start_field_data at the end of this function.
  */
  start_field_data= field_data= info->update_field_data + 4;
  log_parts++;

  if (memcmp(oldrec, newrec, share->base.null_bytes))
  {
    /* Store changed null bits; 255 is the reserved null-bitmap field nr */
    *field_data++= (uchar) 255;           /* Special case */
    log_parts->str= oldrec;
    log_parts->length= share->base.null_bytes;
    row_length= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    if (memcmp(oldrec + column->offset, newrec + column->offset,
               column->length))
    {
      /* Field changed; log its number here and the old data separately */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      log_parts->str= oldrec + column->offset;
      log_parts->length= column->length;
      row_length+= column->length;
      log_parts++;
    }
  }

  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++)
  {
    const uchar *new_column_pos, *old_column_pos;
    size_t new_column_length, old_column_length;

    /* First check if old column is null or empty */
    if (oldrec[column->null_pos] & column->null_bit)
    {
      /*
        It's safe to skip this one as either the new column is also null
        (no change) or the new_column is not null, in which case the null-bit
        maps differed and we have already stored the null bitmap.
      */
      continue;
    }
    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
        continue; /* Both are empty; skip */

      /* Store zero length column (old was empty, new is not) */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      field_data= ma_store_length(field_data, 0);
      continue;
    }
    /*
      Remember if the 'new' value is empty (as in this case we must always
      log the original value)
    */
    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
                          (new_row->empty_bits[column->empty_pos] &
                           column->empty_bit));

    old_column_pos= oldrec + column->offset;
    new_column_pos= newrec + column->offset;
    old_column_length= new_column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                            /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                     /* Not packed */
    case FIELD_SKIP_ZERO:                         /* Fixed length field */
      break;
    case FIELD_VARCHAR:
      new_column_length--;                        /* Skip length prefix */
      old_column_pos+= column->fill_length;
      new_column_pos+= column->fill_length;
      /* Fall through */
    case FIELD_SKIP_ENDSPACE:                     /* CHAR */
      {
        /*
          Here new_column_length still holds the (adjusted) max length, so
          the <= 255 test picks the 1- vs 2-byte length-prefix encoding.
        */
        if (new_column_length <= 255)
        {
          old_column_length= *old_field_lengths++;
          if (!new_column_is_empty)
            new_column_length= *new_field_lengths++;
        }
        else
        {
          old_column_length= uint2korr(old_field_lengths);
          old_field_lengths+= 2;
          if (!new_column_is_empty)
          {
            new_column_length= uint2korr(new_field_lengths);
            new_field_lengths+= 2;
          }
        }
        break;
      }
    case FIELD_BLOB:
      {
        uint size_length= column->length - portable_sizeof_char_ptr;
        old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
        /* The blob data pointer is stored right after the length bytes */
        memcpy((void*) &old_column_pos, oldrec + column->offset + size_length,
               sizeof(old_column_pos));
        if (!new_column_is_empty)
        {
          new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
          memcpy((void*) &new_column_pos, newrec + column->offset + size_length,
                 sizeof(old_column_pos));
        }
        break;
      }
    default:
      DBUG_ASSERT(0);
    }

    /*
      Log the old value when the column changed. memcmp() is only reached
      when both lengths are equal and the new column is non-empty, so its
      arguments are always valid here.
    */
    if (new_column_is_empty || new_column_length != old_column_length ||
        memcmp(old_column_pos, new_column_pos, new_column_length))
    {
      field_data= ma_store_length(field_data,
                                  (ulong) (column - share->columndef));
      field_data= ma_store_length(field_data, (ulong) old_column_length);

      log_parts->str= old_column_pos;
      log_parts->length= old_column_length;
      row_length+= old_column_length;
      log_parts++;
    }
  }

  *log_parts_count= (uint) (log_parts - start_log_parts);

  /* Store length of field length data before the field/field_lengths */
  field_lengths= (uint) (field_data - start_field_data);
  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
  start_log_parts->str= length_str;
  ma_store_length(length_str, field_lengths);
  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
  row_length+= start_log_parts->length;
  DBUG_RETURN(row_length);
}
6067
6068 /***************************************************************************
6069 In-write hooks called under log's lock when log record is written
6070 ***************************************************************************/
6071
6072 /**
6073 @brief Sets transaction's rec_lsn if needed
6074
6075 A transaction sometimes writes a REDO even before the page is in the
6076 pagecache (example: brand new head or tail pages; full pages). So, if
6077 Checkpoint happens just after the REDO write, it needs to know that the
6078 REDO phase must start before this REDO. Scanning the pagecache cannot
6079 tell that as the page is not in the cache. So, transaction sets its rec_lsn
6080 to the REDO's LSN or somewhere before, and Checkpoint reads the
6081 transaction's rec_lsn.
6082
6083 @return Operation status, always 0 (success)
6084 */
6085
write_hook_for_redo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6086 my_bool write_hook_for_redo(enum translog_record_type type
6087 __attribute__ ((unused)),
6088 TRN *trn, MARIA_HA *tbl_info
6089 __attribute__ ((unused)),
6090 LSN *lsn, void *hook_arg
6091 __attribute__ ((unused)))
6092 {
6093 /*
6094 Users of dummy_transaction_object must keep this TRN clean as it
6095 is used by many threads (like those manipulating non-transactional
6096 tables). It might be dangerous if one user sets rec_lsn or some other
6097 member and it is picked up by another user (like putting this rec_lsn into
6098 a page of a non-transactional table); it's safer if all members stay 0. So
6099 non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
6100 call this hook; we trust them but verify ;)
6101 */
6102 DBUG_ASSERT(trn->trid != 0);
6103 /*
6104 If the hook stays so simple, it would be faster to pass
6105 !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
6106 to translog_write_record(), like Monty did in his original code, and not
6107 have a hook. For now we keep it like this.
6108 */
6109 if (trn->rec_lsn == 0)
6110 trn->rec_lsn= *lsn;
6111 return 0;
6112 }
6113
6114
6115 /**
6116 @brief Sets transaction's undo_lsn, first_undo_lsn if needed
6117
6118 @return Operation status, always 0 (success)
6119 */
6120
write_hook_for_undo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6121 my_bool write_hook_for_undo(enum translog_record_type type
6122 __attribute__ ((unused)),
6123 TRN *trn, MARIA_HA *tbl_info
6124 __attribute__ ((unused)),
6125 LSN *lsn, void *hook_arg
6126 __attribute__ ((unused)))
6127 {
6128 DBUG_ASSERT(trn->trid != 0);
6129 trn->undo_lsn= *lsn;
6130 if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
6131 trn->first_undo_lsn=
6132 trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
6133 return 0;
6134 /*
6135 when we implement purging, we will specialize this hook: UNDO_PURGE
6136 records will additionally set trn->undo_purge_lsn
6137 */
6138 }
6139
6140
6141 /**
6142 @brief Sets the table's records count and checksum and others to 0, then
6143 calls the generic REDO hook.
6144
6145 @return Operation status, always 0 (success)
6146 */
6147
my_bool write_hook_for_redo_delete_all(enum translog_record_type type
                                       __attribute__ ((unused)),
                                       TRN *trn, MARIA_HA *tbl_info
                                       __attribute__ ((unused)),
                                       LSN *lsn, void *hook_arg)
{
  /* Zero the table's state (records, checksum, ...) under the log lock */
  _ma_reset_status(tbl_info);
  /* Then do the normal REDO bookkeeping (set trn->rec_lsn if needed) */
  return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
}
6157
6158
6159 /**
6160 @brief Updates "records" and "checksum" and calls the generic UNDO hook
6161
6162 @return Operation status, always 0 (success)
6163 */
6164
write_hook_for_undo_row_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6165 my_bool write_hook_for_undo_row_insert(enum translog_record_type type
6166 __attribute__ ((unused)),
6167 TRN *trn, MARIA_HA *tbl_info,
6168 LSN *lsn, void *hook_arg)
6169 {
6170 MARIA_SHARE *share= tbl_info->s;
6171 share->state.state.records++;
6172 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6173 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6174 }
6175
6176
6177 /**
  @brief Updates "records" and "checksum" and calls the generic UNDO hook
6179
6180 @return Operation status, always 0 (success)
6181 */
6182
write_hook_for_undo_row_delete(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6183 my_bool write_hook_for_undo_row_delete(enum translog_record_type type
6184 __attribute__ ((unused)),
6185 TRN *trn, MARIA_HA *tbl_info,
6186 LSN *lsn, void *hook_arg)
6187 {
6188 MARIA_SHARE *share= tbl_info->s;
6189 share->state.state.records--;
6190 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6191 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6192 }
6193
6194
6195 /**
  @brief Updates "checksum" and calls the generic UNDO hook
6197
6198 @return Operation status, always 0 (success)
6199 */
6200
write_hook_for_undo_row_update(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6201 my_bool write_hook_for_undo_row_update(enum translog_record_type type
6202 __attribute__ ((unused)),
6203 TRN *trn, MARIA_HA *tbl_info,
6204 LSN *lsn, void *hook_arg)
6205 {
6206 MARIA_SHARE *share= tbl_info->s;
6207 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6208 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6209 }
6210
6211
write_hook_for_undo_bulk_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6212 my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
6213 __attribute__ ((unused)),
6214 TRN *trn, MARIA_HA *tbl_info,
6215 LSN *lsn, void *hook_arg)
6216 {
6217 /*
6218 We are going to call maria_delete_all_rows(), but without logging and
6219 syncing, as an optimization (if we crash before commit, the UNDO will
6220 empty; if we crash after commit, we have flushed and forced the files).
6221 Status still needs to be reset under log mutex, in case of a concurrent
6222 checkpoint.
6223 */
6224 _ma_reset_status(tbl_info);
6225 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6226 }
6227
6228
6229 /**
6230 @brief Updates table's lsn_of_file_id.
6231
6232 @return Operation status, always 0 (success)
6233 */
6234
write_hook_for_file_id(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6235 my_bool write_hook_for_file_id(enum translog_record_type type
6236 __attribute__ ((unused)),
6237 TRN *trn
6238 __attribute__ ((unused)),
6239 MARIA_HA *tbl_info,
6240 LSN *lsn,
6241 void *hook_arg
6242 __attribute__ ((unused)))
6243 {
6244 DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
6245 tbl_info->s->lsn_of_file_id= *lsn;
6246 return 0;
6247 }
6248
6249
6250 /**
6251 Updates transaction's rec_lsn when committing.
6252
6253 A transaction writes its commit record before being committed in trnman, so
6254 if Checkpoint happens just between the COMMIT record log write and the
6255 commit in trnman, it will record that transaction is not committed. Assume
6256 the transaction (trn1) did an INSERT; after the checkpoint, a second
6257 transaction (trn2) does a DELETE of what trn1 has inserted. Then crash,
6258 Checkpoint record says that trn1 was not committed, and REDO phase starts
6259 from Checkpoint record's LSN. So it will not find the COMMIT record of
6260 trn1, will want to roll back trn1, which will fail because the row/key
6261 which it wants to delete does not exist anymore.
6262 To avoid this, Checkpoint needs to know that the REDO phase must start
6263 before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
6264 record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
6265 will know.
6266
6267 @note so after commit trn->rec_lsn is a "commit LSN", which could be of
6268 use later.
6269
6270 @return Operation status, always 0 (success)
6271 */
6272
my_bool write_hook_for_commit(enum translog_record_type type
                              __attribute__ ((unused)),
                              TRN *trn,
                              MARIA_HA *tbl_info
                              __attribute__ ((unused)),
                              LSN *lsn,
                              void *hook_arg
                              __attribute__ ((unused)))
{
  /*
    Point rec_lsn at the COMMIT record so a Checkpoint taken between the
    COMMIT log write and the trnman commit still makes the REDO phase
    start before this record (see the function comment above).
  */
  trn->rec_lsn= *lsn;
  return 0;
}
6285
6286
6287 /***************************************************************************
6288 Applying of REDO log records
6289 ***************************************************************************/
6290
6291 /*
6292 Apply changes to head and tail pages
6293
6294 SYNOPSIS
6295 _ma_apply_redo_insert_row_head_or_tail()
6296 info Maria handler
6297 lsn LSN to put on page
6298 page_type HEAD_PAGE or TAIL_PAGE
6299 new_page True if this is first entry on page
6300 header Header (without FILEID)
6301 data Data to be put on page
6302 data_length Length of data
6303
6304 NOTE
6305 Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
6306 LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
6307
6308 RETURN
6309 0 ok
6310 # Error number
6311 */
6312
uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                            uint page_type,
                                            my_bool new_page,
                                            const uchar *header,
                                            const uchar *data,
                                            size_t data_length)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uint block_size= share->block_size;
  uint rec_offset;
  uchar *buff, *dir;
  uint result;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_method;
  enum pagecache_page_pin pin_method;
  my_off_t end_of_page;
  uint error;
  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");

  /* Header layout: page number, then directory position on that page */
  page= page_korr(header);
  rownr= dirpos_korr(header + PAGE_STORE_SIZE);

  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr, (uint) data_length));

  /* We are changing the table; make sure repair/zerofill know about it */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  end_of_page= (page + 1) * share->block_size;
  if (end_of_page > share->state.state.data_file_length)
  {
    DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                        (ulong) share->state.state.data_file_length,
                        (ulong) end_of_page));
    /*
      New page at end of file. Note that the test above is also positive if
      data_file_length is not a multiple of block_size (system crashed while
      writing the last page): in this case we just extend the last page and
      fill it entirely with zeroes, then the REDO will put correct data on
      it.
    */
    lock_method= PAGECACHE_LOCK_WRITE;
    pin_method= PAGECACHE_PIN;

    /* A page past EOF can only be created as a brand-new empty page */
    DBUG_ASSERT(rownr == 0 && new_page);
    if (rownr != 0 || !new_page)
      goto crashed_file;

    /* Build the page in keyread_buff; it is written out at the end */
    buff= info->keyread_buff;
    info->keyread_buff_used= 1;
    make_empty_page(info, buff, page_type, 1);
    empty_space= (block_size - PAGE_OVERHEAD_SIZE(share));
    rec_offset= PAGE_HEADER_SIZE(share);
    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
  }
  else
  {
    /* Page exists; pagecache_read() below takes the write lock and pin */
    lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    pin_method= PAGECACHE_PIN_LEFT_PINNED;

    /* Suppress error messages for the (expected) short-file/CRC cases */
    share->pagecache->readwrite_flags&= ~MY_WME;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         page, 0, 0,
                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                         &page_link.link);
    share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
    if (!buff)
    {
      /* Skip errors when reading outside of file and uninitialized pages */
      if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
                        my_errno != HA_ERR_WRONG_CRC))
      {
        DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
        goto err;
      }
      /* Create new page */
      buff= pagecache_block_link_to_buffer(page_link.link);
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
    }
    else if (lsn_korr(buff) >= lsn)             /* Test if already applied */
    {
      check_skipped_lsn(info, lsn_korr(buff), 1, page);
      /* Fix bitmap, just in case */
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0;                         /* Page is full */

      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
        goto err;
      pagecache_unlock_by_link(share->pagecache, page_link.link,
                               PAGECACHE_LOCK_WRITE_UNLOCK,
                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                               LSN_IMPOSSIBLE, 0, FALSE);
      DBUG_RETURN(0);
    }

    if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
    {
      /*
        This is a page that has been freed before and now should be
        changed to new type.
      */
      if (!new_page)
      {
        DBUG_PRINT("error",
                   ("Found page of wrong type: %u, should have been %u",
                    (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
                    page_type));
        goto crashed_file;
      }
      make_empty_page(info, buff, page_type, 0);
      empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
      /* Grow the directory up to rownr; return value intentionally ignored */
      (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space,
                              page_type == HEAD_PAGE);
      rec_offset= PAGE_HEADER_SIZE(share);
      dir= dir_entry_pos(buff, block_size, rownr);
      /* dir+2 holds the entry's length; reclaim it as free space for now */
      empty_space+= uint2korr(dir+2);
    }
    else
    {
      /* Page is already of the right type; insert the row at rownr */
      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
      uint length;

      DBUG_ASSERT(!new_page);
      dir= dir_entry_pos(buff, block_size, rownr);
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

      if (max_entry <= rownr)
      {
        /* Add directory entry first in directory and data last on page */
        if (extend_directory(info, buff, block_size, max_entry, rownr,
                             &empty_space, page_type == HEAD_PAGE))
          goto crashed_file;
      }
      /* Make room for data_length bytes; sets rec_offset for the copy */
      if (extend_area_on_page(info, buff, dir, rownr,
                              (uint) data_length, &empty_space,
                              &rec_offset, &length, page_type == HEAD_PAGE))
        goto crashed_file;
    }
  }
  /* Copy data */
  int2store(dir+2, data_length);
  memcpy(buff + rec_offset, data, data_length);
  empty_space-= (uint) data_length;
  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);

  /* Fix bitmap */
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0;                             /* Page is full */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    goto err;

  /*
    If page was not read before, write it but keep it pinned.
    We don't update its LSN When we have processed all REDOs for this page
    in the current REDO's group, we will stamp page with UNDO's LSN
    (if we stamped it now, a next REDO, in
    this group, for this page, would be skipped) and unpin then.
  */
  result= 0;
  if (lock_method == PAGECACHE_LOCK_WRITE &&
      pagecache_write(share->pagecache,
                      &info->dfile, page, 0,
                      buff, PAGECACHE_PLAIN_PAGE,
                      lock_method, pin_method,
                      PAGECACHE_WRITE_DELAY, &page_link.link,
                      LSN_IMPOSSIBLE))
    result= my_errno;

  /* Remember the pinned page; unpinned later by the caller's group logic */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  /*
    Data page and bitmap page are in place, we can update data_file_length in
    case we extended the file. We could not do it earlier: bitmap code tests
    data_file_length to know if it has to create a new page or not.
  */
  set_if_bigger(share->state.state.data_file_length, end_of_page);
  DBUG_RETURN(result);

crashed_file:
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
err:
  error= my_errno;
  /* Only unlock if we hold a lock taken by pagecache_read() above */
  if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */
  DBUG_RETURN((my_errno= error));
}
6510
6511
6512 /*
6513 Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
6514
6515 SYNOPSIS
6516 _ma_apply_redo_purge_row_head_or_tail()
6517 info Maria handler
6518 lsn LSN to put on page
6519 page_type HEAD_PAGE or TAIL_PAGE
6520 header Header (without FILEID)
6521
6522 NOTES
6523 This function is very similar to delete_head_or_tail()
6524
6525 RETURN
6526 0 ok
6527 # Error number
6528 */
6529
uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                           uint page_type,
                                           const uchar *header)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uchar *buff;
  int result;
  uint error;
  MARIA_PINNED_PAGE page_link;
  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");

  /* Header layout: page number, then directory position on that page */
  page= page_korr(header);
  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* We are changing the table; make sure repair/zerofill know about it */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
    goto err;

  if (lsn_korr(buff) >= lsn)
  {
    /*
      Already applied
      Note that in case the page is not anymore a head or tail page
      a future redo will fix the bitmap.
    */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
    {
      /* Refresh the bitmap from the page's current free space */
      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0;                         /* Page is full */
      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
                         empty_space))
        goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    DBUG_RETURN(0);
  }

  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);

  /* Remove the row's directory entry; updates empty_space */
  if (delete_dir_entry(share, buff, rownr, &empty_space) < 0)
  {
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /* Keep the page pinned; the caller unpins after the whole REDO group */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  result= 0;
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0;                             /* Page is full */
  /* This will work even if the page was marked as UNALLOCATED_PAGE */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    result= my_errno;

  DBUG_RETURN(result);

err:
  error= my_errno;
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN((my_errno= error));

}
6614
6615
6616 /**
6617 @brief Apply LOGREC_REDO_FREE_BLOCKS
6618
6619 @param info Maria handler
6620 @param header Header (without FILEID)
6621
6622 Mark the pages free in the bitmap.
6623
6624 We have to check against _ma_redo_not_needed_for_page()
6625 to guard against the case where we first clear a block and after
6626 that insert new data into the blocks. If we would unconditionally
6627 clear the bitmap here, future changes would be ignored for the page
6628 if it's not in the dirty list (ie, it would be flushed).
6629
6630 @return Operation status
6631 @retval 0 OK
6632 @retval 1 Error
6633 */
6634
uint _ma_apply_redo_free_blocks(MARIA_HA *info,
                                LSN lsn __attribute__((unused)),
                                LSN redo_lsn,
                                const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uint ranges;
  uint16 sid;
  DBUG_ENTER("_ma_apply_redo_free_blocks");

  /* We are changing the table; make sure repair/zerofill know about it */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Header layout: file id, range count, then (page, page_count) pairs */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges > 0);

  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  while (ranges--)
  {
    my_bool res;
    uint page_range;
    pgcache_page_no_t page, start_page;

    start_page= page= page_korr(header);
    header+= PAGE_STORE_SIZE;
    /* Page range may have this bit set to indicate a tail page */
    page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
    DBUG_ASSERT(page_range > 0);

    header+= PAGERANGE_STORE_SIZE;

    DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range));

    for ( ; page_range-- ; start_page++)
    {
      /*
        Skip pages that a later REDO has already re-used; clearing their
        bitmap bits here would lose those later changes (see the function
        comment above).
      */
      if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE))
        continue;
      res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
                                           1);
      if (res)
      {
        /* Release the lock before marking the file crashed and bailing out */
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        _ma_mark_file_crashed(share);
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        DBUG_RETURN(res);
      }
    }
  }
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  DBUG_RETURN(0);
}
6690
6691
6692 /**
6693 @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
6694
6695 @param info Maria handler
6696 @param header Header (without FILEID)
6697
6698 @note It marks the page free in the bitmap, and sets the directory's count
6699 to 0.
6700
6701 @return Operation status
6702 @retval 0 OK
6703 @retval 1 Error
6704 */
6705
uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                      const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uchar *buff;
  pgcache_page_no_t page;
  MARIA_PINNED_PAGE page_link;
  my_bool res;
  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");

  /* We are changing the table; make sure repair/zerofill know about it */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  page= page_korr(header);

  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE,
                             PAGECACHE_LOCK_WRITE, &page_link.link)))
  {
    /*
      NOTE(review): this relies on pagecache_read() filling in
      page_link.link even on failure — confirm against the pagecache API.
    */
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    goto err;
  }
  if (lsn_korr(buff) >= lsn)
  {
    /* Already applied */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  }
  else
  {
    /* Mark the page free; its old contents are no longer needed */
    buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
    {
      /* Also wipe the directory so the page is byte-identical to a fresh one */
      uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
      uchar *dir= dir_entry_pos(buff, share->block_size,
                                number_of_records-1);
      buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
    }
#endif

    /* Keep the page pinned; the caller unpins after the whole REDO group */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= 1;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  if (res)
    goto err;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6772
6773
6774 /**
6775 @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
6776
6777 @param info Maria handler
  @param lsn             LSN to put on pages
6779 @param header Header (with FILEID)
6780 @param redo_lsn REDO record's LSN
6781 @param[out] number_of_blobs Number of blobs found in log record
6782 @param[out] number_of_ranges Number of ranges found
6783 @param[out] first_page First page touched
6784 @param[out] last_page Last page touched
6785
6786 @note Write full pages (full head & blob pages)
6787
6788 @return Operation status
6789 @retval 0 OK
6790 @retval !=0 Error
6791 */
6792
uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                     LSN lsn, const uchar *header,
                                     LSN redo_lsn,
                                     uint * const number_of_blobs,
                                     uint * const number_of_ranges,
                                     pgcache_page_no_t * const first_page,
                                     pgcache_page_no_t * const last_page)
{
  MARIA_SHARE *share= info->s;
  const uchar *data;
  uint data_size= FULL_PAGE_SIZE(share);   /* usable payload bytes per page */
  uint blob_count, ranges;
  uint16 sid;
  /* Running min/max of pages touched; copied to *first_page / *last_page */
  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
  DBUG_ENTER("_ma_apply_redo_insert_row_blobs");

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /*
    Parse the fixed part of the REDO header: file id, total number of
    page ranges, and number of blobs covered by this record.
  */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  *number_of_ranges= ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  *number_of_blobs= blob_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges >= blob_count);

  /* Page payload starts after all extent and per-blob sub-range headers */
  data= (header + ranges * ROW_EXTENT_SIZE +
         blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));

  while (blob_count--)
  {
    uint sub_ranges, empty_space;

    /* Per blob: number of sub ranges and unused bytes on its last page */
    sub_ranges= uint2korr(header);
    header+= SUB_RANGE_SIZE;
    empty_space= uint2korr(header);
    header+= BLOCK_FILLER_SIZE;
    DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
    ranges-= sub_ranges;

    while (sub_ranges--)
    {
      uint i;
      uint res;
      uint page_range;
      pgcache_page_no_t page;
      uchar *buff;
      uint data_on_page= data_size;

      /* Each sub range describes a run of consecutive pages */
      page= page_korr(header);
      header+= PAGE_STORE_SIZE;
      page_range= pagerange_korr(header);
      header+= PAGERANGE_STORE_SIZE;

      for (i= page_range; i-- > 0 ; page++, data+= data_on_page)
      {
        MARIA_PINNED_PAGE page_link;
        enum pagecache_page_lock unlock_method;
        enum pagecache_page_pin unpin_method;

        set_if_smaller(first_page2, page);
        set_if_bigger(last_page2, page);
        if (i == 0 && sub_ranges == 0)
          data_on_page= data_size - empty_space; /* data on last page */
        /* Skip pages that a later checkpoint proves are already durable */
        if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
          continue;

        if (((page + 1) * share->block_size) >
            share->state.state.data_file_length)
        {
          /* New page or half written page at end of file */
          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                              (ulong) share->state.state.data_file_length,
                              (ulong) ((page + 1 ) * share->block_size)));
          share->state.state.data_file_length= (page + 1) * share->block_size;
          /* Build the page in a scratch buffer; no cache entry to lock */
          buff= info->keyread_buff;
          info->keyread_buff_used= 1;
          make_empty_page(info, buff, BLOB_PAGE, 0);
          unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
          unpin_method= PAGECACHE_PIN_LEFT_UNPINNED;
        }
        else
        {
          /* Suppress "file too short" warnings; a short read is expected */
          share->pagecache->readwrite_flags&= ~MY_WME;
          buff= pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, 0,
                               PAGECACHE_PLAIN_PAGE,
                               PAGECACHE_LOCK_WRITE, &page_link.link);
          share->pagecache->readwrite_flags= share->pagecache->
            org_readwrite_flags;
          if (!buff)
          {
            if (my_errno != HA_ERR_FILE_TOO_SHORT &&
                my_errno != HA_ERR_WRONG_CRC)
            {
              /* If not read outside of file */
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              goto err;
            }
            /*
              Physical file was too short, create new page. It can be that
              recovery started with a file with N pages, wrote page N+2 into
              pagecache (increased data_file_length but not physical file
              length), now reads page N+1: the read fails.
            */
            buff= pagecache_block_link_to_buffer(page_link.link);
            make_empty_page(info, buff, BLOB_PAGE, 0);
          }
          else
          {
#ifdef DBUG_ASSERT_EXISTS
            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
#endif
            if (lsn_korr(buff) >= lsn)
            {
              /* Already applied */
              check_skipped_lsn(info, lsn_korr(buff), 1, page);
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              /* Still ensure the bitmap marks this page as full */
              goto fix_bitmap;
            }
            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
                        (found_page_type == (uchar) UNALLOCATED_PAGE));
          }
          unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
          unpin_method= PAGECACHE_UNPIN;
        }

        /*
          Blob pages are never updated twice in same redo-undo chain, so
          it's safe to update lsn for them here
        */
        lsn_store(buff, lsn);
        buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
        /* Zero the rest of the full-page header after LSN + page type */
        bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
              FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));

        if (data_on_page != data_size)
        {
          /*
            Last page may be only partly filled. We zero the rest, like
            write_full_pages() does.
          */
          bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
                empty_space);
        }
        memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page);
        if (pagecache_write(share->pagecache,
                            &info->dfile, page, 0,
                            buff, PAGECACHE_PLAIN_PAGE,
                            unlock_method, unpin_method,
                            PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
          goto err;

    fix_bitmap:
      /** @todo leave bitmap lock to the bitmap code... */
        mysql_mutex_lock(&share->bitmap.bitmap_lock);
        res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page,
                                           1);
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        if (res)
          goto err;
      }
    }
  }
  *first_page= first_page2;
  *last_page= last_page2;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6974
6975
6976 /****************************************************************************
6977 Applying of UNDO entries
6978 ****************************************************************************/
6979
6980 /** Execute undo of a row insert (delete the inserted row) */
6981
my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header)
{
  pgcache_page_no_t page;
  uint rownr;
  uchar *buff;
  my_bool res;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum checksum;
  LSN lsn;
  DBUG_ENTER("_ma_apply_undo_row_insert");

  /* UNDO header: head page number + directory position of the row */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /*
    Read and pin the head page; the pin is registered in pinned_pages and
    released by _ma_unpin_all_pages_and_finalize_row() at 'end'.
  */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0,
                       0, share->page_type,
                       PAGECACHE_LOCK_WRITE,
                       &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  /* Collect the row's tail positions and extents into info->cur_row */
  if (read_row_extent_info(info, buff, rownr))
    goto err;

  _ma_bitmap_flushable(info, 1);
  /* Delete head entry, tails and any full blob/extent pages of the row */
  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  checksum= 0;
  if (share->calc_checksum)
    checksum= (ha_checksum) 0 - ha_checksum_korr(header);
  info->last_auto_increment= ~ (ulonglong) 0;
  /* Write a CLR record so this UNDO is not executed again on re-recovery */
  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
                    share->calc_checksum != 0, checksum, &lsn, (void*) 0))
    goto err;

  res= 0;
end:
  /* The following is true only if _ma_bitmap_flushable() was called earlier */
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  res= 1;
  _ma_mark_file_crashed(share);
  /*
    Don't write a new LSN on the used pages. Not important as the file is
    marked as crashed and need to be repaired before it can be used.
  */
  lsn= LSN_IMPOSSIBLE;
  goto end;
}
7052
7053
7054 /** Execute undo of a row delete (insert the row back where it was) */
7055
my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header, size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_ROW row;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_BITMAP_BLOCKS *blocks;
  struct st_row_pos_info row_pos;
  uchar *record;
  const uchar *null_bits, *field_length_data, *extent_info;
  pgcache_page_no_t page;
  ulong *blob_lengths;
  uint *null_field_lengths, extent_count, rownr, length_on_head_page;
  DBUG_ENTER("_ma_apply_undo_row_delete");

  /*
    Use cur row as a base; We need to make a copy as we will change
    some buffers to point directly to 'header'
  */
  memcpy(&row, &info->cur_row, sizeof(row));

  /* Fixed part of the UNDO header: original row position and extents */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  length_on_head_page= uint2korr(header);
  header+= 2;
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  if (share->calc_checksum)
  {
    /*
      We extract the checksum delta here, saving a recomputation in
      allocate_and_write_block_record(). It's only an optimization.
    */
    row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  null_field_lengths= row.null_field_lengths;
  blob_lengths= row.blob_lengths;

  /*
    Fill in info->cur_row with information about the row, like in
    calc_record_size(), to be used by write_block_record()
  */

  row.normal_length= row.char_length= row.varchar_length=
    row.blob_length= row.extents_count= row.field_lengths_length= 0;

  /* Header now holds the row image: null bits, empty bits, field lengths */
  null_bits= header;
  header+= share->base.null_bytes;
  /* This will not be changed */
  row.empty_bits= (uchar*) header;
  header+= share->base.pack_bytes;
  if (share->base.max_field_lengths)
  {
    row.field_lengths_length= uint2korr(header);
    row.field_lengths= (uchar*) header + 2 ;
    header+= 2 + row.field_lengths_length;
  }
  if (share->base.blobs)
    row.blob_length= ma_get_length(&header);

  /* We need to build up a record (without blobs) in rec_buff */
  if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
    DBUG_RETURN(1);

  memcpy(record, null_bits, share->base.null_bytes);

  /* Copy field information from header to record */

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
         end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    memcpy(record + column->offset, header, column->length);
    header+= column->length;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  field_length_data= row.field_lengths;
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit) ||
        row.empty_bits[column->empty_pos] & column->empty_bit)
    {
      /* NULL or empty field: no data in header for it */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      /* Fill with the canonical pattern so the checksum matches */
      if (share->calc_checksum)
        bfill(record + column->offset, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      row.normal_length+= column->length;
      *null_field_lengths= column->length;
      memcpy(record + column->offset, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint length;
      /* Length is 1 or 2 bytes depending on the column's max length */
      if (column->length <= 255)
        length= (uint) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
      row.char_length+= length;
      *null_field_lengths= length;
      memcpy(record + column->offset, header, length);
      /* Re-pad with spaces so the checksum of the record is correct */
      if (share->calc_checksum)
        bfill(record + column->offset + length, (column->length - length),
              ' ');
      header+= length;
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length;
      uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      if (column->fill_length == 1)
      {
        field_pos[0]= *field_length_data;
        length= (uint) *field_length_data;
      }
      else
      {
        field_pos[0]= field_length_data[0];
        field_pos[1]= field_length_data[1];
        length= uint2korr(field_length_data);
      }
      field_length_data+= column->fill_length;
      field_pos+= column->fill_length;
      row.varchar_length+= length;
      *null_field_lengths= length;
      memcpy(field_pos, header, length);
      header+= length;
      break;
    }
    case FIELD_BLOB:
    {
      /* Copy length of blob and pointer to blob data to record */
      uchar *field_pos= record + column->offset;
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);

      memcpy(field_pos, field_length_data, size_length);
      field_length_data+= size_length;
      /* Blob data stays in the log record; store a pointer into 'header' */
      memcpy(field_pos + size_length, &header, sizeof(header));
      header+= blob_length;
      *blob_lengths++= blob_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row.head_length= (info->row_base_length +
                    share->base.fixed_not_null_fields_length +
                    row.field_lengths_length +
                    size_to_store_key_length(row.field_lengths_length) +
                    row.normal_length +
                    row.char_length + row.varchar_length);
  row.total_length= (row.head_length + row.blob_length);
  if (row.total_length < share->base.min_block_length)
    row.total_length= share->base.min_block_length;

  /*
    Row is now generated. Now we need to insert record on the original
    pages with original size on each page.
  */

  _ma_bitmap_flushable(info, 1);
  /* Change extent information to be usable by write_block_record() */
  blocks= &row.insert_blocks;
  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
    goto err;
  blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
                                                            &share->bitmap,
                                                            page);
  blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;

  /* Read head page and allocate data for rowid */
  if (get_rowpos_in_head_or_tail_page(info, blocks->block,
                                      info->buff,
                                      length_on_head_page,
                                      HEAD_PAGE, PAGECACHE_LOCK_WRITE,
                                      rownr, &row_pos))
    goto err;

  if (share->calc_checksum)
  {
    /* Sanity check: rebuilt record must reproduce the logged checksum */
    DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
  }
  /* Store same amount of data on head page as on original page */
  row_pos.length= (length_on_head_page -
                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
  set_if_bigger(row_pos.length, share->base.min_block_length);
  if (write_block_record(info, (uchar*) 0, record, &row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
    goto err;

  my_free(record);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_mark_file_crashed(share);
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_free(record);
  DBUG_RETURN(1);
}
7294
7295
7296 /**
7297 Execute undo of a row update
7298
7299 @fn _ma_apply_undo_row_update()
7300
7301 @return Operation status
7302 @retval 0 OK
7303 @retval 1 Error
7304 */
7305
my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header,
                                  size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_RECORD_POS record_pos;
  const uchar *field_length_data, *field_length_data_end, *extent_info;
  uchar *current_record, *orig_record;
  pgcache_page_no_t page;
  ha_checksum UNINIT_VAR(checksum_delta);
  uint rownr, field_length_header, extent_count, length_on_head_page;
  int error;
  DBUG_ENTER("_ma_apply_undo_row_update");

  /* UNDO header: row position, checksum delta and old extent list */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;

  record_pos= ma_recordpos(page, rownr);
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) record_pos, (ulong) page, rownr));

  if (share->calc_checksum)
  {
    checksum_delta= ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  length_on_head_page= uint2korr(header);
  set_if_bigger(length_on_head_page, share->base.min_block_length);
  header+= 2;
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  /*
    Set header to point to old field values, generated by
    fill_update_undo_parts()
  */
  field_length_header= ma_get_length(&header);
  field_length_data= (uchar*) header;
  header+= field_length_header;
  field_length_data_end= header;

  /* Allocate buffer for current row & original row */
  if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
    DBUG_RETURN(1);
  orig_record= current_record+ share->base.reclength;

  /* Read current record */
  if (_ma_read_block_record(info, current_record, record_pos))
    goto err;

  if (*field_length_data == 255)
  {
    /* Bitmap changed */
    field_length_data++;
    memcpy(orig_record, header, share->base.null_bytes);
    header+= share->base.null_bytes;
  }
  else
    memcpy(orig_record, current_record, share->base.null_bytes);
  bitmap_clear_all(&info->changed_fields);

  /* Restore each logged (changed) field into orig_record */
  while (field_length_data < field_length_data_end)
  {
    uint field_nr= ma_get_length(&field_length_data), field_length;
    MARIA_COLUMNDEF *column= share->columndef + field_nr;
    uchar *orig_field_pos= orig_record + column->offset;

    bitmap_set_bit(&info->changed_fields, field_nr);
    if (field_nr >= share->base.fixed_not_null_fields)
    {
      if (!(field_length= ma_get_length(&field_length_data)))
      {
        /* Null field or empty field */
        bfill(orig_field_pos, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
        continue;
      }
    }
    else
      field_length= column->length;   /* fixed fields log full length */

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      memcpy(orig_field_pos, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Number */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint diff;
      memcpy(orig_field_pos, header, field_length);
      /* Re-pad the truncated part (spaces for CHAR, zeros for numbers) */
      if ((diff= (column->length - field_length)))
        bfill(orig_field_pos + column->length - diff, diff,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      header+= field_length;
    }
    break;
    case FIELD_VARCHAR:
      if (column->length <= 256)
      {
        /* One length byte (256 includes the length uchar itself) */
        *orig_field_pos++= (uchar) field_length;
      }
      else
      {
        int2store(orig_field_pos, field_length);
        orig_field_pos+= 2;
      }
      memcpy(orig_field_pos, header, field_length);
      header+= field_length;
      break;
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      _ma_store_blob_length(orig_field_pos, size_length, field_length);
      /* Blob data stays in the log record; store a pointer into 'header' */
      memcpy(orig_field_pos + size_length, &header, sizeof(header));
      header+= field_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  /* Fields not in the UNDO record are unchanged; take them from current */
  copy_not_changed_fields(info, &info->changed_fields,
                          orig_record, current_record);

  if (share->calc_checksum)
  {
    info->new_row.checksum= checksum_delta +
      (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
    /* verify that record's content is sane */
    DBUG_ASSERT(info->new_row.checksum ==
                (*share->calc_checksum)(info, current_record));
  }

  info->last_auto_increment= ~ (ulonglong) 0;
  /* Now records are up to date, execute the update to original values */
  if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
                                   extent_count, extent_info,
                                   current_record, orig_record, undo_lsn))
    goto err;

  error= 0;
end:
  my_free(current_record);
  DBUG_RETURN(error);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  error= 1;
  _ma_mark_file_crashed(share);
  goto end;
}
7466
7467
7468 /**
7469 Execute undo of a bulk insert which used repair
7470
7471 @return Operation status
7472 @retval 0 OK
7473 @retval 1 Error
7474 */
7475
_ma_apply_undo_bulk_insert(MARIA_HA * info,LSN undo_lsn)7476 my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
7477 {
7478 my_bool error;
7479 LSN lsn;
7480 DBUG_ENTER("_ma_apply_undo_bulk_insert");
7481 /*
7482 We delete all rows, re-enable indices as bulk insert had disabled
7483 non-unique ones.
7484 */
7485 error= (maria_delete_all_rows(info) ||
7486 maria_enable_indexes(info) ||
7487 /* we enabled indices so need '2' below */
7488 _ma_state_info_write(info->s,
7489 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
7490 MA_STATE_INFO_WRITE_FULL_INFO |
7491 MA_STATE_INFO_WRITE_LOCK) ||
7492 _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
7493 FALSE, 0, &lsn, NULL));
7494 DBUG_RETURN(error);
7495 }
7496
7497
7498 /**
7499 @brief Get the TRANSLOG_ADDRESS to flush up to
7500
7501 @param page Page's content
7502 @param page_no Page's number (<offset>/<page length>)
7503 @param data_ptr Callback data pointer (pointer to MARIA_SHARE)
7504
7505 @note
7506 Usable for data (non-bitmap) and index pages
7507
7508 @retval LSN to flush up to
7509 */
7510
7511 TRANSLOG_ADDRESS
maria_page_get_lsn(uchar * page,pgcache_page_no_t page_no,uchar * data_ptr)7512 maria_page_get_lsn(uchar *page,
7513 pgcache_page_no_t page_no __attribute__((unused)),
7514 uchar* data_ptr __attribute__((unused)))
7515 {
7516 #ifndef DBUG_OFF
7517 const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
7518 DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
7519 share->now_transactional);
7520 #endif
7521 return lsn_korr(page);
7522 }
7523
7524
7525 /**
7526 @brief Enable reading of all rows, ignoring versioning
7527
7528 @note
7529 This is mainly useful in single user applications, like maria_pack,
7530 where we want to be able to read all rows without having to read the
7531 transaction id from the control file
7532 */
7533
void maria_ignore_trids(MARIA_HA *info)
{
  /* Only transactional tables carry row visibility information */
  if (!info->s->base.born_transactional)
    return;
  if (info->trn == NULL)
    _ma_set_tmp_trn_for_table(info, &dummy_transaction_object);
  /* Ignore transaction id when row is read */
  info->trn->min_read_from= ~(TrID) 0;
}
7544
7545
7546 #ifndef DBUG_OFF
7547
7548 /* The following functions are useful to call from debugger */
7549
void _ma_print_block_info(MARIA_SHARE *share, uchar *buff)
{
  /* Decode the page header fields up front for readability */
  LSN page_lsn= lsn_korr(buff);
  uint page_type= (uint) buff[PAGE_TYPE_OFFSET];
  uint dir_entries= (uint) buff[DIR_COUNT_OFFSET];
  uint dir_free= (uint) buff[DIR_FREE_OFFSET];
  uint empty_space= (uint) uint2korr(buff + EMPTY_SPACE_OFFSET);

  printf("LSN: " LSN_FMT " type: %u dir_entries: %u dir_free: %u empty_space: %u\n",
         LSN_IN_PARTS(page_lsn), page_type, dir_entries, dir_free,
         empty_space);
  /* The directory grows backwards from the end of the page */
  printf("Start of directory: %lu\n",
         maria_block_size - PAGE_SUFFIX_SIZE - dir_entries * DIR_ENTRY_SIZE);
  _ma_print_directory(share, stdout, buff, maria_block_size);
}
7565 #endif
7566