1 /* Copyright (C) 2007-2008 Michael Widenius
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15
16 /*
17 Storage of records in block
18
19 Some clarifications about the abbrev used:
20
  NULL fields -> Fields that may contain a NULL value.
22 Not null fields -> Fields that may not contain a NULL value.
23 Critical fields -> Fields that can't be null and can't be dropped without
24 causing a table reorganization.
25
26
27 Maria will have a LSN at start of each page (excluding the bitmap pages)
28
29 The different page types that are in a data file are:
30
31 Bitmap pages Map of free pages in the next extent (8192 page size
32 gives us 256M of mapped pages / bitmap)
33 Head page Start of rows are stored on this page.
34 A rowid always points to a head page
35 Blob page This page is totally filled with data from one blob or by
36 a set of long VARCHAR/CHAR fields
37 Tail page This contains the last part from different rows, blobs
38 or varchar fields.
39
40 The data file starts with a bitmap page, followed by as many data
41 pages as the bitmap can cover. After this there is a new bitmap page
42 and more data pages etc.
43
44 For information about the bitmap page, see ma_bitmap.c
45
46 Structure of data and tail page:
47
48 The page has a row directory at end of page to allow us to do deletes
49 without having to reorganize the page. It also allows us to later store
50 some more bytes after each row to allow them to grow without having to move
51 around other rows.
52
53 Page header:
54
55 LSN 7 bytes Log position for last page change
56 PAGE_TYPE 1 uchar 0 unalloced / 1 for head / 2 for tail / 3 for blob
57 DIR_COUNT 1 uchar Number of row/tail entries on page
  FREE_DIR_LINK 1 uchar Pointer to first free directory entry or 255 if
                        there is no free entry
  empty space   2 bytes Bytes of empty space on page
60
61 The most significant bit in PAGE_TYPE is set to 1 if the data on the page
62 can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
63
64 Row data
65
66 Row directory of NO entries, that consist of the following for each row
67 (in reverse order; i.e., first record is stored last):
68
69 Position 2 bytes Position of row on page
70 Length 2 bytes Length of entry
71
72 For Position and Length, the 1 most significant bit of the position and
73 the 1 most significant bit of the length could be used for some states of
74 the row (in other words, we should try to keep these reserved)
75
76 Position is 0 if the entry is not used. In this case length[0] points
77 to a previous free entry (255 if no previous entry) and length[1]
78 to the next free entry (or 255 if last free entry). This works because
79 the directory entry 255 can never be marked free (if the first directory
  entry is freed, the directory is shrunk).
81
82 checksum 4 bytes Reserved for full page read testing and live backup.
83
84 ----------------
85
86 Structure of blob pages:
87
88 LSN 7 bytes Log position for last page change
89 PAGE_TYPE 1 uchar 3
90
91 data
92
93 -----------------
94
95 Row data structure:
96
97 Flag 1 uchar Marker of which header field exists
98 TRANSID 6 bytes TRANSID of changing transaction
99 (optional, added on insert and first
100 update/delete)
101 VER_PTR 7 bytes Pointer to older version in log
102 (undo record)
103 (optional, added after first
104 update/delete)
105 DELETE_TRANSID 6 bytes (optional). TRANSID of original row.
106 Added on delete.
107 Nulls_extended 1 uchar To allow us to add new DEFAULT NULL
108 fields (optional, added after first
109 change of row after alter table)
110 Number of ROW_EXTENT's 1-3 uchar Length encoded, optional
111 This is the number of extents the
112 row is split into
113 First row_extent 7 uchar Pointer to first row extent (optional)
114
115 Total length of length array 1-3 uchar Only used if we have
116 char/varchar/blob fields.
117 Row checksum 1 uchar Only if table created with checksums
118 Null_bits .. One bit for each NULL field (a field that may
119 have the value NULL)
120 Empty_bits .. One bit for each field that may be 'empty'.
121 (Both for null and not null fields).
122 This bit is 1 if the value for the field is
123 0 or empty string.
124
125 field_offsets 2 byte/offset
126 For each 32'th field, there is one offset
127 that points to where the field information
128 starts in the block. This is to provide
129 fast access to later field in the row
130 when we only need to return a small
131 set of fields.
132 TODO: Implement this.
133
134 Things marked above as 'optional' will only be present if the
135 corresponding bit is set in 'Flag' field. Flag gives us a way to
136 get more space on a page when doing page compaction as we don't need
137 to store TRANSID that have committed before the smallest running
138 transaction we have in memory.
139
140 Data in the following order:
141 (Field order is precalculated when table is created)
142
143 Critical fixed length, not null, fields. (Note, these can't be dropped)
144 Fixed length, null fields
145
146 Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
147 Number of bytes used in length array per entry is depending on max length
148 for field.
149
150 ROW_EXTENT's
151 CHAR data (space stripped)
152 VARCHAR data
153 BLOB data
154
155 Fields marked in null_bits or empty_bits are not stored in data part or
156 length array.
157
158 If row doesn't fit into the given block, then the first EXTENT will be
159 stored last on the row. This is done so that we don't break any field
160 data in the middle.
161
162 We first try to store the full row into one block. If that's not possible
163 we move out each big blob into their own extents. If this is not enough we
164 move out a concatenation of all varchars to their own extent.
165
166 Each blob and the concatenated char/varchar fields are stored the following
167 way:
168 - Store the parts in as many full-contiguous pages as possible.
169 - The last part, that doesn't fill a full page, is stored in tail page.
170
171 When doing an insert of a new row, we don't have to have
172 VER_PTR in the row. This will make rows that are not changed stored
173 efficiently. On update and delete we would add TRANSID (if it was an old
174 committed row) and VER_PTR to
175 the row. On row page compaction we can easily detect rows where
176 TRANSID was committed before the longest running transaction
177 started and we can then delete TRANSID and VER_PTR from the row to
178 gain more space.
179
180 If a row is deleted in Maria, we change TRANSID to the deleting
181 transaction's id, change VER_PTR to point to the undo record for the delete,
182 and add DELETE_TRANSID (the id of the transaction which last
183 inserted/updated the row before its deletion). DELETE_TRANSID allows an old
184 transaction to avoid reading the log to know if it can see the last version
185 before delete (in other words it reduces the probability of having to follow
186 VER_PTR). TODO: depending on a compilation option, evaluate the performance
187 impact of not storing DELETE_TRANSID (which would make the row smaller).
188
189 Description of the different parts:
190
191 Flag is coded as:
192
193 Description bit
194 TRANS_ID_exists 0
195 VER_PTR_exists 1
196 Row is deleted 2 (Means that DELETE_TRANSID exists)
197 Nulls_extended_exists 3
198 Row is split 7 This means that 'Number_of_row_extents' exists
199
200 Nulls_extended is the number of new DEFAULT NULL fields in the row
201 compared to the number of DEFAULT NULL fields when the first version
202 of the table was created. If Nulls_extended doesn't exist in the row,
203 we know it's 0 as this must be one of the original rows from when the
  table was created first time. This coding allows us to add 255*8 =
  2040 new fields without requiring a full alter table.
206
207 Empty_bits is used to allow us to store 0, 0.0, empty string, empty
208 varstring and empty blob efficiently. (This is very good for data
209 warehousing where NULL's are often regarded as evil). Having this
210 bitmap also allows us to drop information of a field during a future
211 delete if field was deleted with ALTER TABLE DROP COLUMN. To be able
212 to handle DROP COLUMN, we must store in the index header the fields
213 that has been dropped. When unpacking a row we will ignore dropped
214 fields. When storing a row, we will mark a dropped field either with a
215 null in the null bit map or in the empty_bits and not store any data
216 for it.
217 TODO: Add code for handling dropped fields.
218
219
  A ROW EXTENT is a range of pages. One ROW_EXTENT is coded as:
221
222 START_PAGE 5 bytes
223 PAGE_COUNT 2 bytes. Bit 16 is set if this is a tail page.
224 Bit 15 is to set if this is start of a new
225 blob extent.
226
227 With 8K pages, we can cover 256M in one extent. This coding gives us a
228 maximum file size of 2^40*8192 = 8192 tera
229
230 As an example of ROW_EXTENT handling, assume a row with one integer
231 field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
232 big BLOB fields that we have updated.
233
234 The record format for storing this into an empty file would be:
235
236 Page 1:
237
238 00 00 00 00 00 00 00 LSN
239 01 Only one row in page
240 FF No free dir entry
241 xx xx Empty space on page
242
243 10 Flag: row split, VER_PTR exists
244 01 00 00 00 00 00 TRANSID 1
245 00 00 00 00 00 01 00 VER_PTR to first block in LOG file 1
246 5 Number of row extents
247 02 00 00 00 00 03 00 VARCHAR's are stored in full pages 2,3,4
248 0 No null fields
249 0 No empty fields
250 05 00 00 00 00 00 80 Tail page for VARCHAR, rowid 0
251 06 00 00 00 00 80 00 First blob, stored at page 6-133
252 05 00 00 00 00 01 80 Tail of first blob (896 bytes) at page 5
253 86 00 00 00 00 80 00 Second blob, stored at page 134-262
254 05 00 00 00 00 02 80 Tail of second blob (896 bytes) at page 5
255 05 00 5 integer
256 FA Length of first varchar field (size 250)
257 00 60 Length of second varchar field (size 8192*3)
258 00 60 10 First medium BLOB, 1M
259 01 00 10 00 Second BLOB, 1M
260 xx xx xx xx xx xx Varchars are stored here until end of page
261
262 ..... until end of page
263
264 09 00 F4 1F Start position 9, length 8180
265 xx xx xx xx Checksum
266
267 A data page is allowed to have a wrong CRC and header as long as it is
268 marked empty in the bitmap and its directory's count is 0.
269 */
270
271 #include "maria_def.h"
272 #include "ma_blockrec.h"
273 #include "trnman.h"
274 #include "ma_trnman.h"
275 #include "ma_key_recover.h"
276 #include "ma_recovery_util.h"
277 #include <lf.h>
278
279 /*
280 Struct for having a cursor over a set of extent.
281 This is used to loop over all extents for a row when reading
282 the row data. It's also used to store the tail positions for
283 a read row to be used by a later update/delete command.
284 */
285
typedef struct st_maria_extent_cursor
{
  /*
    Pointer to packed uchar array of extents for the row.
    Format is described above in the header (START_PAGE + PAGE_COUNT)
  */
  uchar *extent;
  /* Where data starts on page; Only for debugging */
  uchar *data_start;
  /* Position to all tails in the row. Updated when reading a row */
  MARIA_RECORD_POS *tail_positions;
  /* Current page number the cursor is positioned on */
  pgcache_page_no_t page;
  /* How many pages in the page region */
  uint page_count;
  /* What kind of lock to use for tail pages */
  enum pagecache_page_lock lock_for_tail_pages;
  /* Total number of extents (i.e., entries in the 'extent' slot) */
  uint extent_count;
  /* <> 0 if current extent is a tail page; Set while using cursor */
  uint tail;
  /* Position for tail on tail page */
  uint tail_row_nr;
  /*
    == 1 if we are working on the first extent (i.e., the one that is stored in
    the row header, not an extent that is stored as part of the row data).
  */
  my_bool first_extent;
} MARIA_EXTENT_CURSOR;
315
316
/**
  @brief Structure for passing down info to write_hook_for_clr_end().
  This hook needs to know the variation of the live checksum caused by the
  current operation to update state.checksum under log's mutex,
  needs to know the transaction's previous undo_lsn to set
  trn->undo_lsn under log mutex, and needs to know the type of UNDO being
  undone now to modify state.records under log mutex.
*/
325
/**
  Compute and store the row checksum at the end of a record, if the table
  uses checksums; otherwise leave D as 0 and P/L untouched.

  S:share, D:checksum_delta (out), E:expression computing the checksum,
  P:pointer_into_record where the checksum is stored, L:length (updated
  by HA_CHECKSUM_STORE_SIZE when a checksum was stored).

  NOTE: function-like macro; E is evaluated at most once, but arguments
  must still be side-effect free to stay readable.
*/
#define store_checksum_in_rec(S,D,E,P,L) do \
  { \
    D= 0; \
    if ((S)->calc_checksum != NULL) \
    { \
      D= (E); \
      ha_checksum_store(P, D); \
      L+= HA_CHECKSUM_STORE_SIZE; \
    } \
  } while (0)
337
338
/* Forward declarations of file-local helpers (defined further below) */
static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update);
#ifndef DBUG_OFF
/* Debug-only dump of a page's row directory */
static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size);
#endif
static uchar *store_page_range(MARIA_SHARE *share,
                               uchar *to, MARIA_BITMAP_BLOCK *block,
                               ulong length,
                               uint *tot_ranges);
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
358
359 /****************************************************************************
360 Initialization
361 ****************************************************************************/
362
363 /*
364 Initialize data needed for block structures
365 */
366
367
/*
  Size of the different optional header elements for a row.
  Index corresponds to the bit number in the row 'Flag' byte
  (see "Flag is coded as" in the file header above).
*/

static uchar header_sizes[]=
{
  TRANSID_SIZE,                         /* bit 0: TRANS_ID_exists */
  VERPTR_SIZE,                          /* bit 1: VER_PTR_exists */
  TRANSID_SIZE,                         /* bit 2: Delete transid */
  1                                     /* bit 3: Null extends */
};
377
378 /*
379 Calculate array of all used headers
380
381 Used to speed up:
382
383 size= 1;
384 if (flag & 1)
385 size+= TRANSID_SIZE;
386 if (flag & 2)
387 size+= VERPTR_SIZE;
388 if (flag & 4)
389 size+= TRANSID_SIZE
390 if (flag & 8)
391 size+= 1;
392
393 NOTES
394 This is called only once at startup of Maria
395 */
396
/*
  Precalculated total header size for every combination of the optional
  header flag bits; filled in once by _ma_init_block_record_data().
*/
static uchar total_header_size[1 << array_elements(header_sizes)];
#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
399
_ma_init_block_record_data(void)400 void _ma_init_block_record_data(void)
401 {
402 uint i;
403 bzero(total_header_size, sizeof(total_header_size));
404 total_header_size[0]= FLAG_SIZE; /* Flag uchar */
405 for (i= 1; i < array_elements(total_header_size); i++)
406 {
407 uint size= FLAG_SIZE, j, bit;
408 for (j= 0; (bit= (1 << j)) <= i; j++)
409 {
410 if (i & bit)
411 size+= header_sizes[j];
412 }
413 total_header_size[i]= size;
414 }
415 }
416
417
_ma_once_init_block_record(MARIA_SHARE * share,File data_file)418 my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
419 {
420 my_bool res;
421 pgcache_page_no_t last_page;
422
423 /*
424 First calculate the max file length with can have with a pointer of size
425 rec_reflength.
426
427 The 'rec_reflength - 1' is because one byte is used for row
428 position withing the page.
429 The /2 comes from _ma_transaction_recpos_to_keypos() where we use
430 the lowest bit to mark if there is a transid following the rownr.
431 */
432 last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2;
433 if (!last_page) /* Overflow; set max size */
434 last_page= ~(pgcache_page_no_t) 0;
435
436 res= _ma_bitmap_init(share, data_file, &last_page);
437 share->base.max_data_file_length= _ma_safe_mul(last_page + 1,
438 share->block_size);
439 #if SIZEOF_OFF_T == 4
440 set_if_smaller(share->base.max_data_file_length, INT_MAX32);
441 #endif
442 return res;
443 }
444
445
_ma_once_end_block_record(MARIA_SHARE * share)446 my_bool _ma_once_end_block_record(MARIA_SHARE *share)
447 {
448 int res= _ma_bitmap_end(share);
449 if (share->bitmap.file.file >= 0)
450 {
451 if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
452 share->deleting ? FLUSH_IGNORE_CHANGED : FLUSH_RELEASE))
453 res= 1;
454 /*
455 File must be synced as it is going out of the maria_open_list and so
456 becoming unknown to Checkpoint.
457 */
458 if (!share->s3_path)
459 {
460 if (share->now_transactional &&
461 mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
462 res= 1;
463 if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
464 res= 1;
465 }
466 /*
467 Trivial assignment to guard against multiple invocations
468 (May happen if file are closed but we want to keep the maria object
469 around a bit longer)
470 */
471 share->bitmap.file.file= -1;
472 }
473 if (share->id != 0)
474 {
475 /*
476 We de-assign the id even though index has not been flushed, this is ok
477 as close_lock serializes us with a Checkpoint looking at our share.
478 */
479 translog_deassign_id_from_share(share);
480 }
481 return res;
482 }
483
484
/*
  Init info->cur_row and info->new_row structures.

  Allocates, in a single my_multi_malloc() block (freed through
  row->empty_bits), all per-handler buffers needed for packing and
  unpacking rows, plus the dynamic array of bitmap blocks and an
  initial extents buffer.

  RETURN
    0  ok
    1  out of memory (any partial allocation is released)
*/

my_bool _ma_init_block_record(MARIA_HA *info)
{
  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
  MARIA_SHARE *share= info->s;
  myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  uint default_extents;
  DBUG_ENTER("_ma_init_block_record");

  /* One allocation for all buffers; owner pointer is row->empty_bits */
  if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag,
                       &row->empty_bits, share->base.pack_bytes,
                       &row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
                       &row->null_field_lengths, (sizeof(uint) *
                                                  (share->base.fields -
                                                   share->base.blobs +
                                                   EXTRA_LENGTH_FIELDS)),
                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
                                              (share->base.blobs + 2)),
                       &new_row->empty_bits, share->base.pack_bytes,
                       &new_row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &new_row->blob_lengths,
                       sizeof(ulong) * share->base.blobs,
                       &new_row->null_field_lengths, (sizeof(uint) *
                                                      (share->base.fields -
                                                       share->base.blobs +
                                                       EXTRA_LENGTH_FIELDS)),
                       &info->log_row_parts,
                       sizeof(*info->log_row_parts) *
                       (TRANSLOG_INTERNAL_PARTS + 3 +
                        share->base.fields + 3),
                       &info->update_field_data,
                       (share->base.fields * 4 +
                        share->base.max_field_lengths + 1 + 4),
                       NullS, 0))
    DBUG_RETURN(1);
  /* Skip over bytes used to store length of field length for logging */
  row->field_lengths+= 2;
  new_row->field_lengths+= 2;

  /* Reserve some initial space to avoid mallocs during execution */
  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
                    (AVERAGE_BLOB_SIZE /
                     FULL_PAGE_SIZE(share) /
                     BLOB_SEGMENT_MIN_SIZE));

  if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info->bitmap_blocks,
                            sizeof(MARIA_BITMAP_BLOCK),
                            default_extents, 64, flag))
    goto err;
  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
  if (!(info->cur_row.extents= my_malloc(PSI_INSTRUMENT_ME,
                                         info->cur_row.extents_buffer_length,
                                         flag)))
    goto err;

  info->row_base_length= share->base_length;
  info->row_flag= share->base.default_row_flag;

  /*
    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
  */
  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;

  DBUG_RETURN(0);

err:
  /* Frees whatever was allocated above; safe on partial initialization */
  _ma_end_block_record(info);
  DBUG_RETURN(1);
}
560
561
/* Free all per-handler buffers allocated by _ma_init_block_record() */

void _ma_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_end_block_record");
  my_free(info->cur_row.empty_bits);            /* Frees multi_malloc block */
  my_free(info->cur_row.extents);
  my_free(info->blob_buff);
  delete_dynamic(&info->bitmap_blocks);
  /*
    The data file is closed, when needed, in ma_once_end_block_record().
    Resetting the descriptor here protects us from doing an extra, not
    allowed, close in maria_close().
  */
  info->dfile.file= -1;
  DBUG_VOID_RETURN;
}
577
578
579 /****************************************************************************
580 Helper functions
581 ****************************************************************************/
582
583 /*
  Return the next unused position on the page after a directory entry.
585
586 SYNOPSIS
587 start_of_next_entry()
588 dir Directory entry to be used. This can not be the
589 the last entry on the page!
590
591 RETURN
592 # Position in page where next entry starts.
593 Everything between the '*dir' and this are free to be used.
594 */
595
start_of_next_entry(uchar * dir)596 static inline uint start_of_next_entry(uchar *dir)
597 {
598 uchar *prev;
599 /*
600 Find previous used entry. (There is always a previous entry as
601 the directory never starts with a deleted entry)
602 */
603 for (prev= dir - DIR_ENTRY_SIZE ;
604 prev[0] == 0 && prev[1] == 0 ;
605 prev-= DIR_ENTRY_SIZE)
606 {}
607 return (uint) uint2korr(prev);
608 }
609
610
611 /*
612 Return the offset where the previous entry ends (before on page)
613
614 SYNOPSIS
615 end_of_previous_entry()
616 dir Address for current directory entry
617 end Address to last directory entry
618
619 RETURN
620 # Position where previous entry ends (smallest address on page)
621 Everything between # and current entry are free to be used.
622 */
623
624
end_of_previous_entry(MARIA_SHARE * share,uchar * dir,uchar * end)625 static inline uint end_of_previous_entry(MARIA_SHARE *share,
626 uchar *dir, uchar *end)
627 {
628 uchar *pos;
629 for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
630 {
631 uint offset;
632 if ((offset= uint2korr(pos)))
633 return offset + uint2korr(pos+2);
634 }
635 return PAGE_HEADER_SIZE(share);
636 }
637
638
639 #ifndef DBUG_OFF
640
/* Debug helper: dump a page's row directory as position:length pairs */

static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size)
{
  uint entry_count= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
  uint prev_row_end= PAGE_HEADER_SIZE(share);
  uint row= 1;
  uchar *first, *pos;

  first= dir_entry_pos(buff, block_size, entry_count-1);
  pos= dir_entry_pos(buff, block_size, 0);

  DBUG_LOCK_FILE; /* If using DBUG_FILE */
  fprintf(file,"Directory dump (pos:length):\n");

  /* Walk from the first logical entry (highest address) downwards */
  while (first <= pos)
  {
    uint offset= uint2korr(pos);
    uint length= uint2korr(pos+2);
    fprintf(file, " %4u:%4u", offset, offset ? length : 0);
    if (!(row % (80/12)))
      fputc('\n', file);                        /* 6 entries per line */
    if (offset)
    {
      DBUG_ASSERT(offset >= prev_row_end);
      prev_row_end= offset + length;
    }
    pos-= DIR_ENTRY_SIZE;
    row++;
  }
  fputc('\n', file);
  fflush(file);
  DBUG_UNLOCK_FILE;
}
671
672
/*
  Debug helper: verify the internal consistency of a page directory.

  Checks that rows are stored in increasing order without overlaps, that
  the free space accumulated from the entries matches the page's
  empty-space field (or 'real_empty_size' when it is not (uint) -1), and
  that the free-entry list is a consistent doubly linked list covering
  exactly the deleted (position == 0) entries.
*/
static void check_directory(MARIA_SHARE *share,
                            uchar *buff, uint block_size, uint min_row_length,
                            uint real_empty_size)
{
  uchar *dir, *end;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint start_of_dir, deleted;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint empty_size_on_page;
  uint empty_size;
  uchar free_entry, prev_free_entry;

  dir= dir_entry_pos(buff, block_size, max_entry-1);
  start_of_dir= (uint) (dir - buff);
  end= dir_entry_pos(buff, block_size, 0);
  deleted= empty_size= 0;

  empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size :
                       uint2korr(buff + EMPTY_SPACE_OFFSET));

  /* Ensure that all rows are in increasing order and no overlaps */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      DBUG_ASSERT(!length || length >= min_row_length);
      /* Gap between previous row end and this row is free space */
      empty_size+= offset - end_of_prev_row;
      end_of_prev_row= offset + length;
    }
    else
      deleted++;                        /* Position 0 == deleted entry */
  }
  /* Space between the last row and the start of the directory */
  empty_size+= start_of_dir - end_of_prev_row;
  DBUG_ASSERT(end_of_prev_row <= start_of_dir);
  DBUG_ASSERT(empty_size == empty_size_on_page);

  /* check free links */
  free_entry= buff[DIR_FREE_OFFSET];
  prev_free_entry= END_OF_DIR_FREE_LIST;
  while (free_entry != END_OF_DIR_FREE_LIST)
  {
    uchar *dir= dir_entry_pos(buff, block_size, free_entry);
    DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);    /* Entry must be free */
    DBUG_ASSERT(dir[2] == prev_free_entry);     /* Back link consistent */
    prev_free_entry= free_entry;
    free_entry= dir[3];                         /* Follow forward link */
    deleted--;
  }
  /* Every deleted entry must be reachable through the free list */
  DBUG_ASSERT(deleted == 0);
}
726 #else
727 #define check_directory(A,B,C,D,E)
728 #endif /* DBUG_OFF */
729
730
731 /**
732 @brief Calculate if there is enough entries on the page
733 */
734
enough_free_entries(uchar * buff,uint block_size,uint wanted_entries)735 static my_bool enough_free_entries(uchar *buff, uint block_size,
736 uint wanted_entries)
737 {
738 uint entries= (uint) buff[DIR_COUNT_OFFSET];
739 uint needed_free_entries, free_entry;
740
741 if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
742 return 1;
743
744 /* Check if enough free entries in free list */
745 needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
746
747 free_entry= (uint) buff[DIR_FREE_OFFSET];
748 while (free_entry != END_OF_DIR_FREE_LIST)
749 {
750 uchar *dir;
751 if (!--needed_free_entries)
752 return 1;
753 dir= dir_entry_pos(buff, block_size, free_entry);
754 free_entry= dir[3];
755 }
756 return 0; /* Not enough entries */
757 }
758
759
760 /**
761 @brief Check if there is room for more rows on page
762
763 @fn enough_free_entries_on_page
764
765 @return 0 Directory is full
766 @return 1 There is room for more entries on the page
767 */
768
enough_free_entries_on_page(MARIA_SHARE * share,uchar * page_buff)769 my_bool enough_free_entries_on_page(MARIA_SHARE *share,
770 uchar *page_buff)
771 {
772 enum en_page_type page_type;
773 page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
774 ~(uchar) PAGE_CAN_BE_COMPACTED);
775
776 if (page_type == HEAD_PAGE)
777 {
778 uint row_count= (uint) page_buff[DIR_COUNT_OFFSET];
779 return !(row_count == MAX_ROWS_PER_PAGE &&
780 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
781 }
782 return enough_free_entries(page_buff, share->block_size,
783 1 + share->base.blobs);
784 }
785
786
787 /**
788 @brief Extend a record area to fit a given size block
789
790 @fn extend_area_on_page()
791 @param info Handler
792 @param buff Page buffer
793 @param dir Pointer to dir entry in buffer
794 @param rownr Row number we working on
795 @param block_size Block size of buffer
796 @param request_length How much data we want to put at [dir]
797 @param empty_space Total empty space in buffer
798 This is updated with length after dir
799 is allocated and current block freed
800 @param head_page 1 if head page, 0 for tail page
801
802 @implementation
803 The logic is as follows (same as in _ma_update_block_record())
804 - If new data fits in old block, use old block.
805 - Extend block with empty space before block. If enough, use it.
806 - Extend block with empty space after block. If enough, use it.
807 - Use _ma_compact_block_page() to get all empty space at dir.
808
809 @note
810 The given directory entry is set to rec length.
811 empty_space doesn't include the new directory entry
812
813
814 @return
815 @retval 0 ok
816 @retval ret_offset Pointer to store offset to found area
817 @retval ret_length Pointer to store length of found area
818 @retval [dir] rec_offset is store here too
819
820 @retval 1 error (wrong info in block)
821 */
822
static my_bool extend_area_on_page(MARIA_HA *info,
                                   uchar *buff, uchar *dir,
                                   uint rownr,
                                   uint request_length,
                                   uint *empty_space, uint *ret_offset,
                                   uint *ret_length,
                                   my_bool head_page)
{
  uint rec_offset, length, org_rec_length;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  DBUG_ENTER("extend_area_on_page");

  /*
    We can't check for min length here as we may have called
    extend_directory() to create a new (empty) entry just before
  */
  check_directory(share, buff, block_size, 0, *empty_space);

  rec_offset= uint2korr(dir);
  if (rec_offset)
  {
    /* Extending old row; Mark current space as 'free' */
    length= org_rec_length= uint2korr(dir + 2);
    DBUG_PRINT("info", ("rec_offset: %u length: %u request_length: %u "
                        "empty_space: %u",
                        rec_offset, org_rec_length, request_length,
                        *empty_space));

    *empty_space+= org_rec_length;
  }
  else
  {
    /*
      Reusing free directory entry; Free it from the directory list.
      The free list is doubly linked through the length bytes:
      dir[2] = previous free entry, dir[3] = next free entry.
    */
    if (dir[2] == END_OF_DIR_FREE_LIST)
      buff[DIR_FREE_OFFSET]= dir[3];            /* We were the list head */
    else
    {
      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
      prev_dir[3]= dir[3];
    }
    if (dir[3] != END_OF_DIR_FREE_LIST)
    {
      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
      next_dir[2]= dir[2];
    }
    rec_offset= start_of_next_entry(dir);
    length= 0;
  }
  if (length < request_length)
  {
    uint old_rec_offset;
    /*
      New data did not fit in old position.
      Find first possible position where to put new data.
    */
    old_rec_offset= rec_offset;
    rec_offset= end_of_previous_entry(share,
                                      dir, buff + block_size -
                                      PAGE_SUFFIX_SIZE);
    /* Grow the block backwards into the empty space before it */
    length+= (uint) (old_rec_offset - rec_offset);
    DBUG_ASSERT(old_rec_offset);
    /*
      'length' is 0 if we are doing an insert into a not allocated block.
      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
    */
    if (length < request_length)
    {
      /*
        Did not fit in current block + empty space. Extend with
        empty space after block.
      */
      if (rownr == max_entry - 1)
      {
        /* Last entry; Everything is free between this and directory */
        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
                 rec_offset);
      }
      else
        length= start_of_next_entry(dir) - rec_offset;
      DBUG_ASSERT((int) length >= 0);
      if (length < request_length)
      {
        /* Not enough continuous space, compact page to get more */
        int2store(dir, rec_offset);
        /* Reset length, as this may be a deleted block */
        int2store(dir+2, 0);
        _ma_compact_block_page(share,
                               buff, rownr, 1,
                               head_page ? info->trn->min_read_from: 0,
                               head_page ? share->base.min_block_length : 0);
        /* Re-read offset/length; compaction may have moved the block */
        rec_offset= uint2korr(dir);
        length= uint2korr(dir+2);
        if (length < request_length)
        {
          DBUG_PRINT("error", ("Not enough space: "
                               "length: %u request_length: %u",
                               length, request_length));
          _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
          DBUG_RETURN(1); /* Error in block */
        }
        *empty_space= length; /* All space is here */
      }
    }
  }
  int2store(dir, rec_offset);
  int2store(dir + 2, length);
  *ret_offset= rec_offset;
  *ret_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space - length);
  DBUG_RETURN(0);
}
942
943
944 /**
945 @brief Copy not changed fields from 'from' to 'to'
946
947 @notes
948 Assumption is that most fields are not changed!
949 (Which is why we don't test if all bits are set for some bytes in bitmap)
950 */
951
void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
                             uchar *to, uchar *from)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column= share->columndef;
  MARIA_COLUMNDEF *end_column= column + share->base.fields;
  uchar *bitmap_byte= (uchar*) changed_fields->bitmap;
  uint mask= 1;

  /*
    Walk all columns in parallel with the changed-fields bitmap and copy
    every column whose bit is NOT set (i.e. unchanged columns).
    Assumption is that most fields are unchanged, so we don't try to
    skip whole bitmap bytes at a time.
  */
  for (; column < end_column; column++)
  {
    if (!(*bitmap_byte & mask))
    {
      uint copy_length= column->length;
      if (column->type == FIELD_VARCHAR)
      {
        /* Copy the stored length prefix (1 or 2 bytes) plus the data */
        if (column->fill_length == 1)
          copy_length= (uint) from[column->offset] + 1;
        else
          copy_length= uint2korr(from + column->offset) + 2;
      }
      memcpy(to + column->offset, from + column->offset, copy_length);
    }
    /* Advance to next bit; roll over to the next bitmap byte after bit 7 */
    mask<<= 1;
    if (mask == 256)
    {
      bitmap_byte++;
      mask= 1;
    }
  }
}
982
983 #ifdef NOT_YET_NEEDED
984 /* Calculate empty space on a page */
985
empty_space_on_page(uchar * buff,uint block_size)986 static uint empty_space_on_page(uchar *buff, uint block_size)
987 {
988 enum en_page_type;
989 page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
990 ~(uchar) PAGE_CAN_BE_COMPACTED);
991 if (page_type == UNALLOCATED_PAGE)
992 return block_size;
993 if ((uint) page_type <= TAIL_PAGE)
994 return uint2korr(buff+EMPTY_SPACE_OFFSET);
995 return 0; /* Blob page */
996 }
997 #endif
998
999
1000 /*
1001 @brief Ensure we have space for new directory entries
1002
1003 @fn make_space_for_directory()
1004 @param info Handler
1005 @param buff Page buffer
1006 @param max_entry Number of current entries in directory
1007 @param count Number of new entries to be added to directory
1008 @param first_dir First directory entry on page
1009 @param empty_space Total empty space in buffer. It's updated
1010 to reflect the new empty space
1011 @param first_pos Store position to last data byte on page here
1012 @param head_page 1 if head page, 0 for tail page.
1013
1014 @note
1015 This function is inline as the argument passing is the biggest
1016 part of the function
1017
1018 @return
1019 @retval 0 ok
1020 @retval 1 error (No data on page, fatal error)
1021 */
1022
1023 static inline my_bool
make_space_for_directory(MARIA_HA * info,uchar * buff,uint max_entry,uint count,uchar * first_dir,uint * empty_space,uint * first_pos,my_bool head_page)1024 make_space_for_directory(MARIA_HA *info,
1025 uchar *buff, uint max_entry,
1026 uint count, uchar *first_dir, uint *empty_space,
1027 uint *first_pos,
1028 my_bool head_page)
1029 {
1030 uint length_needed= DIR_ENTRY_SIZE * count;
1031 MARIA_SHARE *share= info->s;
1032
1033 /*
1034 The following is not true only in the case and UNDO is used to reinsert
1035 a row on a previously not used page
1036 */
1037 if (likely(max_entry))
1038 {
1039 /* Check if there is place for the directory entry on the page */
1040 *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);
1041
1042 if ((uint) (first_dir - buff) < *first_pos + length_needed)
1043 {
1044 /* Create place for directory */
1045 _ma_compact_block_page(share,
1046 buff, max_entry - 1, 0,
1047 head_page ? info->trn->min_read_from : 0,
1048 head_page ? share->base.min_block_length : 0);
1049 *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
1050 *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
1051 if (*empty_space < length_needed)
1052 {
1053 /*
1054 We should always have space, as we only come here for
1055 UNDO of DELETE (in which case we know the row was on the
1056 page before) or if the bitmap told us there was space on page
1057 */
1058 DBUG_ASSERT(!maria_assert_if_crashed_table);
1059 return(1);
1060 }
1061 }
1062 }
1063 else
1064 *first_pos= PAGE_HEADER_SIZE(share);
1065
1066 /* Reduce directory entry size from free space size */
1067 (*empty_space)-= length_needed;
1068 buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
1069 return(0);
1070 }
1071
1072
1073 /*
1074 Find free position in directory
1075
1076 SYNOPSIS
1077 find_free_position()
1078 info Handler
1079 buff Page
1080 block_size Size of page
1081 res_rownr Store index to free position here
1082 res_length Store length of found segment here
1083 empty_space Store length of empty space on disk here. This is
1084 all empty space, including the found block.
1085 @param head_page 1 if head page, 0 for tail page.
1086
1087 NOTES
1088 If there is a free directory entry (entry with position == 0),
1089 then use it and change it to be the size of the empty block
1090 after the previous entry. This guarantees that all row entries
1091 are stored on disk in inverse directory order, which makes life easier for
1092 '_ma_compact_block_page()' and to know if there is free space after any
1093 block.
1094
  If there is no free entry (entry with position == 0), then we create
  a new one. If there is no space for the directory entry (because
  the last block overlaps with the directory), we compact the page.
1098
1099 We will update the offset and the length of the found dir entry to
1100 match the position and empty space found.
1101
1102 buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
1103
  See start of file for a description of how free directory entries are linked
1105
1106 RETURN
1107 0 Error (directory full or last block goes over directory)
1108 # Pointer to directory entry on page
1109 */
1110
static uchar *find_free_position(MARIA_HA *info,
                                 uchar *buff, uint block_size, uint *res_rownr,
                                 uint *res_length, uint *empty_space,
                                 my_bool head_page)
{
  uint max_entry, free_entry;
  uint length, first_pos;
  uchar *dir, *first_dir;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("find_free_position");

  /* Current number of directory entries and head of the free-entry list */
  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  free_entry= (uint) buff[DIR_FREE_OFFSET];
  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

  DBUG_PRINT("info", ("max_entry: %u free_entry: %u", max_entry, free_entry));

  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);

  /* Search after first free position */
  if (free_entry != END_OF_DIR_FREE_LIST)
  {
    if (free_entry >= max_entry)
      DBUG_RETURN(0);                           /* Consistency error */
    dir= dir_entry_pos(buff, block_size, free_entry);
    /*
      A free entry has offset 0; dir[2] is the backlink and dir[3] the
      forward link of the free list (list head has backlink end-marker)
    */
    DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
    /* Relink free list: next free entry becomes the new list head */
    if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
    {
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT((uint) next_entry[2] == free_entry &&
                  uint2korr(next_entry) == 0);
      next_entry[2]= END_OF_DIR_FREE_LIST;      /* Backlink */
    }

    /* Reused entry covers the gap between its neighbouring used entries */
    first_pos= end_of_previous_entry(share,
                                     dir, buff + block_size -
                                     PAGE_SUFFIX_SIZE);
    length= start_of_next_entry(dir) - first_pos;
    int2store(dir, first_pos);                  /* Update dir entry */
    int2store(dir + 2, 0);                      /* Length filled in later */
    *res_rownr= free_entry;
    *res_length= length;

    check_directory(share, buff, block_size,
                    head_page ? share->base.min_block_length : 0, (uint) -1);
    DBUG_RETURN(dir);
  }
  /* No free places in dir; create a new one */

  /* Check if there is place for the directory entry */
  if (max_entry == MAX_ROWS_PER_PAGE)
    DBUG_RETURN(0);

  if (make_space_for_directory(info, buff, max_entry, 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(0);

  /* New entry is placed just below the previous first entry */
  dir= first_dir - DIR_ENTRY_SIZE;
  /* It covers everything from the end of the last row up to the directory */
  length= (uint) (dir - buff - first_pos);
  DBUG_ASSERT(length <= *empty_space);
  int2store(dir, first_pos);
  int2store(dir + 2, 0);                        /* Max length of region */
  *res_rownr= max_entry;
  *res_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space);
  DBUG_RETURN(dir);
}
1183
1184
1185 /**
1186 @brief Enlarge page directory to hold more entries
1187
1188 @fn extend_directory()
1189 @param info Handler
1190 @param buff Page buffer
1191 @param block_size Block size
1192 @param max_entry Number of directory entries on page
1193 @param new_entry Position for new entry
1194 @param empty_space Total empty space in buffer. It's updated
1195 to reflect the new empty space
1196 @param head_page 1 if head page, 0 for tail page.
1197
1198 @note
1199 This is only called on UNDO when we want to expand the directory
1200 to be able to re-insert row in a given position
1201
1202 The new directory entry will be set to cover the maximum possible space
1203
1204 @return
1205 @retval 0 ok
1206 @retval 1 error (No data on page, fatal error)
1207 */
1208
static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
                                uint max_entry, uint new_entry,
                                uint *empty_space, my_bool head_page)
{
  uint length, first_pos;
  uchar *dir, *first_dir;
  DBUG_ENTER("extend_directory");

  /*
    Note that if max_entry is 0, then first_dir will point to
    an illegal directory entry. This is ok, as in this case we will
    not access anything through first_dir.
  */
  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;

  /* Grow the directory so it holds entries 0 .. new_entry */
  if (make_space_for_directory(info, buff, max_entry,
                               new_entry - max_entry + 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(1);

  /* Set the new directory entry to cover the max possible length */
  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
  length= (uint) (dir - buff - first_pos);
  int2store(dir, first_pos);
  int2store(dir+2, length);
  *empty_space-= length;

  if (new_entry-- > max_entry)
  {
    /* Link all row entries between new_entry and max_entry into free list */
    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
    uint prev_entry= END_OF_DIR_FREE_LIST;
    buff[DIR_FREE_OFFSET]= new_entry;
    do
    {
      /*
        Mark entry free (offset 0); dir[2] is the backlink, dir[3] the
        forward link to the next (lower-numbered) freed entry
      */
      dir+= DIR_ENTRY_SIZE;
      dir[0]= dir[1]= 0;
      dir[2]= (uchar) prev_entry;
      dir[3]= (uchar) new_entry-1;
      prev_entry= new_entry;
    } while (new_entry-- > max_entry);
    /* Attach the old free list after the last freed entry */
    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= max_entry;
    }
  }

  check_directory(info->s,
                  buff, block_size,
                  head_page ? MY_MIN(info->s->base.min_block_length, length) :
                  0, *empty_space);
  DBUG_RETURN(0);
}
1266
1267
1268 /****************************************************************************
1269 Updating records
1270 ****************************************************************************/
1271
1272 /*
1273 Calculate length of all the different field parts
1274
1275 SYNOPSIS
1276 calc_record_size()
1277 info Maria handler
1278 record Row to store
1279 row Store statistics about row here
1280
1281 NOTES
1282 The statistics is used to find out how much space a row will need
1283 and also where we can split a row when we need to split it into several
1284 extents.
1285 */
1286
static void calc_record_size(MARIA_HA *info, const uchar *record,
                             MARIA_ROW *row)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data;
  MARIA_COLUMNDEF *column, *end_column;
  uint *null_field_lengths= row->null_field_lengths;
  ulong *blob_lengths= row->blob_lengths;
  DBUG_ENTER("calc_record_size");

  row->normal_length= row->char_length= row->varchar_length=
    row->blob_length= row->extents_count= 0;

  /* Create empty bitmap and calculate length of each varlength/char field */
  bzero(row->empty_bits, share->base.pack_bytes);
  field_length_data= row->field_lengths;
  for (column= share->columndef + share->base.fixed_not_null_fields,
       end_column= share->columndef + share->base.fields;
       column < end_column; column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit))
    {
      /* NULL field; takes no data space (blobs track length separately) */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
      DBUG_ASSERT(column->empty_bit == 0);
      /* fall through */
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      row->normal_length+= column->length;
      *null_field_lengths= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      /* All-zero value is stored as just an empty bit */
      if (memcmp(record+ column->offset, maria_zero_string,
                 column->length) == 0)
      {
        row->empty_bits[column->empty_pos] |= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        row->normal_length+= column->length;
        *null_field_lengths= column->length;
      }
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Strip trailing spaces; store remaining length in 1 or 2 bytes */
      const uchar *pos, *end;
      for (pos= record + column->offset, end= pos + column->length;
           end > pos && end[-1] == ' '; end--)
        ;
      if (pos == end)                           /* If empty string */
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        uint length= (uint) (end - pos);
        if (column->length <= 255)
          *field_length_data++= (uchar) length;
        else
        {
          int2store(field_length_data, length);
          field_length_data+= 2;
        }
        row->char_length+= length;
        *null_field_lengths= length;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length, field_length_data_length;
      const uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      field_length_data[0]= field_pos[0];
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_pos;
        field_length_data_length= 1;
      }
      else
      {
        length= uint2korr(field_pos);
        field_length_data[1]= field_pos[1];
        field_length_data_length= 2;
      }
      *null_field_lengths= length;
      if (!length)
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        break;
      }
      /* Removed a redundant duplicate '*null_field_lengths= length' here */
      row->varchar_length+= length;
      field_length_data+= field_length_data_length;
      break;
    }
    case FIELD_BLOB:
    {
      const uchar *field_pos= record + column->offset;
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

      *blob_lengths++= blob_length;
      if (!blob_length)
        row->empty_bits[column->empty_pos]|= column->empty_bit;
      else
      {
        row->blob_length+= blob_length;
        /* Store the blob length prefix among the field lengths */
        memcpy(field_length_data, field_pos, size_length);
        field_length_data+= size_length;
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
  /*
    - info->row_base_length is base information we must have on a page in first
      extent:
      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
        table_checksum (0 | 1)
    - row->min_length is minimum amount of data we must store on
      a page. bitmap code will ensure we get at least this much +
      total number of extents and one extent information
    - fixed_not_null_fields_length is length of fixed length fields that can't
      be compacted
    - head_length is the amount of data for the head page
      (ie, all fields except blobs)
  */
  row->min_length= (info->row_base_length +
                    (share->base.max_field_lengths ?
                     size_to_store_key_length(row->field_lengths_length) :
                     0));
  row->head_length= (row->min_length +
                     share->base.fixed_not_null_fields_length +
                     row->field_lengths_length +
                     row->normal_length +
                     row->char_length + row->varchar_length);
  row->total_length= (row->head_length + row->blob_length);
  /* A row always occupies at least the table's minimum block length */
  if (row->total_length < share->base.min_block_length)
    row->total_length= share->base.min_block_length;
  DBUG_PRINT("exit", ("head_length: %lu total_length: %lu",
                      (ulong) row->head_length, (ulong) row->total_length));
  DBUG_VOID_RETURN;
}
1444
1445
1446 /**
1447 Compact page by removing all space between rows
1448
1449 Moves up all rows to start of page. Moves blocks that are directly after
1450 each other with one memmove.
1451
1452 @note if rownr is the last row in the page, and extend_block is false,
1453 caller has to make sure to update bitmap page afterwards to reflect freed
1454 space.
1455
1456 @param buff Page to compact
1457 @param block_size Size of page
1458 @param rownr Put empty data after this row
1459 @param extend_block If 1, extend the block at 'rownr' to cover the
1460 whole block.
  @param min_read_from If <> 0, remove all trid's that are less than this
  @param min_row_length Minimum length a kept row must occupy on the page
*/
1463
void _ma_compact_block_page(MARIA_SHARE *share,
                            uchar *buff, uint rownr,
                            my_bool extend_block, TrID min_read_from,
                            uint min_row_length)
{
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
  uint freed_size= 0;
  uint block_size= share->block_size;
  uchar *dir, *end;
  DBUG_ENTER("_ma_compact_block_page");
  DBUG_PRINT("enter", ("rownr: %u min_read_from: %lu", rownr,
                       (ulong) min_read_from));
  DBUG_ASSERT(max_entry > 0 &&
              max_entry < (block_size - PAGE_HEADER_SIZE(share) -
                           PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);

  /* Move all entries before and including rownr up to start of page */
  dir= dir_entry_pos(buff, block_size, rownr);
  end= dir_entry_pos(buff, block_size, 0);
  page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share);
  diff= 0;
  /* Iterate rows 0..rownr; directory grows downwards, so 'end' moves down */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);

    if (offset)                             /* Skip free directory entries */
    {
      uint row_length= uint2korr(end + 2);
      DBUG_ASSERT(offset >= page_pos);
      DBUG_ASSERT(buff + offset + row_length <= dir);
      DBUG_ASSERT(row_length >= min_row_length || row_length == 0);

      /* Row length can be zero if row is to be deleted */
      if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff+offset+1);
        if (transid < min_read_from)
        {
          /* Remove transid from row by moving the start point of the row up */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          freed_size+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(end+2, row_length);
        }
      }

      if (offset != next_free_pos)
      {
        uint length= (next_free_pos - start_of_found_block);
        /*
          There was empty space before this and prev block
          Check if we have to move previous block up to page start
        */
        if (page_pos != start_of_found_block)
        {
          /* move up previous block */
          memmove(buff + page_pos, buff + start_of_found_block, length);
        }
        page_pos+= length;
        /* next continuous block starts here */
        start_of_found_block= offset;
        diff= offset - page_pos;
      }
      int2store(end, offset - diff);        /* correct current pos */
      next_free_pos= offset + row_length;

      if (unlikely(row_length < min_row_length) && row_length)
      {
        /*
          This can only happen in the case we compacted transid and
          the row become 'too short'

          Move the current row down to it's right place and extend it
          with 0.
        */
        uint row_diff= min_row_length - row_length;
        uint length= (next_free_pos - start_of_found_block);

        DBUG_ASSERT(page_pos != start_of_found_block);
        bmove(buff + page_pos, buff + start_of_found_block, length);
        bzero(buff+ page_pos + length, row_diff);
        page_pos+= min_row_length;
        int2store(end+2, min_row_length);
        freed_size-= row_diff;
        next_free_pos= start_of_found_block= page_pos;
        diff= 0;
      }
    }
  }
  /* Flush the last pending run of contiguous rows to its final place */
  if (page_pos != start_of_found_block)
  {
    uint length= (next_free_pos - start_of_found_block);
    memmove(buff + page_pos, buff + start_of_found_block, length);
  }
  start_of_found_block= uint2korr(dir);

  if (rownr != max_entry - 1)
  {
    /* Move all entries after rownr to end of page */
    uint rownr_length;

    DBUG_ASSERT(extend_block);                /* Should always be true */
    next_free_pos= end_of_found_block= page_pos=
      block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
    diff= 0;
    /* End points to entry before 'rownr' */
    for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
    {
      uint offset= uint2korr(dir);
      uint row_length;
      uint row_end;
      if (!offset)                            /* Skip free entries */
        continue;
      row_length= uint2korr(dir + 2);
      row_end= offset + row_length;
      DBUG_ASSERT(offset >= start_of_found_block &&
                  row_end <= next_free_pos && row_length >= min_row_length);

      if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
      {
        TrID transid= transid_korr(buff + offset+1);
        if (transid < min_read_from)
        {
          /* Remove transid from row */
          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
          offset+= TRANSID_SIZE;
          row_length-= TRANSID_SIZE;
          int2store(dir+2, row_length);
        }
        if (unlikely(row_length < min_row_length))
        {
          /*
            This can only happen in the case we compacted transid and
            the row become 'too short'
          */
          uint row_diff= min_row_length - row_length;
          if (next_free_pos < row_end + row_diff)
          {
            /*
              Not enough space for extending next block with enough
              end 0's. Move current data down to get place for them
            */
            uint move_down= row_diff - (next_free_pos - row_end);
            bmove(buff + offset - move_down, buff + offset, row_length);
            offset-= move_down;
          }
          /*
            Extend the next block with 0, which will be part of current
            row when the blocks are joined together later
          */
          bzero(buff + next_free_pos - row_diff, row_diff);
          next_free_pos-= row_diff;
          int2store(dir+2, min_row_length);
        }
        row_end= offset + row_length;
      }

      if (row_end != next_free_pos)
      {
        uint length= (end_of_found_block - next_free_pos);
        if (page_pos != end_of_found_block)
        {
          /* move next block down */
          memmove(buff + page_pos - length, buff + next_free_pos, length);
        }
        page_pos-= length;
        /* next continuous block starts here */
        end_of_found_block= row_end;
        diff= page_pos - row_end;
      }
      int2store(dir, offset + diff);          /* correct current pos */
      next_free_pos= offset;
    }
    /* Flush last pending run of rows towards the end of the page */
    if (page_pos != end_of_found_block)
    {
      uint length= (end_of_found_block - next_free_pos);
      memmove(buff + page_pos - length, buff + next_free_pos, length);
      next_free_pos= page_pos- length;
    }

    /* Extend rownr block to cover hole */
    rownr_length= next_free_pos - start_of_found_block;
    int2store(dir+2, rownr_length);
    DBUG_ASSERT(rownr_length >= min_row_length);
  }
  else
  {
    if (extend_block)
    {
      /* Extend last block to cover whole page */
      uint length= ((uint) (dir - buff) - start_of_found_block);
      int2store(dir+2, length);
      DBUG_ASSERT(length >= min_row_length);
    }
    else
    {
      /* Add length gained from freed transaction id's to this page */
      uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
      int2store(buff + EMPTY_SPACE_OFFSET, length);
    }
    /* Page is now fully compacted; clear the "can be compacted" hint */
    buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
  }
  check_directory(share, buff, block_size, min_row_length,
                  extend_block ? 0 : (uint) -1);
  DBUG_EXECUTE("directory", _ma_print_directory(share,
                                                DBUG_FILE, buff, block_size););
  DBUG_VOID_RETURN;
}
1674
1675
1676 /*
1677 Create an empty tail or head page
1678
1679 SYNOPSIS
1680 make_empty_page()
1681 buff Page buffer
1682 block_size Block size
1683 page_type HEAD_PAGE or TAIL_PAGE
1684 create_dir_entry TRUE of we should create a directory entry
1685
1686 NOTES
1687 EMPTY_SPACE is not updated
1688 */
1689
static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
                            my_bool create_dir_entry)
{
  uint block_size= info->s->block_size;
  DBUG_ENTER("make_empty_page");

  /* Clear the fixed page header */
  bzero(buff, PAGE_HEADER_SIZE(info->s));

#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
  /*
    We zero the rest of the block to avoid getting old memory information
    to disk and to allow the file to be compressed better if archived.
    The code does not assume the block is zeroed.
  */
  if (page_type != BLOB_PAGE)
    bzero(buff+ PAGE_HEADER_SIZE(info->s),
          block_size - PAGE_HEADER_SIZE(info->s));
#endif
  buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
  buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
  buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;
  if (create_dir_entry)
  {
    /* Create directory entry to point to start of page with size 0 */
    uchar *dir= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
    int2store(dir, PAGE_HEADER_SIZE(info->s));
    int2store(dir + 2, 0);
  }
  DBUG_VOID_RETURN;
}
1720
1721
1722 /*
1723 Read or initialize new head or tail page
1724
1725 SYNOPSIS
1726 get_head_or_tail_page()
1727 info Maria handler
1728 block Block to read
1729 buff Suggest this buffer to key cache
1730 length Minimum space needed
1731 page_type HEAD_PAGE || TAIL_PAGE
1732 res Store result position here
1733
1734 NOTES
1735 We don't decremented buff[EMPTY_SPACE_OFFSET] with the allocated data
1736 as we don't know how much data the caller will actually use.
1737
1738 res->empty_space is set to length of empty space
1739
1740 RETURN
1741 0 ok All slots in 'res' are updated
1742 1 error my_errno is set
1743 */
1744
/* Result of locating/creating room for a row on a head or tail page */
struct st_row_pos_info
{
  uchar *buff;                     /* Page buffer */
  uchar *data;                     /* Place on page where row data goes */
  uchar *dir;                      /* Directory entry for the row */
  uint length;                     /* Length available for data */
  uint rownr;                      /* Offset (index) in directory */
  uint empty_space;                /* Total space left on page */
};
1754
1755
static my_bool get_head_or_tail_page(MARIA_HA *info,
                                     const MARIA_BITMAP_BLOCK *block,
                                     uchar *buff, uint length, uint page_type,
                                     enum pagecache_page_lock lock,
                                     struct st_row_pos_info *res)
{
  uint block_size;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("get_head_or_tail_page");
  DBUG_PRINT("enter", ("page_type: %u length: %u", page_type, length));

  block_size= share->block_size;
  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page: initialize it in 'buff' with one dir entry at rownr 0 */
    make_empty_page(info, buff, page_type, 1);
    res->buff= buff;
    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share));
    res->data= (buff + PAGE_HEADER_SIZE(share));
    res->dir= res->data + res->length;
    res->rownr= 0;
    DBUG_ASSERT(length <= res->length);
  }
  else
  {
    uchar *dir;
    /* Read old page; pin it so it stays in memory until commit of the row */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    res->buff= pagecache_read(share->pagecache, &info->dfile,
                              block->page, 0, 0, share->page_type,
                              lock, &page_link.link);
    page_link.changed= res->buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)                     /* Read error */
      goto crashed;

    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                page_type);
    if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr,
                                  &res->length, &res->empty_space,
                                  page_type == HEAD_PAGE)))
      goto crashed;

    if (res->length < length)
    {
      /* Found slot too small; try compacting to merge the free space */
      if (res->empty_space + res->length >= length)
      {
        _ma_compact_block_page(share,
                               res->buff, res->rownr, 1,
                               (page_type == HEAD_PAGE ?
                                info->trn->min_read_from : 0),
                               (page_type == HEAD_PAGE ?
                                share->base.min_block_length :
                                0));
        /* All empty space are now after current position */
        dir= dir_entry_pos(res->buff, block_size, res->rownr);
        res->length= res->empty_space= uint2korr(dir+2);
      }
      if (res->length < length)
      {
        DBUG_PRINT("error", ("length: %u res->length: %u empty_space: %u",
                             length, res->length, res->empty_space));
        goto crashed;                           /* Wrong bitmap information */
      }
    }
    res->dir= dir;
    res->data= res->buff + uint2korr(dir);
  }
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1832
1833
1834 /*
1835 @brief Create room for a head or tail row on a given page at given position
1836
1837 @fn get_rowpos_in_head_or_tail_page()
1838 @param info Maria handler
1839 @param block Block to read
1840 @param buff Suggest this buffer to key cache
1841 @param length Minimum space needed
1842 @param page_type HEAD_PAGE || TAIL_PAGE
1843 @param rownr Rownr to use
1844 @param res Store result position here
1845
1846 @note
    This is essentially the same as get_head_or_tail_page, with the difference
    that the caller specifies at what position the row should be put.
    This is used when restoring a row to its original position as
    part of UNDO DELETE or UNDO UPDATE
1851
1852 @return
1853 @retval 0 ok All slots in 'res' are updated
1854 @retval 1 error my_errno is set
1855 */
1856
static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
                                               const MARIA_BITMAP_BLOCK *block,
                                               uchar *buff, uint length,
                                               uint page_type,
                                               enum pagecache_page_lock lock,
                                               uint rownr,
                                               struct st_row_pos_info *res)
{
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *dir;
  uint block_size= share->block_size;
  uint max_entry, max_length, rec_offset;
  DBUG_ENTER("get_rowpos_in_head_or_tail_page");

  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; no directory entry yet, it is created on demand below */
    make_empty_page(info, buff, page_type, 0);
    res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
  }
  else
  {
    /* Read and pin the existing page */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         block->page, 0, 0, share->page_type,
                         lock, &page_link.link);
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)                     /* Read error */
      goto err;
    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                (uchar) page_type);
    /* Same check as the assert, but also protects release builds */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
      goto err;
    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  }

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  if (max_entry <= rownr)
  {
    /* Directory is too small to address 'rownr'; grow it */
    if (extend_directory(info, buff, block_size,
                         max_entry, rownr, &res->empty_space,
                         page_type == HEAD_PAGE))
      goto err;
  }

  /*
    The following dir entry is unused in case of insert / update but
    not in case of undo_update / undo_delete
  */
  dir= dir_entry_pos(buff, block_size, rownr);

  if (extend_area_on_page(info, buff, dir, rownr, length,
                          &res->empty_space, &rec_offset, &max_length,
                          page_type == HEAD_PAGE))
    goto err;

  res->buff= buff;
  res->rownr= rownr;
  res->dir= dir;
  res->data= buff + rec_offset;
  res->length= length;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1927
1928
1929 /*
1930 Write tail for head data or blob
1931
1932 SYNOPSIS
1933 write_tail()
1934 info Maria handler
1935 block Block to tail page
1936 row_part Data to write to page
1937 length Length of data
1938
1939 NOTES
1940 block->page_count is updated to the directory offset for the tail
1941 so that we can store the position in the row extent information
1942
1943 RETURN
1944 0 ok
1945 block->page_count is set to point (dir entry + TAIL_BIT)
1946
1947 1 error; In this case my_errno is set to the error
1948 */
1949
static my_bool write_tail(MARIA_HA *info,
                          MARIA_BITMAP_BLOCK *block,
                          uchar *row_part, uint org_length)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  uint block_size= share->block_size, empty_space, length= org_length;
  struct st_row_pos_info row_pos;
  my_off_t position;
  my_bool res, block_is_read;
  DBUG_ENTER("write_tail");
  DBUG_PRINT("enter", ("page: %lu length: %u",
                       (ulong) block->page, length));

  info->keyread_buff_used= 1;
  /*
    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
    some place to grow in the future)
  */
  if (length < MIN_TAIL_SIZE)
    length= MIN_TAIL_SIZE;

  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
  {
    /*
      Create new tail
      page will be pinned & locked by get_head_or_tail_page
    */
    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
                              &row_pos))
      DBUG_RETURN(1);
  }
  else
  {
    /* Write tail on predefined row position (UNDO restores to old rownr) */
    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
                                        length, TAIL_PAGE,
                                        PAGECACHE_LOCK_WRITE,
                                        block->page_count & ~TAIL_BIT,
                                        &row_pos))
      DBUG_RETURN(1);
  }
  DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
                      (ulong) ma_recordpos(block->page, row_pos.rownr),
                      (ulong) block->page, row_pos.rownr));

  /* Non-zero org_bitmap_value means the page already existed on disk */
  block_is_read= block->org_bitmap_value != 0;

  memcpy(row_pos.data, row_part, org_length);

  if (share->now_transactional)
  {
    /* Log changes in tail block */
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
    LSN lsn;

    /*
      Log REDO changes of tail page
      Note that we have to log length, not org_length, to be sure that
      REDO, which doesn't use write_tail, also creates a block of at least
      MIN_TAIL_SIZE
    */
    page_store(log_data + FILEID_STORE_SIZE, block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos.rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos.data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
    if (translog_write_record(&lsn,
                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
                               LOGREC_REDO_NEW_ROW_TAIL),
                              info->trn, info,
                              (translog_size_t) (sizeof(log_data) + length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      DBUG_RETURN(1);
  }

  /* Store the entry length in its directory slot (offset + 2 is length) */
  int2store(row_pos.dir + 2, length);
  empty_space= row_pos.empty_space - length;
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
  /* Return the tail position to the caller through block->page_count */
  block->page_count= row_pos.rownr + TAIL_BIT;
  /*
    If there is less directory entries free than number of possible tails
    we can write for a row, we mark the page full to ensure that we don't
    during _ma_bitmap_find_place() allocate more entries on the tail page
    than it can hold
  */
  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
                                           1 + share->base.blobs) ?
                       empty_space : 0);
  /* Keep BLOCKUSED_USE_ORG_BITMAP */
  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;

  if (block_is_read)
  {
    /* Current page link is last element in pinned_pages */
    MARIA_PINNED_PAGE *page_link;
    page_link= dynamic_element(&info->pinned_pages,
                               info->pinned_pages.elements-1,
                               MARIA_PINNED_PAGE*);
    /* Downgrade write lock to read; page stays pinned until statement end */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    DBUG_ASSERT(page_link->changed);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    res= 0;
  }
  else
  {
    /* New page: write it to the pagecache and pin it */
    if (!(res= pagecache_write(share->pagecache,
                               &info->dfile, block->page, 0,
                               row_pos.buff,share->page_type,
                               PAGECACHE_LOCK_READ,
                               PAGECACHE_PIN,
                               PAGECACHE_WRITE_DELAY, &page_link.link,
                               LSN_IMPOSSIBLE)))
    {
      DBUG_ASSERT(page_link.link);
      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
      page_link.changed= 1;
      push_dynamic(&info->pinned_pages, (void*) &page_link);
    }

    /* Increase data file size, if extended */
    position= (my_off_t) block->page * block_size;
    if (share->state.state.data_file_length <= position)
    {
      /*
        We are modifying a state member before writing the UNDO; this is a WAL
        violation. But for data_file_length this is ok, as long as we change
        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
        collect_tables()).
      */
      _ma_set_share_data_file_length(share, position + block_size);
    }
  }
  DBUG_RETURN(res);
}
2093
2094
2095 /*
2096 Write full pages
2097
2098 SYNOPSIS
2099 write_full_pages()
2100 info Maria handler
2101 lsn LSN for the undo record
2102 block Where to write data
2103 data Data to write
2104 length Length of data
2105
2106 NOTES
2107 Logging of the changes to the full pages are done in the caller
2108 write_block_record().
2109
2110 RETURN
2111 0 ok
2112 1 error on write
2113 */
2114
static my_bool write_full_pages(MARIA_HA *info,
                                LSN lsn,
                                MARIA_BITMAP_BLOCK *block,
                                uchar *data, ulong length)
{
  pgcache_page_no_t page;
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint data_size= FULL_PAGE_SIZE(share);
  uchar *buff= info->keyread_buff;
  uint page_count, sub_blocks;
  my_off_t position, max_position;
  DBUG_ENTER("write_full_pages");
  DBUG_PRINT("enter", ("length: %lu page: %lu page_count: %lu",
                       (ulong) length, (ulong) block->page,
                       (ulong) block->page_count));
  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);

  info->keyread_buff_used= 1;
  page= block->page;
  page_count= block->page_count;
  sub_blocks= block->sub_blocks;

  /* End position of the first extent; grown below if more extents follow */
  max_position= (my_off_t) (page + page_count) * block_size;

  for (; length; data+= data_size)
  {
    uint copy_length;
    if (!page_count--)
    {
      /* Current extent exhausted; move to the next one in the array */
      if (!--sub_blocks)
      {
        /* More data left but no more extents: extent info is corrupt */
        _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
        DBUG_RETURN(1);
      }

      block++;
      page= block->page;
      page_count= block->page_count - 1;
      DBUG_PRINT("info", ("page: %lu  page_count: %lu",
                          (ulong) block->page, (ulong) block->page_count));

      position= (page + page_count + 1) * block_size;
      set_if_bigger(max_position, position);
    }
    /* Build the full (blob) page: LSN + page type + zeroed rest of header */
    lsn_store(buff, lsn);
    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
    bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
          FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));
    copy_length= MY_MIN(data_size, length);
    memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length);
    length-= copy_length;

    /*
      Zero out old information from the block. This removes possible
      sensitive information from the block and also makes the file
      easier to compress and easier to compare after recovery.
    */
    if (copy_length != data_size)
      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
            (data_size - copy_length) + PAGE_SUFFIX_SIZE);

    if (pagecache_write(share->pagecache,
                        &info->dfile, page, 0,
                        buff, share->page_type,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        PAGECACHE_PIN_LEFT_UNPINNED,
                        PAGECACHE_WRITE_DELAY,
                        0, info->trn->rec_lsn))
      DBUG_RETURN(1);
    page++;
    DBUG_ASSERT(block->used & BLOCKUSED_USED);
  }
  /* Increase data file size, if extended */
  if (share->state.state.data_file_length < max_position)
    _ma_set_share_data_file_length(share, max_position);
  DBUG_RETURN(0);
}
2194
2195
2196 /*
2197 Store ranges of full pages in compact format for logging
2198
2199 SYNOPSIS
2200 store_page_range()
2201 to Store data here
2202 block Where pages are to be written
2203 length Length of data to be written
2204 Normally this is full pages, except for the last
2205 tail block that may only partly fit the last page.
2206 tot_ranges Add here the number of ranges used
2207
2208 NOTES
2209 The format of one entry is:
2210
2211 Ranges SUB_RANGE_SIZE
2212 Empty bytes at end of last byte BLOCK_FILLER_SIZE
2213 For each range
2214 Page number PAGE_STORE_SIZE
2215 Number of pages PAGERANGE_STORE_SIZE
2216
2217 RETURN
2218 # end position for 'to'
2219 */
2220
store_page_range(MARIA_SHARE * share,uchar * to,MARIA_BITMAP_BLOCK * block,ulong length,uint * tot_ranges)2221 static uchar *store_page_range(MARIA_SHARE *share,
2222 uchar *to, MARIA_BITMAP_BLOCK *block,
2223 ulong length,
2224 uint *tot_ranges)
2225 {
2226 uint data_size= FULL_PAGE_SIZE(share);
2227 ulong pages_left= (length + data_size -1) / data_size;
2228 uint page_count, ranges, empty_space;
2229 uchar *to_start;
2230 DBUG_ENTER("store_page_range");
2231
2232 to_start= to;
2233 to+= SUB_RANGE_SIZE;
2234
2235 /* Store number of unused bytes at last page */
2236 empty_space= (uint) (pages_left * data_size - length);
2237 int2store(to, empty_space);
2238 to+= BLOCK_FILLER_SIZE;
2239
2240 ranges= 0;
2241 do
2242 {
2243 pgcache_page_no_t page;
2244 page= block->page;
2245 page_count= block->page_count;
2246 block++;
2247 if (page_count > pages_left)
2248 page_count= pages_left;
2249
2250 page_store(to, page);
2251 to+= PAGE_STORE_SIZE;
2252 pagerange_store(to, page_count);
2253 to+= PAGERANGE_STORE_SIZE;
2254 ranges++;
2255 } while ((pages_left-= page_count));
2256 /* Store number of ranges for this block */
2257 int2store(to_start, ranges);
2258 (*tot_ranges)+= ranges;
2259
2260 DBUG_RETURN(to);
2261 }
2262
2263
2264 /*
2265 Store packed extent data
2266
2267 SYNOPSIS
2268 store_extent_info()
2269 to Store first packed data here
2270 row_extents_second_part Store rest here
2271 first_block First block to store
2272 count Number of blocks
2273
2274 NOTES
2275 We don't have to store the position for the head block
2276
2277 We have to set the START_EXTENT_BIT for every extent where the
2278 blob will be stored on a page of it's own. We need this in the
2279 UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
2280 undo-update.
2281 */
2282
static void store_extent_info(uchar *to,
                              uchar *row_extents_second_part,
                              MARIA_BITMAP_BLOCK *first_block,
                              uint count)
{
  MARIA_BITMAP_BLOCK *cur, *last;
  uint used_length;
  my_bool wrote_first_extent= 0;
  DBUG_ENTER("store_extent_info");
  DBUG_PRINT("enter", ("count: %u", count));

  for (cur= first_block, last= first_block + count ;
       cur < last ; cur++)
  {
    /* Marker (unused) blocks are the only ones without BLOCKUSED_USED */
    if (likely(cur->used & BLOCKUSED_USED))
    {
      uint page_count= cur->page_count;
      DBUG_ASSERT(page_count != 0);
      page_store(to, cur->page);
      if (cur->sub_blocks)
      {
        /*
          Mark this extent as the first one of a blob; the UNDO phase
          needs this to rebuild the MARIA_BITMAP_BLOCK array.
        */
        page_count|= START_EXTENT_BIT;
      }
      pagerange_store(to + PAGE_STORE_SIZE, page_count);
      DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
      to+= ROW_EXTENT_SIZE;
      if (!wrote_first_extent)
      {
        /* First extent lives in the row header; the rest follow elsewhere */
        wrote_first_extent= 1;
        to= row_extents_second_part;
      }
    }
  }
  used_length= (count - 1) * ROW_EXTENT_SIZE;
  /*
    In some unlikely cases we have allocated too many blocks. Clear the
    unused part of the extent area.
  */
  bzero(to, (size_t) (row_extents_second_part + used_length - to));
  DBUG_VOID_RETURN;
}
2329
2330
2331 /**
2332 @brief
2333 Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
2334 for write_block_record
2335
2336 @note
2337 In case of blobs, this function marks all the blob pages in the bitmap
2338 as full pages. The bitmap bits for other pages will be marked
2339 when write_block_record() calls _ma_bitmap_release_unused().
2340
2341 This function will be removed in Maria 2.0 when we instead of delete rows
2342 mark them as deleted and only remove them after commit.
2343
2344 @return
2345 @retval 0 ok
2346 @retval 1 Error (out of memory or disk error changing bitmap) or
2347 wrong information in extent information
2348 */
2349
static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
                                       MARIA_BITMAP_BLOCKS *blocks,
                                       pgcache_page_no_t head_page,
                                       uint extent_count,
                                       const uchar *extent_info)
{
  MARIA_BITMAP_BLOCK *block, *start_block;
  MARIA_SHARE *share= info->s;
  uint i, tail_page;
  DBUG_ENTER("extent_to_bitmap_blocks");

  /* +2: one extra entry for the head block and one spare */
  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
    DBUG_RETURN(1);
  block= blocks->block= dynamic_element(&info->bitmap_blocks, 0,
                                        MARIA_BITMAP_BLOCK*);
  blocks->count= extent_count + 1;
  blocks->tail_page_skipped= blocks->page_skipped= 0;
  /* First entry always describes the head page */
  block->page= head_page;
  block->page_count= 1;
  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
  /* Impossible value, will force storage of real value */
  block->org_bitmap_value= 255;

  start_block= block++;
  for (i=0 ;
       i++ < extent_count ;
       block++, extent_info+= ROW_EXTENT_SIZE)
  {
    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
    if (page_count & START_EXTENT_BIT)
    {
      /* First extent of a blob: close the previous sub-block group */
      page_count&= ~START_EXTENT_BIT;
      start_block->sub_blocks= (uint) (block - start_block);
      start_block= block;
    }
    block->page= page_korr(extent_info);
    block->page_count= page_count;
    block->sub_blocks= 0;
    if (block->page_count == 0)
    {
      /* Extend allocated but not used by write_block_record() */
      DBUG_ASSERT(block->page == 0);
      /* This is the last block */
      blocks->count= i;
      break;
    }
    /* A tail extent covers exactly one page regardless of stored count */
    if ((tail_page= page_count & TAIL_BIT))
      page_count= 1;

    /* Check if wrong data */
    if (block->page == 0 || page_count == 0 ||
        (block->page + page_count) * share->block_size >
        share->state.state.data_file_length)
    {
      DBUG_PRINT("error", ("page: %lu page_count: %u tail: %u length: %ld data_length: %ld",
                           (ulong) block->page,
                           (block->page_count & ~TAIL_BIT),
                           (uint) MY_TEST(block->page_count & TAIL_BIT),
                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
                                    share->block_size),
                           (ulong) share->state.state.data_file_length));
      DBUG_RETURN(1);
    }
    if (tail_page)
    {
      /* Tail page: remember its current bitmap bits so they can be reused */
      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
                                                        block->page);
      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
                    BLOCKUSED_USE_ORG_BITMAP);
    }
    else
    {
      /* Blob pages: mark the whole range as full in the bitmap */
      my_bool res;
      mysql_mutex_lock(&share->bitmap.bitmap_lock);
      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
                                         block->page, page_count);
      mysql_mutex_unlock(&share->bitmap.bitmap_lock);
      if (res)
        DBUG_RETURN(1);
      block->used= BLOCKUSED_USED;
    }
  }
  /* Close the last sub-block group */
  start_block->sub_blocks= (uint) (block - start_block);
  DBUG_RETURN(0);
}
2435
2436
2437 /*
2438 Free regions of pages with logging
2439
2440 NOTES
2441 We are removing filler events and tail page events from
2442 row->extents to get smaller log.
2443
2444 RETURN
2445 0 ok
2446 1 error
2447 */
2448
static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
{
  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
  LSN lsn;
  size_t extents_length;
  uchar *extents= row->extents;
  DBUG_ENTER("free_full_pages");

  if (info->s->now_transactional)
  {
    /* Compact extents by removing filler and tail entries before logging */
    uchar *new_block= 0;
    uchar *end, *to, *compact_extent_info;
    my_bool res, buff_alloced;
    uint extents_count;

    alloc_on_stack(*info->stack_end_ptr, compact_extent_info, buff_alloced,
                   row->extents_count * ROW_EXTENT_SIZE);
    if (!compact_extent_info)
      DBUG_RETURN(1);

    to= compact_extent_info;
    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
         extents < end ;
         extents+= ROW_EXTENT_SIZE)
    {
      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
      page_count&= ~START_EXTENT_BIT;
      if (! (page_count & TAIL_BIT) && page_count != 0)
      {
        /* Found correct extent; keep collecting the contiguous run */
        if (!new_block)
          new_block= extents;                   /* First extent in range */
        continue;
      }
      /* Found extent to remove, copy everything found so far */
      if (new_block)
      {
        size_t length= (size_t) (extents - new_block);
        memcpy(to, new_block, length);
        to+= length;
        new_block= 0;
      }
    }
    /* Copy the trailing run of kept extents, if any */
    if (new_block)
    {
      size_t length= (size_t) (extents - new_block);
      memcpy(to, new_block, length);
      to+= length;
    }

    if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
    {
      /*
        No ranges. This happens in the rare case when we have allocated a
        place for a blob on a tail page but it did fit into the main page.
      */
      stack_alloc_free(compact_extent_info, buff_alloced);
      DBUG_RETURN(0);
    }
    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= compact_extent_info;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
                               info,
                               (translog_size_t) (sizeof(log_data) +
                                                  extents_length),
                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
                               log_data, NULL);
    stack_alloc_free(compact_extent_info, buff_alloced);
    if (res)
      DBUG_RETURN(1);
  }

  /* Free the pages in the bitmap; uses the original (uncompacted) extents */
  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
                                         row->extents_count));
}
2530
2531
2532 /*
2533 Free one page range
2534
2535 NOTES
2536 This is very similar to free_full_pages()
2537
2538 RETURN
2539 0 ok
2540 1 error
2541 */
2542
static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
                                    uint count)
{
  my_bool res= 0;
  uint delete_count;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("free_full_page_range");

  delete_count= count;
  if (share->state.state.data_file_length ==
      (page + count) * share->block_size)
  {
    /*
      Don't delete last page from pagecache as this will make the file
      shorter than expected if the last operation extended the file
    */
    delete_count--;
  }
  if (delete_count &&
      pagecache_delete_pages(share->pagecache, &info->dfile,
                             page, delete_count, PAGECACHE_LOCK_WRITE, 1))
    res= 1;

  if (share->now_transactional)
  {
    /* Log the free as a single-range LOGREC_REDO_FREE_BLOCKS record */
    LSN lsn;
    /** @todo unify log_data's shape with delete_head_or_tail() */
    uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
                   ROW_EXTENT_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    DBUG_ASSERT(info->trn->rec_lsn);
    pagerange_store(log_data + FILEID_STORE_SIZE, 1);
    page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
               page);
    int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
              PAGE_STORE_SIZE, count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);

    if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
                              info->trn, info,
                              (translog_size_t) sizeof(log_data),
                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
                              log_data, NULL))
      res= 1;
  }
  /* Mark the whole range as free in the bitmap (under the bitmap lock) */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
    res= 1;
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  DBUG_RETURN(res);
}
2595
2596
2597 /**
2598 @brief Write a record to a (set of) pages
2599
2600 @fn write_block_record()
2601 @param info Maria handler
2602 @param old_record Original record in case of update; NULL in case of
2603 insert
2604 @param record Record we should write
2605 @param row Statistics about record (calculated by
2606 calc_record_size())
2607 @param bitmap_blocks On which pages the record should be stored
2608 @param head_block_is_read 1 if head block existed. 0 if new block.
2609 @param row_pos Position on head page where to put head part of
2610 record
2611 @param undo_lsn <> LSN_ERROR if we are executing an UNDO
2612 @param old_record_checksum Checksum of old_record: ignored if table does
2613 not have live checksum; otherwise if
2614 old_record==NULL it must be 0.
2615
2616 @note
2617 On return all pinned pages are released.
2618
2619 [page_buff + EMPTY_SPACE_OFFSET] is set to
2620 row_pos->empty_space - head_length
2621
2622 @return Operation status
2623 @retval 0 OK
2624 @retval 1 Error
2625 */
2626
write_block_record(MARIA_HA * info,const uchar * old_record,const uchar * record,MARIA_ROW * row,MARIA_BITMAP_BLOCKS * bitmap_blocks,my_bool head_block_is_read,struct st_row_pos_info * row_pos,LSN undo_lsn,ha_checksum old_record_checksum)2627 static my_bool write_block_record(MARIA_HA *info,
2628 const uchar *old_record,
2629 const uchar *record,
2630 MARIA_ROW *row,
2631 MARIA_BITMAP_BLOCKS *bitmap_blocks,
2632 my_bool head_block_is_read,
2633 struct st_row_pos_info *row_pos,
2634 LSN undo_lsn,
2635 ha_checksum old_record_checksum)
2636 {
2637 uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
2638 uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part);
2639 uchar *field_length_data;
2640 uchar *page_buff;
2641 MARIA_BITMAP_BLOCK *block, *head_block;
2642 MARIA_SHARE *share= info->s;
2643 MARIA_COLUMNDEF *column, *end_column;
2644 MARIA_PINNED_PAGE page_link;
2645 uint block_size, flag, head_length;
2646 ulong *blob_lengths;
2647 my_bool row_extents_in_use, blob_full_pages_exists;
2648 LSN lsn;
2649 my_off_t position;
2650 uint save_my_errno;
2651 myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
2652 DBUG_ENTER("write_block_record");
2653
2654 head_block= bitmap_blocks->block;
2655 block_size= share->block_size;
2656
2657 page_buff= row_pos->buff;
2658 /* Position on head page where we should store the head part */
2659 data= row_pos->data;
2660 end_of_data= data + row_pos->length;
2661
2662 /* Write header */
2663 flag= info->row_flag;
2664 row_extents_in_use= 0;
2665 if (unlikely(row->total_length > row_pos->length))
2666 {
2667 /* Need extent */
2668 DBUG_ASSERT(bitmap_blocks->count > 1);
2669 if (bitmap_blocks->count <= 1)
2670 goto crashed; /* Wrong in bitmap */
2671 flag|= ROW_FLAG_EXTENTS;
2672 row_extents_in_use= 1;
2673 }
2674 /* For now we have only a minimum header */
2675 *data++= (uchar) flag;
2676 if (flag & ROW_FLAG_TRANSID)
2677 {
2678 transid_store(data, info->trn->trid);
2679 data+= TRANSID_SIZE;
2680 }
2681
2682 if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
2683 *data++= (uchar) (share->base.null_bytes -
2684 share->base.original_null_bytes);
2685 if (row_extents_in_use)
2686 {
2687 /* Store first extent in header */
2688 store_key_length_inc(data, bitmap_blocks->count - 1);
2689 row_extents_first_part= data;
2690 data+= ROW_EXTENT_SIZE;
2691 }
2692 if (share->base.max_field_lengths)
2693 store_key_length_inc(data, row->field_lengths_length);
2694 if (share->calc_checksum)
2695 {
2696 *(data++)= (uchar) (row->checksum); /* store least significant byte */
2697 DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
2698 }
2699 memcpy(data, record, share->base.null_bytes);
2700 data+= share->base.null_bytes;
2701 memcpy(data, row->empty_bits, share->base.pack_bytes);
2702 data+= share->base.pack_bytes;
2703
2704 DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2705 (uint) (data - row_pos->data) == row->min_length);
2706
2707 /*
2708 Allocate a buffer of rest of data (except blobs)
2709
2710 To avoid double copying of data, we copy as many columns that fits into
2711 the page. The rest goes into info->packed_row.
2712
2713 Using an extra buffer, instead of doing continuous writes to different
2714 pages, uses less code and we don't need to have to do a complex call
2715 for every data segment we want to store.
2716 */
2717 if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
2718 row->head_length, myflag))
2719 DBUG_RETURN(1);
2720
2721 tmp_data_used= 0; /* Either 0 or last used uchar in 'data' */
2722 tmp_data= data;
2723
2724 if (row_extents_in_use)
2725 {
2726 uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
2727 if (!tmp_data_used && tmp_data + copy_length > end_of_data)
2728 {
2729 tmp_data_used= tmp_data;
2730 tmp_data= info->rec_buff;
2731 }
2732 row_extents_second_part= tmp_data;
2733 /*
2734 We will copy the extents here when we have figured out the tail
2735 positions.
2736 */
2737 tmp_data+= copy_length;
2738 }
2739
2740 /* Copy fields that has fixed lengths (primary key etc) */
2741 for (column= share->columndef,
2742 end_column= column + share->base.fixed_not_null_fields;
2743 column < end_column; column++)
2744 {
2745 if (!tmp_data_used && tmp_data + column->length > end_of_data)
2746 {
2747 tmp_data_used= tmp_data;
2748 tmp_data= info->rec_buff;
2749 }
2750 memcpy(tmp_data, record + column->offset, column->length);
2751 tmp_data+= column->length;
2752 }
2753
2754 /* Copy length of data for variable length fields */
2755 if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
2756 {
2757 tmp_data_used= tmp_data;
2758 tmp_data= info->rec_buff;
2759 }
2760 field_length_data= row->field_lengths;
2761 memcpy(tmp_data, field_length_data, row->field_lengths_length);
2762 tmp_data+= row->field_lengths_length;
2763
2764 DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2765 (uint) (tmp_data - row_pos->data) == row->min_length +
2766 share->base.fixed_not_null_fields_length +
2767 row->field_lengths_length);
2768
2769 /* Copy variable length fields and fields with null/zero */
2770 for (end_column= share->columndef + share->base.fields - share->base.blobs;
2771 column < end_column ;
2772 column++)
2773 {
2774 const uchar *field_pos;
2775 ulong length;
2776 if ((record[column->null_pos] & column->null_bit) ||
2777 (row->empty_bits[column->empty_pos] & column->empty_bit))
2778 continue;
2779
2780 field_pos= record + column->offset;
2781 switch (column->type) {
2782 case FIELD_NORMAL: /* Fixed length field */
2783 case FIELD_SKIP_PRESPACE:
2784 case FIELD_SKIP_ZERO: /* Fixed length field */
2785 length= column->length;
2786 break;
2787 case FIELD_SKIP_ENDSPACE: /* CHAR */
2788 /* Char that is space filled */
2789 if (column->length <= 255)
2790 length= (uint) (uchar) *field_length_data++;
2791 else
2792 {
2793 length= uint2korr(field_length_data);
2794 field_length_data+= 2;
2795 }
2796 break;
2797 case FIELD_VARCHAR:
2798 if (column->length <= 256)
2799 {
2800 length= (uint) (uchar) *field_length_data++;
2801 field_pos++; /* Skip length uchar */
2802 }
2803 else
2804 {
2805 length= uint2korr(field_length_data);
2806 field_length_data+= 2;
2807 field_pos+= 2;
2808 }
2809 DBUG_ASSERT(length <= column->length);
2810 break;
2811 default: /* Wrong data */
2812 DBUG_ASSERT(!maria_assert_if_crashed_table);
2813 length=0;
2814 break;
2815 }
2816 if (!tmp_data_used && tmp_data + length > end_of_data)
2817 {
2818 /* Data didn't fit in page; Change to use tmp buffer */
2819 tmp_data_used= tmp_data;
2820 tmp_data= info->rec_buff;
2821 }
2822 memcpy((char*) tmp_data, field_pos, length);
2823 tmp_data+= length;
2824 }
2825
2826 block= head_block + head_block->sub_blocks; /* Point to first blob data */
2827
2828 end_column= column + share->base.blobs;
2829 blob_lengths= row->blob_lengths;
2830 if (!tmp_data_used)
2831 {
2832 /* Still room on page; Copy as many blobs we can into this page */
2833 data= tmp_data;
2834 for (; column < end_column &&
2835 *blob_lengths <= (ulong)(end_of_data - data);
2836 column++, blob_lengths++)
2837 {
2838 uchar *tmp_pos;
2839 uint length;
2840 if (!*blob_lengths) /* Null or "" */
2841 continue;
2842 length= column->length - portable_sizeof_char_ptr;
2843 memcpy(&tmp_pos, record + column->offset + length, sizeof(char*));
2844 memcpy(data, tmp_pos, *blob_lengths);
2845 data+= *blob_lengths;
2846 /*
2847 The following is not true when we want to insert data into original
2848 place. In this case we don't have any extra blocks allocated
2849 */
2850 if (likely(undo_lsn == LSN_ERROR))
2851 {
2852 /* Skip over tail page that was prepared for storing blob */
2853 block++;
2854 bitmap_blocks->tail_page_skipped= 1;
2855 }
2856 }
2857 if (head_block->sub_blocks > 1)
2858 {
2859 /* We have allocated pages that where not used */
2860 bitmap_blocks->page_skipped= 1;
2861 }
2862 }
2863 else
2864 data= tmp_data_used; /* Get last used on page */
2865
2866 /* Update page directory */
2867 head_length= (uint) (data - row_pos->data);
2868 DBUG_PRINT("info", ("Used head length on page: %u header_length: %u",
2869 head_length,
2870 (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
2871 if (head_length < share->base.min_block_length)
2872 {
2873 /* Extend row to be of size min_block_length */
2874 uint diff_length= share->base.min_block_length - head_length;
2875 bzero(data, diff_length);
2876 data+= diff_length;
2877 head_length= share->base.min_block_length;
2878 }
2879 DBUG_ASSERT(data <= end_of_data);
2880 /*
2881 If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
2882 written exactly head_length bytes (same as original record).
2883 */
2884 DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
2885 int2store(row_pos->dir + 2, head_length);
2886 /* update empty space at start of block */
2887 row_pos->empty_space-= head_length;
2888 int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
2889 /* Mark in bitmaps how the current page was actually used */
2890 head_block->empty_space= row_pos->empty_space;
2891 if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
2892 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
2893 head_block->empty_space= 0; /* Page is full */
2894 head_block->used|= BLOCKUSED_USED;
2895
2896 check_directory(share,
2897 page_buff, share->block_size, share->base.min_block_length,
2898 (uint) -1);
2899
2900 /*
2901 Now we have to write tail pages, as we need to store the position
2902 to them in the row extent header.
2903
2904 We first write out all blob tails, to be able to store them in
2905 the current page or 'tmp_data'.
2906
2907 Then we write the tail of the non-blob fields (The position to the
2908 tail page is stored either in row header, the extents in the head
2909 page or in the first full page of the non-blob data. It's never in
2910 the tail page of the non-blob data)
2911 */
2912
2913 blob_full_pages_exists= 0;
2914 if (row_extents_in_use)
2915 {
2916 if (column != end_column) /* If blob fields */
2917 {
2918 MARIA_COLUMNDEF *save_column= column;
2919 MARIA_BITMAP_BLOCK *save_block= block;
2920 MARIA_BITMAP_BLOCK *end_block;
2921 ulong *save_blob_lengths= blob_lengths;
2922
2923 for (; column < end_column; column++, blob_lengths++)
2924 {
2925 uchar *blob_pos;
2926 if (!*blob_lengths) /* Null or "" */
2927 continue;
2928 if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
2929 {
2930 uint length;
2931 length= column->length - portable_sizeof_char_ptr;
2932 memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
2933 length= *blob_lengths % FULL_PAGE_SIZE(share); /* tail size */
2934 if (length != *blob_lengths)
2935 blob_full_pages_exists= 1;
2936 if (write_tail(info, block + block->sub_blocks-1,
2937 blob_pos + *blob_lengths - length,
2938 length))
2939 goto disk_err;
2940 }
2941 else
2942 blob_full_pages_exists= 1;
2943
2944 for (end_block= block + block->sub_blocks; block < end_block; block++)
2945 {
2946 /*
2947 Set only a bit, to not cause bitmap code to believe a block is full
2948 when there is still a lot of entries in it.
2949 */
2950 block->used|= BLOCKUSED_USED;
2951 }
2952 }
2953 DBUG_ASSERT((undo_lsn == LSN_ERROR ||
2954 block == bitmap_blocks->block + bitmap_blocks->count));
2955 column= save_column;
2956 block= save_block;
2957 blob_lengths= save_blob_lengths;
2958 }
2959
2960 if (tmp_data_used) /* non blob data overflows */
2961 {
2962 MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
2963 MARIA_BITMAP_BLOCK *head_tail_block= 0;
2964 ulong length;
2965 ulong data_length= (ulong) (tmp_data - info->rec_buff);
2966
2967 #ifdef SANITY_CHECKS
2968 DBUG_ASSERT(head_block->sub_blocks != 1);
2969 if (head_block->sub_blocks == 1)
2970 goto crashed; /* no reserved full or tails */
2971 #endif
2972 /*
2973 Find out where to write tail for non-blob fields.
2974
2975 Problem here is that the bitmap code may have allocated more
2976 space than we need. We have to handle the following cases:
2977
2978 - Bitmap code allocated a tail page we don't need.
2979 - The last full page allocated needs to be changed to a tail page
2980 (Because we where able to put more data on the head page than
2981 the bitmap allocation assumed)
2982
2983 The reserved pages in bitmap_blocks for the main page has one of
2984 the following allocations:
2985 - Full pages, with following blocks:
2986 # * full pages
2987 empty page ; To be used if we change last full to tail page. This
2988 has 'count' = 0.
2989 tail page (optional, if last full page was part full)
2990 - One tail page
2991 */
2992
2993 cur_block= head_block + 1;
2994 end_block= head_block + head_block->sub_blocks;
2995 /*
2996 Loop until we have find a block bigger than we need or
2997 we find the empty page block.
2998 */
2999 while (data_length >= (length= (cur_block->page_count *
3000 FULL_PAGE_SIZE(share))) &&
3001 cur_block->page_count)
3002 {
3003 #ifdef SANITY_CHECKS
3004 DBUG_ASSERT(!((cur_block == end_block) ||
3005 (cur_block->used & BLOCKUSED_USED)));
3006 if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
3007 goto crashed;
3008 #endif
3009 data_length-= length;
3010 (cur_block++)->used|= BLOCKUSED_USED;
3011 }
3012 last_head_block= cur_block;
3013 if (data_length)
3014 {
3015 if (cur_block->page_count == 0)
3016 {
3017 /* Skip empty filler block */
3018 cur_block++;
3019 }
3020 #ifdef SANITY_CHECKS
3021 DBUG_ASSERT(!(cur_block >= end_block));
3022 if ((cur_block >= end_block))
3023 goto crashed;
3024 #endif
3025 if (cur_block->used & BLOCKUSED_TAIL)
3026 {
3027 DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
3028 /* tail written to tail page */
3029 cur_block->used|= BLOCKUSED_USED;
3030 head_tail_block= cur_block;
3031 }
3032 else if (data_length > length - MAX_TAIL_SIZE(block_size))
3033 {
3034 /* tail written to full page */
3035 cur_block->used|= BLOCKUSED_USED;
3036 if ((cur_block != end_block - 1) &&
3037 (end_block[-1].used & BLOCKUSED_TAIL))
3038 bitmap_blocks->tail_page_skipped= 1;
3039 }
3040 else
3041 {
3042 /*
3043 cur_block is a full block, followed by an empty and optional
3044 tail block. Change cur_block to a tail block or split it
3045 into full blocks and tail blocks.
3046
3047 TODO:
3048 If there is enough space on the following tail block, use
3049 this instead of creating a new tail block.
3050 */
3051 DBUG_ASSERT(cur_block[1].page_count == 0);
3052 if (cur_block->page_count == 1)
3053 {
3054 /* convert full block to tail block */
3055 cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
3056 head_tail_block= cur_block;
3057 }
3058 else
3059 {
3060 DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share));
3061 DBUG_PRINT("info", ("Splitting blocks into full and tail"));
3062 cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
3063 cur_block[1].page_count= 1; /* Avoid DBUG_ASSERT */
3064 cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
3065 cur_block->page_count--;
3066 cur_block->used|= BLOCKUSED_USED;
3067 last_head_block= head_tail_block= cur_block+1;
3068 }
3069 if (end_block[-1].used & BLOCKUSED_TAIL)
3070 bitmap_blocks->tail_page_skipped= 1;
3071 }
3072 }
3073 else
3074 {
3075 /* Must be an empty or tail page */
3076 DBUG_ASSERT(cur_block->page_count == 0 ||
3077 cur_block->used & BLOCKUSED_TAIL);
3078 if (end_block[-1].used & BLOCKUSED_TAIL)
3079 bitmap_blocks->tail_page_skipped= 1;
3080 }
3081
3082 /*
3083 Write all extents into page or tmp_data
3084
3085 Note that we still don't have a correct position for the tail
3086 of the non-blob fields.
3087 */
3088 store_extent_info(row_extents_first_part,
3089 row_extents_second_part,
3090 head_block+1, bitmap_blocks->count - 1);
3091 if (head_tail_block)
3092 {
3093 ulong block_length= (ulong) (tmp_data - info->rec_buff);
3094 uchar *extent_data;
3095
3096 length= (uint) (block_length % FULL_PAGE_SIZE(share));
3097 if (write_tail(info, head_tail_block,
3098 info->rec_buff + block_length - length,
3099 length))
3100 goto disk_err;
3101 tmp_data-= length; /* Remove the tail */
3102 if (tmp_data == info->rec_buff)
3103 {
3104 /* We have no full blocks to write for the head part */
3105 tmp_data_used= 0;
3106 }
3107
3108 /* Store the tail position for the non-blob fields */
3109 if (head_tail_block == head_block + 1)
3110 {
3111 /*
3112 We had a head block + tail block, which means that the
3113 tail block is the first extent
3114 */
3115 extent_data= row_extents_first_part;
3116 }
3117 else
3118 {
3119 /*
3120 We have a head block + some full blocks + tail block
3121 last_head_block is pointing after the last used extent
3122 for the head block.
3123 */
3124 extent_data= row_extents_second_part +
3125 ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
3126 }
3127 /* Write information for tail block in the reserved space */
3128 page_store(extent_data, head_tail_block->page);
3129 pagerange_store(extent_data + PAGE_STORE_SIZE,
3130 head_tail_block->page_count);
3131 }
3132 }
3133 else
3134 store_extent_info(row_extents_first_part,
3135 row_extents_second_part,
3136 head_block+1, bitmap_blocks->count - 1);
3137 }
3138
3139 if (share->now_transactional)
3140 {
3141 uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
3142 LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
3143
3144 /* Log REDO changes of head page */
3145 page_store(log_data + FILEID_STORE_SIZE, head_block->page);
3146 dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
3147 row_pos->rownr);
3148 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3149 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
3150 log_array[TRANSLOG_INTERNAL_PARTS + 1].str= row_pos->data;
3151 log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
3152 if (translog_write_record(&lsn,
3153 head_block_is_read ?
3154 LOGREC_REDO_INSERT_ROW_HEAD :
3155 LOGREC_REDO_NEW_ROW_HEAD,
3156 info->trn,
3157 info,
3158 (translog_size_t) (sizeof(log_data) +
3159 head_length),
3160 TRANSLOG_INTERNAL_PARTS + 2, log_array,
3161 log_data, NULL))
3162 goto disk_err;
3163 }
3164
3165 #ifdef RECOVERY_EXTRA_DEBUG
3166 if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
3167 {
3168 /* Stop right after the REDO; testing incomplete log record groups */
3169 DBUG_EXECUTE_IF("maria_flush_whole_log",
3170 {
3171 DBUG_PRINT("maria_flush_whole_log", ("now"));
3172 translog_flush(translog_get_horizon());
3173 });
3174 DBUG_EXECUTE_IF("maria_crash",
3175 { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
3176 }
3177 #endif
3178
3179 if (head_block_is_read)
3180 {
3181 MARIA_PINNED_PAGE *page_link;
3182 /* Head page is always the first pinned page */
3183 page_link= dynamic_element(&info->pinned_pages, 0,
3184 MARIA_PINNED_PAGE*);
3185 pagecache_unlock_by_link(share->pagecache, page_link->link,
3186 PAGECACHE_LOCK_WRITE_TO_READ,
3187 PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
3188 LSN_IMPOSSIBLE, 1, FALSE);
3189 page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
3190 page_link->changed= 1;
3191 }
3192 else
3193 {
3194 if (pagecache_write(share->pagecache,
3195 &info->dfile, head_block->page, 0,
3196 page_buff, share->page_type,
3197 head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
3198 PAGECACHE_LOCK_READ,
3199 head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
3200 PAGECACHE_PIN,
3201 PAGECACHE_WRITE_DELAY, &page_link.link,
3202 LSN_IMPOSSIBLE))
3203 goto disk_err;
3204 DBUG_ASSERT(page_link.link);
3205 page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
3206 page_link.changed= 1;
3207 push_dynamic(&info->pinned_pages, (void*) &page_link);
3208
3209 /* Increase data file size, if extended */
3210 position= (my_off_t) head_block->page * block_size;
3211 if (share->state.state.data_file_length <= position)
3212 _ma_set_share_data_file_length(share, position + block_size);
3213 }
3214
3215 if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
3216 {
3217 /*
3218 Log REDO writes for all full pages (head part and all blobs)
3219 We write all here to be able to generate the UNDO record early
3220 so that we can write the LSN for the UNDO record to all full pages.
3221 */
3222 uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3223 (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
3224 ROW_EXTENTS_ON_STACK];
3225 uchar *log_data, *log_pos;
3226 LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
3227 ROW_EXTENTS_ON_STACK];
3228 LEX_CUSTRING *log_array_pos, *log_array;
3229 int error;
3230 translog_size_t log_entry_length= 0;
3231 uint ext_length, extents= 0, sub_extents= 0;
3232
3233 /* If few extents, then allocate things on stack to avoid a malloc call */
3234 if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
3235 {
3236 log_array= tmp_log_array;
3237 log_data= tmp_log_data;
3238 }
3239 else
3240 {
3241 if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME), &log_array,
3242 (uint) ((bitmap_blocks->count +
3243 TRANSLOG_INTERNAL_PARTS + 2) *
3244 sizeof(*log_array)),
3245 &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3246 bitmap_blocks->count * (ROW_EXTENT_SIZE +
3247 BLOCK_FILLER_SIZE +
3248 SUB_RANGE_SIZE),
3249 NullS))
3250 goto disk_err;
3251 }
3252 log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
3253 log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
3254
3255 if (tmp_data_used)
3256 {
3257 /* Full head page */
3258 translog_size_t block_length= (translog_size_t) (tmp_data -
3259 info->rec_buff);
3260 log_pos= store_page_range(share,
3261 log_pos, head_block+1,
3262 (ulong) block_length, &extents);
3263 log_array_pos->str= info->rec_buff;
3264 log_array_pos->length= block_length;
3265 log_entry_length+= block_length;
3266 log_array_pos++;
3267 sub_extents++;
3268 }
3269 if (blob_full_pages_exists)
3270 {
3271 MARIA_COLUMNDEF *tmp_column= column;
3272 ulong *tmp_blob_lengths= blob_lengths;
3273 MARIA_BITMAP_BLOCK *tmp_block= block;
3274
3275 /* Full blob pages */
3276 for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
3277 {
3278 ulong blob_length;
3279 uint length;
3280
3281 if (!*tmp_blob_lengths) /* Null or "" */
3282 continue;
3283 blob_length= *tmp_blob_lengths;
3284 length= tmp_column->length - portable_sizeof_char_ptr;
3285 /*
          If the last part of the blob was on a tail page, change blob_length
          to reflect this
3287 reflect this
3288 */
3289 if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3290 blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3291 if (blob_length)
3292 {
3293 memcpy((void*) &log_array_pos->str,
3294 record + tmp_column->offset + length,
3295 sizeof(uchar*));
3296 log_array_pos->length= blob_length;
3297 log_entry_length+= blob_length;
3298 log_array_pos++;
3299 sub_extents++;
3300
3301 log_pos= store_page_range(share,
3302 log_pos, tmp_block,
3303 blob_length, &extents);
3304 }
3305 tmp_block+= tmp_block->sub_blocks;
3306 }
3307 }
3308
3309 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3310 ext_length= (uint) (log_pos - log_data);
3311 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
3312 pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
3313 pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
3314 sub_extents);
3315
3316 log_entry_length+= ext_length;
3317 /* trn->rec_lsn is already set earlier in this function */
3318 error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
3319 info->trn, info, log_entry_length,
3320 (uint) (log_array_pos - log_array),
3321 log_array, log_data, NULL);
3322 if (log_array != tmp_log_array)
3323 my_free(log_array);
3324 if (error)
3325 goto disk_err;
3326 }
3327
3328 /* Write UNDO or CLR record */
3329 lsn= LSN_IMPOSSIBLE;
3330 if (share->now_transactional)
3331 {
3332 LEX_CUSTRING *log_array= info->log_row_parts;
3333
3334 if (undo_lsn != LSN_ERROR)
3335 {
3336 /*
3337 Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
3338 in the first case, Recovery, when it sees the CLR_END in the
3339 REDO phase, may decrement the records' count.
3340 */
3341 if (_ma_write_clr(info, undo_lsn,
3342 old_record ? LOGREC_UNDO_ROW_UPDATE :
3343 LOGREC_UNDO_ROW_DELETE,
3344 share->calc_checksum != 0,
3345 row->checksum - old_record_checksum,
3346 &lsn, (void*) 0))
3347 goto disk_err;
3348 }
3349 else
3350 {
3351 uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
3352 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
3353 HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
3354 ROW_EXTENT_SIZE];
3355 uchar *log_pos;
3356 ha_checksum checksum_delta;
3357
3358 /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
3359 lsn_store(log_data, info->trn->undo_lsn);
3360 page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
3361 head_block->page);
3362 dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3363 PAGE_STORE_SIZE,
3364 row_pos->rownr);
3365 log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3366 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
3367 store_checksum_in_rec(share, checksum_delta,
3368 row->checksum - old_record_checksum,
3369 log_pos, log_pos);
3370 compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
3371
3372 log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
3373 log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
3374 log_data);
3375
3376 if (!old_record)
3377 {
3378 /* Store undo_lsn in case we are aborting the insert */
3379 row->orig_undo_lsn= info->trn->undo_lsn;
3380 /* Write UNDO log record for the INSERT */
3381 if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
3382 info->trn, info,
3383 (translog_size_t)
3384 log_array[TRANSLOG_INTERNAL_PARTS +
3385 0].length,
3386 TRANSLOG_INTERNAL_PARTS + 1,
3387 log_array,
3388 log_data + LSN_STORE_SIZE, &checksum_delta))
3389 goto disk_err;
3390 }
3391 else
3392 {
3393 /* Write UNDO log record for the UPDATE */
3394 size_t row_length, extents_length;
3395 uint row_parts_count, cur_head_length;
3396
3397 /*
3398 Write head length and extents of the original row so that we
3399 during UNDO can put it back in the original position.
3400 We don't store size for TRANSID, as we don't write this during
3401 UNDO.
3402 */
3403 cur_head_length= (info->cur_row.head_length -
3404 info->cur_row.header_length);
3405 int2store(log_pos, cur_head_length);
3406 pagerange_store(log_pos + 2, info->cur_row.extents_count);
3407 log_pos+= 2 + PAGERANGE_STORE_SIZE;
3408 log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
3409 PAGERANGE_STORE_SIZE);
3410 info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
3411 info->cur_row.extents;
3412 info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
3413 extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
3414
3415 row_length= fill_update_undo_parts(info, old_record, record,
3416 log_array +
3417 TRANSLOG_INTERNAL_PARTS + 2,
3418 &row_parts_count);
3419 if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
3420 info,
3421 (translog_size_t)
3422 (log_array[TRANSLOG_INTERNAL_PARTS +
3423 0].length + extents_length +
3424 row_length),
3425 TRANSLOG_INTERNAL_PARTS + 2 +
3426 row_parts_count,
3427 log_array,
3428 log_data + LSN_STORE_SIZE,
3429 &checksum_delta))
3430 goto disk_err;
3431 }
3432 }
3433 }
3434 /* Release not used space in used pages */
3435 if (_ma_bitmap_release_unused(info, bitmap_blocks))
3436 goto disk_err;
3437 _ma_unpin_all_pages(info, lsn);
3438
3439 if (tmp_data_used)
3440 {
3441 /*
3442 Write data stored in info->rec_buff to pages
3443 This is the char/varchar data that didn't fit into the head page.
3444 */
3445 DBUG_ASSERT(bitmap_blocks->count != 0);
3446 if (write_full_pages(info, lsn, head_block + 1,
3447 info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
3448 goto disk_err;
3449 }
3450
3451 /* Write rest of blobs (data, but no tails as they are already written) */
3452 for (; column < end_column; column++, blob_lengths++)
3453 {
3454 uchar *blob_pos;
3455 uint length;
3456 ulong blob_length;
3457 if (!*blob_lengths) /* Null or "" */
3458 continue;
3459 length= column->length - portable_sizeof_char_ptr;
3460 memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
3461 /* remove tail part */
3462 blob_length= *blob_lengths;
3463 if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3464 blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3465
3466 if (blob_length && write_full_pages(info, lsn, block,
3467 blob_pos, blob_length))
3468 goto disk_err;
3469 block+= block->sub_blocks;
3470 }
3471
3472 _ma_finalize_row(info);
3473 DBUG_RETURN(0);
3474
3475 crashed:
3476 DBUG_ASSERT(!maria_assert_if_crashed_table);
3477 /* Something was wrong with data on page */
3478 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3479
3480 disk_err:
3481 /**
3482 @todo RECOVERY we are going to let dirty pages go to disk while we have
3483 logged UNDO, this violates WAL. We must mark the table corrupted!
3484
3485 @todo RECOVERY we have written some REDOs without a closing UNDO,
3486 it's possible that a next operation by this transaction succeeds and then
3487 Recovery would glue the "orphan REDOs" to the succeeded operation and
3488 execute the failed REDOs. We need some mark "abort this group" in the
3489 log, or mark the table corrupted (then user will repair it and thus REDOs
3490 will be skipped).
3491
3492 @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
     should take a MARIA_HA* in argument, and if it
3494 fails when flushing a page to disk it should call
3495 (*the_maria_ha->write_error_func)(the_maria_ha)
3496 and this hook will mark the table corrupted.
3497 Maybe hook should be stored in the pagecache's block structure, or in a
3498 hash "file->maria_ha*".
3499
3500 @todo RECOVERY we should distinguish below between log write error and
3501 table write error. The former should stop Maria immediately, the latter
3502 should mark the table corrupted.
3503 */
3504 /*
3505 Unpin all pinned pages to not cause problems for disk cache. This is
3506 safe to call even if we already called _ma_unpin_all_pages() above.
3507 */
3508 save_my_errno= my_errno;
3509 _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3510 my_errno= save_my_errno;
3511 DBUG_RETURN(1);
3512 }
3513
3514
3515 /*
3516 @brief Write a record
3517
3518 @fn allocate_and_write_block_record()
3519 @param info Maria handler
3520 @param record Record to write
3521 @param row Information about fields in 'record'
3522 @param undo_lsn <> LSN_ERROR if we are executing an UNDO
3523
3524 @return
3525 @retval 0 ok
3526 @retval 1 Error
3527 */
3528
static my_bool allocate_and_write_block_record(MARIA_HA *info,
                                               const uchar *record,
                                               MARIA_ROW *row,
                                               LSN undo_lsn)
{
  struct st_row_pos_info row_pos;
  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
  int save_my_errno;
  DBUG_ENTER("allocate_and_write_block_record");

  /*
    Mark the handler non-flushable: the bitmap may be over-allocated until
    write_block_record() has released unused pages, so a checkpoint must not
    send it to disk in the meantime.
  */
  _ma_bitmap_flushable(info, 1);
  if (_ma_bitmap_find_place(info, row, blocks))
    goto err;                         /* Error reading bitmap */

  /*
    Sleep; a checkpoint will happen and should not send this over-allocated
    bitmap to disk but rather wait.
  */
  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););

  /* page will be pinned & locked by get_head_or_tail_page */
  if (get_head_or_tail_page(info, blocks->block, info->buff,
                            MY_MAX(row->space_on_head_page,
                                   info->s->base.min_block_length),
                            HEAD_PAGE,
                            PAGECACHE_LOCK_WRITE, &row_pos))
    goto err;
  /* The rowid is (head page number, directory entry on that page) */
  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
  if (info->s->calc_checksum)
  {
    if (undo_lsn == LSN_ERROR)
      row->checksum= (info->s->calc_checksum)(info, record);
    else
    {
      /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
      DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
    }
  }
  DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
                      (ulong) ma_recordpos_to_page(row->lastpos),
                      ma_recordpos_to_dir_entry(row->lastpos),
                      row_pos.length));
  /*
    write_block_record() writes head/tail/blob parts, logs REDO/UNDO and
    unpins all pages; on success it also resets the non-flushable state.
  */
  if (write_block_record(info, (uchar*) 0, record, row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
    goto err;
  /* Now let checkpoint happen but don't commit */
  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
  DBUG_RETURN(0);

err:
  /* Preserve the original error; cleanup calls below may change my_errno */
  save_my_errno= my_errno;
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_errno= save_my_errno;
  DBUG_RETURN(1);
}
3587
3588
3589 /*
3590 Write a record and return rowid for it
3591
3592 SYNOPSIS
3593 _ma_write_init_block_record()
3594 info Maria handler
3595 record Record to write
3596
3597 NOTES
3598 This is done BEFORE we write the keys to the row!
3599
3600 RETURN
3601 HA_OFFSET_ERROR Something went wrong
3602 # Rowid for row
3603 */
3604
_ma_write_init_block_record(MARIA_HA * info,const uchar * record)3605 MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
3606 const uchar *record)
3607 {
3608 DBUG_ENTER("_ma_write_init_block_record");
3609
3610 calc_record_size(info, record, &info->cur_row);
3611 if (allocate_and_write_block_record(info, record,
3612 &info->cur_row, LSN_ERROR))
3613 DBUG_RETURN(HA_OFFSET_ERROR);
3614 DBUG_RETURN(info->cur_row.lastpos);
3615 }
3616
3617
3618 /*
3619 Dummy function for (*info->s->write_record)()
3620
3621 Nothing to do here, as we already wrote the record in
3622 _ma_write_init_block_record()
3623 */
3624
my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
                               const uchar *record __attribute__ ((unused)))
{
  /*
    Intentionally a no-op: for the BLOCK_RECORD format the row data was
    already written by _ma_write_init_block_record(), before the keys.
  */
  return 0;                                     /* Row already written */
}
3630
3631
3632 /**
3633 @brief Remove row written by _ma_write_block_record() and log undo
3634
3635 @param info Maria handler
3636
3637 @note
3638 This is called in case we got a duplicate unique key while
3639 writing keys.
3640
3641 @return Operation status
3642 @retval 0 OK
3643 @retval 1 Error
3644 */
3645
_ma_write_abort_block_record(MARIA_HA * info)3646 my_bool _ma_write_abort_block_record(MARIA_HA *info)
3647 {
3648 my_bool res= 0;
3649 MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3650 MARIA_BITMAP_BLOCK *block, *end;
3651 LSN lsn= LSN_IMPOSSIBLE;
3652 MARIA_SHARE *share= info->s;
3653 DBUG_ENTER("_ma_write_abort_block_record");
3654
3655 _ma_bitmap_lock(share); /* Lock bitmap from other insert threads */
3656 if (delete_head_or_tail(info,
3657 ma_recordpos_to_page(info->cur_row.lastpos),
3658 ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
3659 0))
3660 res= 1;
3661 for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
3662 block++)
3663 {
3664 if (block->used & BLOCKUSED_USED)
3665 {
3666 if (block->used & BLOCKUSED_TAIL)
3667 {
3668 /*
3669 block->page_count is set to the tail directory entry number in
3670 write_block_record()
3671 */
3672 if (delete_head_or_tail(info, block->page,
3673 block->page_count & ~TAIL_BIT,
3674 0, 0))
3675 res= 1;
3676 }
3677 else
3678 {
3679 if (free_full_page_range(info, block->page, block->page_count))
3680 res= 1;
3681 }
3682 }
3683 }
3684 _ma_bitmap_unlock(share);
3685 if (share->now_transactional)
3686 {
3687 /*
3688 Write clr to mark end of aborted row insert.
3689 The above delete_head_or_tail() calls will only log redo, not undo.
3690 The undo just before the row insert is stored in row->orig_undo_lsn.
3691
3692 When applying undo's, we can skip all undo records between current
3693 lsn and row->orig_undo_lsn as logically things are as before the
3694 attempted insert.
3695 */
3696 if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
3697 LOGREC_UNDO_ROW_INSERT,
3698 share->calc_checksum != 0,
3699 (ha_checksum) 0 - info->cur_row.checksum,
3700 &lsn, (void*) 0))
3701 res= 1;
3702 }
3703 _ma_unpin_all_pages_and_finalize_row(info, lsn);
3704 DBUG_RETURN(res);
3705 }
3706
3707
3708 /*
3709 Update a record
3710
3711 NOTES
3712 For the moment, we assume that info->curr_row.extents is always updated
3713 when a row is read. In the future we may decide to read this on demand
3714 for rows split into many extents.
3715 */
3716
static my_bool _ma_update_block_record2(MARIA_HA *info,
                                        MARIA_RECORD_POS record_pos,
                                        const uchar *oldrec,
                                        const uchar *record,
                                        LSN undo_lsn)
{
  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
  uchar *buff;
  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
  MARIA_PINNED_PAGE page_link;
  uint rownr, org_empty_size, head_length;
  uint block_size= info->s->block_size;
  uint errpos __attribute__((unused)) = 0;
  uchar *dir;
  pgcache_page_no_t page;
  struct st_row_pos_info row_pos;
  my_bool res;
  ha_checksum old_checksum;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_update_block_record2");
  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));

#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
  DBUG_DUMP("newrec", record, share->base.reclength);
#endif

  /*
    Checksums of new and old rows were computed by callers already; new
    row's was put into cur_row, old row's was put into new_row.
  */
  old_checksum= new_row->checksum;
  new_row->checksum= cur_row->checksum;
  calc_record_size(info, record, new_row);
  page= ma_recordpos_to_page(record_pos);

  /* Bitmap may be temporarily over-allocated; block checkpoint flushes */
  _ma_bitmap_flushable(info, 1);
  /* Read and pin the head page of the row with a write lock */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
  rownr= ma_recordpos_to_dir_entry(record_pos);
  dir= dir_entry_pos(buff, block_size, rownr);

  /*
    We can't use cur_row->head_length as the block may have been compacted
    since we read it.
  */
  head_length= uint2korr(dir + 2);

  if ((org_empty_size + head_length) >= new_row->total_length)
  {
    uint rec_offset, length;
    MARIA_BITMAP_BLOCK block;

    DBUG_PRINT("info", ("org_empty_size: %u org_length: %u new_length: %lu",
                        org_empty_size, head_length,
                        new_row->total_length));

    /*
      We can fit the new row in the same page as the original head part
      of the row
    */
    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
                                                          org_empty_size);
    /* Grow the row's area on the page (may compact the page to do so) */
    if (extend_area_on_page(info, buff, dir, rownr,
                            new_row->total_length, &org_empty_size,
                            &rec_offset, &length, 1))
    {
      errpos= 1;
      goto err;
    }

    row_pos.buff= buff;
    row_pos.rownr= rownr;
    row_pos.empty_space= org_empty_size;
    row_pos.dir= dir;
    row_pos.data= buff + rec_offset;
    row_pos.length= length;
    /* Describe the single (head) block for write_block_record() */
    blocks->block= &block;
    blocks->count= 1;
    block.page= page;
    block.sub_blocks= 1;
    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
    block.empty_space= row_pos.empty_space;

    /* Free the old row's overflow parts (tails and full pages) */
    if (*cur_row->tail_positions &&
        delete_tails(info, cur_row->tail_positions))
    {
      errpos= 2;
      goto err;
    }
    if (cur_row->extents_count && free_full_pages(info, cur_row))
    {
      errpos= 3;
      goto err;
    }
    res= write_block_record(info, oldrec, record, new_row, blocks,
                            1, &row_pos, undo_lsn, old_checksum);
    /* We can't update or delete this without re-reading it again */
    info->update&= ~HA_STATE_AKTIV;
    DBUG_RETURN(res);
  }
  /* Delete old row */
  if (*cur_row->tail_positions &&
      delete_tails(info, cur_row->tail_positions))
  {
    errpos= 4;
    goto err;
  }
  if (cur_row->extents_count && free_full_pages(info, cur_row))
  {
    errpos= 5;
    goto err;
  }

  /* Re-read head length; the calls above may have compacted the page */
  head_length= uint2korr(dir + 2);
  /* Allocate new pages for the row, keeping the head on the same page */
  if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
                                org_empty_size, blocks))
  {
    errpos= 6;
    goto err;
  }

  /*
    Allocate all size in block for record
    TODO:
    Need to improve this to do compact if we can fit one more blob into
    the head page
  */
  if ((head_length < new_row->space_on_head_page ||
       (new_row->total_length <= head_length &&
        org_empty_size + head_length >= new_row->total_length)))
  {
    /* Compact the page so the row area is contiguous and large enough */
    _ma_compact_block_page(share,
                           buff, rownr, 1,
                           info->trn->min_read_from,
                           share->base.min_block_length);
    org_empty_size= 0;
    head_length= uint2korr(dir + 2);
  }

  row_pos.buff= buff;
  row_pos.rownr= rownr;
  row_pos.empty_space= org_empty_size + head_length;
  row_pos.dir= dir;
  row_pos.data= buff + uint2korr(dir);
  row_pos.length= head_length;
  if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
                               &row_pos, undo_lsn, old_checksum)))
  {
    errpos= 7;
    goto err;
  }
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("errpos: %d", errpos));
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  /* Unpin the head page (and anything else pinned) without writing an LSN */
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
3888
3889
/*
  @brief Store new row on its original position

  @note
    This is basically a copy of _ma_update_block_record2
    When we have a purge thread for deleted rows, we can remove this function
    and use _ma_update_block_record2 instead.

    This is the main reason we don't make a lot of subfunctions that are
    common between _ma_update_block_record2() and this function.

  Note: If something goes wrong we mark the file crashed
*/
3903
static my_bool _ma_update_at_original_place(MARIA_HA *info,
                                            pgcache_page_no_t page,
                                            uint rownr,
                                            uint length_on_head_page,
                                            uint extent_count,
                                            const uchar *extent_info,
                                            const uchar *oldrec,
                                            const uchar *record,
                                            LSN undo_lsn)
{
  MARIA_BITMAP_BLOCKS *blocks;
  MARIA_BITMAP_BLOCK *block;
  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum old_checksum;
  uint org_empty_size, empty_size;
  uint block_size= info->s->block_size;
  uchar *dir, *buff;
  struct st_row_pos_info row_pos;
  my_bool res;
  uint rec_offset, length;
  DBUG_ENTER("_ma_update_at_original_place");

#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
  DBUG_DUMP("newrec", record, share->base.reclength);
#endif

  /*
    Checksums of new and old rows were computed by callers already; new
    row's was put into cur_row, old row's was put into new_row.
  */
  old_checksum= new_row->checksum;
  new_row->checksum= cur_row->checksum;
  calc_record_size(info, record, new_row);

  /*
    Enter non-flushable state while we hold modified pages; the err:
    path below undoes this with the matching -1 call.
  */
  _ma_bitmap_flushable(info, 1);
  buff= pagecache_read(share->pagecache,
                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  /* Record the pinned page even on read failure so err: can unpin it */
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, rownr);

  /* The stored head part must fit in what the page can provide */
  if ((org_empty_size + cur_row->head_length) < length_on_head_page)
  {
    DBUG_PRINT("error",
               ("org_empty_size: %u head_length: %u length_on_page: %u",
                org_empty_size, (uint) cur_row->head_length,
                length_on_head_page));
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /*
    We can fit the new row in the same page as the original head part
    of the row
  */
  empty_size= org_empty_size;
  if (extend_area_on_page(info, buff, dir, rownr,
                          length_on_head_page, &empty_size,
                          &rec_offset, &length, 1))
    goto err;

  row_pos.buff= buff;
  row_pos.rownr= rownr;
  row_pos.empty_space= empty_size;
  row_pos.dir= dir;
  row_pos.data= buff + rec_offset;

  /* Delete old row */
  if (*cur_row->tail_positions &&
      delete_tails(info, cur_row->tail_positions))
    goto err;
  if (cur_row->extents_count && free_full_pages(info, cur_row))
    goto err;

  /* Change extent information to be usable by write_block_record() */
  blocks= &cur_row->insert_blocks;
  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
    goto err;
  block= blocks->block;
  block->empty_space= row_pos.empty_space;
  /*
    Remember the bitmap pattern the page had before our change; pattern
    is based on free space only if the page still has free dir entries.
  */
  block->org_bitmap_value=
    _ma_free_size_to_head_pattern(&share->bitmap,
                                  (enough_free_entries_on_page(share, buff) ?
                                   org_empty_size : 0));

  DBUG_ASSERT(block->org_bitmap_value ==
              _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
  block->used|= BLOCKUSED_USE_ORG_BITMAP;

  /*
    We have to use <= below as the new_row may be smaller than the original
    row as the new row doesn't have transaction id
  */

  DBUG_ASSERT(blocks->count > 1 ||
              MY_MAX(new_row->total_length, share->base.min_block_length) <=
              length_on_head_page);

  /* Store same amount of data on head page as on original page */
  row_pos.length= (length_on_head_page -
                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
  set_if_bigger(row_pos.length, share->base.min_block_length);
  if ((res= write_block_record(info, oldrec, record, new_row, blocks,
                               1, &row_pos, undo_lsn, old_checksum)))
    goto err;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_mark_file_crashed(share);
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4029
4030
4031 /* Wrapper for _ma_update_block_record2() used by ma_update() */
4032
_ma_update_block_record(MARIA_HA * info,MARIA_RECORD_POS record_pos,const uchar * orig_rec,const uchar * new_rec)4033 my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
4034 const uchar *orig_rec, const uchar *new_rec)
4035 {
4036 return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
4037 LSN_ERROR);
4038 }
4039
4040
4041 /*
4042 Delete a directory entry
4043
4044 SYNOPSIS
4045 delete_dir_entry()
4046 buff Page buffer
4047 record_number Record number to delete
4048 empty_space Empty space on page after delete
4049
4050 RETURN
4051 -1 Error on page
4052 0 ok
4053 1 Page is now empty
4054 */
4055
static int delete_dir_entry(MARIA_SHARE *share,
                            uchar *buff, uint record_number,
                            uint *empty_space_res)
{
  uint block_size= share->block_size;
  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
  uint length, empty_space;
  uchar *dir;
  DBUG_ENTER("delete_dir_entry");
  DBUG_PRINT("enter", ("record_number: %u number_of_records: %u",
                       record_number, number_of_records));

#ifdef SANITY_CHECKS
  if (record_number >= number_of_records ||
      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
  {
    DBUG_PRINT("error", ("record_number: %u number_of_records: %u",
                         record_number, number_of_records));

    DBUG_RETURN(-1);
  }
#endif

  check_directory(share, buff, block_size, 0, (uint) -1);
  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, record_number);
  length= uint2korr(dir + 2);  /* Length of entry we just deleted */
  DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size);

  if (record_number == number_of_records - 1)
  {
    /* Delete this entry and all following free directory entries */
    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
    number_of_records--;
    dir+= DIR_ENTRY_SIZE;
    empty_space+= DIR_ENTRY_SIZE;

    /*
      Unlink and free the next empty ones.
      Free entries have offset 0 (dir[0] == dir[1] == 0); dir[2]/dir[3]
      are the prev/next record numbers in the page's free-entry list.
    */
    while (dir < end && dir[0] == 0 && dir[1] == 0)
    {
      number_of_records--;
      if (dir[2] == END_OF_DIR_FREE_LIST)
        buff[DIR_FREE_OFFSET]= dir[3];          /* Was head of free list */
      else
      {
        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
                    number_of_records);
        prev_entry[3]= dir[3];
      }
      if (dir[3] != END_OF_DIR_FREE_LIST)
      {
        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
                    number_of_records);
        next_entry[2]= dir[2];
      }
      dir+= DIR_ENTRY_SIZE;
      empty_space+= DIR_ENTRY_SIZE;
    }

    if (number_of_records == 0)
    {
      /* All entries on page deleted */
      DBUG_PRINT("info", ("Page marked as unallocated"));
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
      {
        /* Zero the removed directory entries for byte-identical recovery */
        dir= dir_entry_pos(buff, block_size, record_number);
        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
      }
#endif
      *empty_space_res= block_size;
      DBUG_RETURN(1);
    }
    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
  }
  else
  {
    /* Update directory: mark entry free and push it on the free list */
    dir[0]= dir[1]= 0;
    dir[2]= END_OF_DIR_FREE_LIST;
    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= record_number;
    }
    buff[DIR_FREE_OFFSET]= record_number;
  }
  empty_space+= length;

  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;

  *empty_space_res= empty_space;

  check_directory(share, buff, block_size, 0, empty_space);
  DBUG_RETURN(0);
}
4159
4160
/*
  Delete a head or tail part

  SYNOPSIS
    delete_head_or_tail()
    info                Maria handler
    page                Page (not file offset!) on which the row is
    head                1 if this is a head page
    from_update         1 if we are called from update. In this case we
                        leave the page as write locked as we may put
                        the new row into the old position.

  RETURN
    0  ok
    1  error
*/
4177
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update)
{
  MARIA_SHARE *share= info->s;
  uint empty_space;
  int res;
  my_bool page_is_empty;
  uchar *buff;
  LSN lsn;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_at_write, lock_at_unpin;
  DBUG_ENTER("delete_head_or_tail");
  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
                       (ulong) ma_recordpos(page, record_number),
                       (ulong) page, record_number));

  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  /* Page stays pinned until the row operation is finalized */
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    DBUG_RETURN(1);
  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              (head ? HEAD_PAGE : TAIL_PAGE));

  if (from_update)
  {
    /* Caller may reuse the freed slot for the new row: keep write lock */
    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
  }
  else
  {
    /* Plain delete: downgrade to a read lock after the change */
    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
  }

  res= delete_dir_entry(share, buff, record_number, &empty_space);
  if (res < 0)
    DBUG_RETURN(1);
  if (res == 0) /* after our deletion, page is still not empty */
  {
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    page_is_empty= 0;
    if (share->now_transactional)
    {
      /* Log REDO data */
      page_store(log_data + FILEID_STORE_SIZE, page);
      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                   record_number);

      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
                                       LOGREC_REDO_PURGE_ROW_TAIL),
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
  }
  else /* page is now empty */
  {
    page_is_empty= 1;
    if (share->now_transactional)
    {
      /* Log that the whole page was freed */
      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
      page_store(log_data + FILEID_STORE_SIZE, page);
      log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
    /*
      Mark that this page must be written to disk by page cache, even
      if we could call pagecache_delete() on it.
      This is needed to ensure that repair finds the empty page on disk
      and not old data.
    */
    pagecache_set_write_on_delete_by_link(page_link.link);
    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
  }

  /* Release the write lock (or downgrade it) but keep the pin */
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           lock_at_write,
                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 1, FALSE);
  page_link.unlock= lock_at_unpin;
  set_dynamic(&info->pinned_pages, (void*) &page_link,
              info->pinned_pages.elements-1);

  DBUG_PRINT("info", ("empty_space: %u", empty_space));

  /*
    If there is not enough space for all possible tails, mark the
    page full
  */
  if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size,
                                                      1 + share->base.blobs))
    empty_space= 0;

  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
}
4291
4292
4293 /*
4294 delete all tails
4295
4296 SYNOPSIS
4297 delete_tails()
4298 info Handler
4299 tails Pointer to vector of tail positions, ending with 0
4300
4301 RETURN
4302 0 ok
4303 1 error
4304 */
4305
delete_tails(MARIA_HA * info,MARIA_RECORD_POS * tails)4306 static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
4307 {
4308 my_bool res= 0;
4309 DBUG_ENTER("delete_tails");
4310 for (; *tails; tails++)
4311 {
4312 if (delete_head_or_tail(info,
4313 ma_recordpos_to_page(*tails),
4314 ma_recordpos_to_dir_entry(*tails), 0, 1))
4315 res= 1;
4316 }
4317 DBUG_RETURN(res);
4318 }
4319
4320
4321 /*
4322 Delete a record
4323
4324 NOTES
4325 For the moment, we assume that info->cur_row.extents is always updated
4326 when a row is read. In the future we may decide to read this on demand
4327 for rows with many splits.
4328 */
4329
my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
{
  pgcache_page_no_t page;
  uint record_number;
  MARIA_SHARE *share= info->s;
  LSN lsn= LSN_IMPOSSIBLE;
  DBUG_ENTER("_ma_delete_block_record");

  /* The row to delete is the one last positioned on (cur_row.lastpos) */
  page= ma_recordpos_to_page(info->cur_row.lastpos);
  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

  _ma_bitmap_flushable(info, 1);
  /* Delete the head part, then all tail parts, then any full blob pages */
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  if (share->now_transactional)
  {
    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                   HA_CHECKSUM_STORE_SIZE];
    uchar *log_pos;
    size_t row_length;
    uint row_parts_count, extents_length;
    ha_checksum checksum_delta;

    /*
      Write UNDO record. Fixed part layout:
      prev undo_lsn | file id | page | dirpos | head data length (2 bytes) |
      extent count | optional checksum delta.
    */
    lsn_store(log_data, info->trn->undo_lsn);
    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
    dirpos_store(log_pos, record_number);
    log_pos+= DIRPOS_STORE_SIZE;
    int2store(log_pos, info->cur_row.head_length -
              info->cur_row.header_length);
    log_pos+= 2;
    pagerange_store(log_pos, info->cur_row.extents_count);
    log_pos+= PAGERANGE_STORE_SIZE;

    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
    /* Deleting subtracts the row's checksum from the table checksum */
    store_checksum_in_rec(share, checksum_delta,
                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                              0].length);
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
      info->cur_row.extents;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

    /* Append the full row image so the delete can be undone */
    row_length= fill_insert_undo_parts(info, record,
                                       (info->log_row_parts +
                                        TRANSLOG_INTERNAL_PARTS + 2),
                                       &row_parts_count);

    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
                              info,
                              (translog_size_t)
                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                                   0].length + row_length +
                               extents_length),
                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
                              info->log_row_parts,
                              log_data + LSN_STORE_SIZE,
                              &checksum_delta))
      goto err;
  }

  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4413
4414
4415 /****************************************************************************
4416 Reading of records
4417 ****************************************************************************/
4418
4419 /*
4420 Read position to record from record directory at end of page
4421
4422 SYNOPSIS
4423 get_record_position()
4424 buff page buffer
4425 block_size block size for page
4426 record_number Record number in index
4427 end_of_data pointer to end of data for record
4428
4429 RETURN
4430 0 Error in data
4431 # Pointer to start of record.
4432 In this case *end_of_data is set.
4433 */
4434
get_record_position(MARIA_SHARE * share,uchar * buff,uint record_number,uchar ** end_of_data)4435 static uchar *get_record_position(MARIA_SHARE *share, uchar *buff,
4436 uint record_number, uchar **end_of_data)
4437 {
4438 uint block_size= share->block_size;
4439 uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
4440 uchar *dir;
4441 uchar *data;
4442 uint offset, length;
4443
4444 #ifdef SANITY_CHECKS
4445 if (record_number >= number_of_records ||
4446 record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE)
4447 / DIR_ENTRY_SIZE))
4448 {
4449 DBUG_PRINT("error",
4450 ("Wrong row number: record_number: %u number_of_records: %u",
4451 record_number, number_of_records));
4452 return 0;
4453 }
4454 #endif
4455
4456 dir= dir_entry_pos(buff, block_size, record_number);
4457 offset= uint2korr(dir);
4458 length= uint2korr(dir + 2);
4459 #ifdef SANITY_CHECKS
4460 if (offset < PAGE_HEADER_SIZE(share) ||
4461 offset + length > (block_size -
4462 number_of_records * DIR_ENTRY_SIZE -
4463 PAGE_SUFFIX_SIZE))
4464 {
4465 DBUG_PRINT("error",
4466 ("Wrong row position: record_number: %u offset: %u "
4467 "length: %u number_of_records: %u",
4468 record_number, offset, length, number_of_records));
4469 return 0;
4470 }
4471 #endif
4472 data= buff + offset;
4473 *end_of_data= data + length;
4474 return data;
4475 }
4476
4477
4478 /*
4479 Init extent
4480
4481 NOTES
4482 extent is a cursor over which pages to read
4483 */
4484
static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
                        uint extents, MARIA_RECORD_POS *tail_positions)
{
  uint count_word;

  /* Position the cursor on the first extent descriptor */
  extent->extent= extent_info;
  extent->extent_count= extents;
  extent->page= page_korr(extent_info);
  /* Page-count word; high bits carry the tail/start-extent flags */
  count_word= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
               ~START_EXTENT_BIT);
  extent->tail= count_word & TAIL_BIT;
  if (!extent->tail)
    extent->page_count= count_word;
  else
  {
    /* A tail extent covers exactly one page; rest is the row number */
    extent->page_count= 1;
    extent->tail_row_nr= count_word & ~TAIL_BIT;
  }
  extent->tail_positions= tail_positions;
  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
}
4505
4506
4507 /*
4508 Read next extent
4509
4510 SYNOPSIS
4511 read_next_extent()
4512 info Maria handler
4513 extent Pointer to current extent (this is updated to point
4514 to next)
4515 end_of_data Pointer to end of data in read block (out)
4516
4517 NOTES
4518 New block is read into info->buff
4519
4520 RETURN
4521 0 Error; my_errno is set
4522 # Pointer to start of data in read block
4523 In this case end_of_data is updated to point to end of data.
4524 */
4525
static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                               uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent");

  /* Current extent exhausted: advance to the next extent descriptor */
  if (!extent->page_count)
  {
    uint page_count;
    if (!--extent->extent_count)
      goto crashed;                             /* No extents left: bad row */
    extent->extent+= ROW_EXTENT_SIZE;
    extent->page= page_korr(extent->extent);
    page_count= (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                 ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;
    extent->tail= page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info",("New extent. Page: %lu page_count: %u tail_flag: %d",
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  /* Tail pages may need a stronger lock (set by caller for UNDO reads) */
  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);
  }

  if (!extent->tail)
  {
    /* Full data page */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                             /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;            /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Remember the tail position so delete/update can free it later */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;                   /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                        /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error", ("wrong extent information"));
  DBUG_RETURN(0);
}
4615
4616
4617 /*
4618 Read data that may be split over many blocks
4619
4620 SYNOPSIS
4621 read_long_data()
4622 info Maria handler
4623 to Store result string here (this is allocated)
4624 extent Pointer to current extent position
4625 data Current position in buffer
4626 end_of_data End of data in buffer
4627
4628 NOTES
4629 When we have to read a new buffer, it's read into info->buff
4630
    This loop is implemented by goto's instead of a for() loop as
    the code is notably smaller and faster this way (and it's not nice
    to jump into a for loop() or into a 'then' clause)
4634
4635 RETURN
4636 0 ok
4637 1 error
4638 */
4639
static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
                               MARIA_EXTENT_CURSOR *extent,
                               uchar **data, uchar **end_of_data)
{
  uint left_length= (uint) (*end_of_data - *data);
  DBUG_ENTER("read_long_data2");
  DBUG_PRINT("enter", ("length: %lu left_length: %u",
                       length, left_length));
  DBUG_ASSERT(*data <= *end_of_data);

  /*
    Fields are never split in middle. This means that if length > rest-of-data
    we should start reading from the next extent. The reason we may have
    data left on the page is that if the fixed part of the row was less than
    min_block_length the head block was extended to min_block_length.

    This may change in the future, which is why we have the loop written
    the way it's written.
  */
  if (extent->first_extent && length > left_length)
  {
    /* Skip the head block's padding and force a read of the next extent */
    *end_of_data= *data;
    left_length= 0;
  }

  for(;;)
  {
    if (unlikely(left_length >= length))
    {
      /* Remainder fits in the current block: copy and stop */
      memcpy(to, *data, length);
      (*data)+= length;
      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
      DBUG_RETURN(0);
    }
    /* Copy what the current block holds, then continue in the next extent */
    memcpy(to, *data, left_length);
    to+= left_length;
    length-= left_length;
    if (!(*data= read_next_extent(info, extent, end_of_data)))
      break;                                    /* Read error or bad extents */
    left_length= (uint) (*end_of_data - *data);
  }
  DBUG_RETURN(1);
}
4683
read_long_data(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4684 static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
4685 MARIA_EXTENT_CURSOR *extent,
4686 uchar **data, uchar **end_of_data)
4687 {
4688 uint left_length= (uint) (*end_of_data - *data);
4689 if (likely(left_length >= length))
4690 {
4691 memcpy(to, *data, length);
4692 (*data)+= length;
4693 return 0;
4694 }
4695 return read_long_data2(info, to, length, extent, data, end_of_data);
4696 }
4697
4698
4699 /*
4700 Read a record from page (helper function for _ma_read_block_record())
4701
4702 SYNOPSIS
4703 _ma_read_block_record2()
4704 info Maria handler
4705 record Store record here
4706 data Start of head data for row
4707 end_of_data End of data for row
4708
4709 NOTES
4710 The head page is already read by caller
    Following data is updated in info->cur_row:
4712
4713 cur_row.head_length is set to size of entry in head block
4714 cur_row.tail_positions is set to point to all tail blocks
4715 cur_row.extents points to extents data
4716 cur_row.extents_counts contains number of extents
4717 cur_row.empty_bits is set to empty bits
4718 cur_row.field_lengths contains packed length of all fields
4719 cur_row.blob_length contains total length of all blobs
4720 cur_row.checksum contains checksum of read record.
4721
4722 RETURN
4723 0 ok
4724 # Error code
4725 */
4726
_ma_read_block_record2(MARIA_HA * info,uchar * record,uchar * data,uchar * end_of_data)4727 int _ma_read_block_record2(MARIA_HA *info, uchar *record,
4728 uchar *data, uchar *end_of_data)
4729 {
4730 MARIA_SHARE *share= info->s;
4731 uchar *UNINIT_VAR(field_length_data), *UNINIT_VAR(blob_buffer), *start_of_data;
4732 uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
4733 my_bool found_blob= 0;
4734 MARIA_EXTENT_CURSOR extent;
4735 MARIA_COLUMNDEF *column, *end_column;
4736 MARIA_ROW *cur_row= &info->cur_row;
4737 myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
4738 DBUG_ENTER("_ma_read_block_record2");
4739
4740 start_of_data= data;
4741 flag= (uint) (uchar) data[0];
4742 cur_null_bytes= share->base.original_null_bytes;
4743 null_bytes= share->base.null_bytes;
4744 cur_row->head_length= (uint) (end_of_data - data);
4745 cur_row->full_page_count= cur_row->tail_count= 0;
4746 cur_row->blob_length= 0;
4747 /* Number of bytes in header that we don't need to write during undo */
4748 cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
4749
4750 if (flag & ROW_FLAG_TRANSID)
4751 {
4752 cur_row->trid= transid_korr(data+1);
4753 if (!info->trn)
4754 {
4755 /* File crashed */
4756 DBUG_ASSERT(!maria_assert_if_crashed_table);
4757 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
4758 DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
4759 }
4760 if (!trnman_can_read_from(info->trn, cur_row->trid))
4761 DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
4762 }
4763
4764 /* Skip trans header (for now, until we have MVCC csupport) */
4765 data+= cur_row->header_length + 1 ;
4766 if (flag & ROW_FLAG_NULLS_EXTENDED)
4767 cur_null_bytes+= data[-1];
4768
4769 row_extents= 0;
4770 if (flag & ROW_FLAG_EXTENTS)
4771 {
4772 uint row_extent_size;
4773 /*
4774 Record is split over many data pages.
4775 Get number of extents and first extent
4776 */
4777 get_key_length(row_extents, data);
4778 cur_row->extents_count= row_extents;
4779 row_extent_size= row_extents * ROW_EXTENT_SIZE;
4780 if (cur_row->extents_buffer_length < row_extent_size &&
4781 _ma_alloc_buffer(&cur_row->extents,
4782 &cur_row->extents_buffer_length,
4783 row_extent_size, myflag))
4784 DBUG_RETURN(my_errno);
4785 memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
4786 data+= ROW_EXTENT_SIZE;
4787 init_extent(&extent, cur_row->extents, row_extents,
4788 cur_row->tail_positions);
4789 }
4790 else
4791 {
4792 cur_row->extents_count= 0;
4793 (*cur_row->tail_positions)= 0;
4794 extent.page_count= 0;
4795 extent.extent_count= 1;
4796 }
4797 extent.first_extent= 1;
4798
4799 field_lengths= 0;
4800 if (share->base.max_field_lengths)
4801 {
4802 get_key_length(field_lengths, data);
4803 cur_row->field_lengths_length= field_lengths;
4804 #ifdef SANITY_CHECKS
4805 if (field_lengths > share->base.max_field_lengths)
4806 goto err;
4807 #endif
4808 }
4809
4810 if (share->calc_checksum)
4811 cur_row->checksum= (uint) (uchar) *data++;
4812 /* data now points on null bits */
4813 memcpy(record, data, cur_null_bytes);
4814 if (unlikely(cur_null_bytes != null_bytes))
4815 {
4816 /*
4817 This only happens if we have added more NULL columns with
4818 ALTER TABLE and are fetching an old, not yet modified old row
4819 */
4820 bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
4821 }
4822 data+= null_bytes;
4823 /* We copy the empty bits to be able to use them for delete/update */
4824 memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
4825 data+= share->base.pack_bytes;
4826
4827 /* TODO: Use field offsets, instead of just skipping them */
4828 data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4829
4830 /*
4831 Read row extents (note that first extent was already read into
4832 cur_row->extents above)
4833 */
4834 if (row_extents > 1)
4835 {
4836 if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
4837 (row_extents - 1) * ROW_EXTENT_SIZE,
4838 &extent, &data, &end_of_data))
4839 DBUG_RETURN(my_errno);
4840 }
4841
4842 /*
4843 Data now points to start of fixed length field data that can't be null
4844 or 'empty'. Note that these fields can't be split over blocks.
4845 */
4846 for (column= share->columndef,
4847 end_column= column + share->base.fixed_not_null_fields;
4848 column < end_column; column++)
4849 {
4850 uint column_length= column->length;
4851 if (data + column_length > end_of_data &&
4852 !(data= read_next_extent(info, &extent, &end_of_data)))
4853 goto err;
4854 memcpy(record + column->offset, data, column_length);
4855 data+= column_length;
4856 }
4857
4858 /* Read array of field lengths. This may be stored in several extents */
4859 if (field_lengths)
4860 {
4861 field_length_data= cur_row->field_lengths;
4862 if (read_long_data(info, field_length_data, field_lengths, &extent,
4863 &data, &end_of_data))
4864 DBUG_RETURN(my_errno);
4865 }
4866
4867 /* Read variable length data. Each of these may be split over many extents */
4868 for (end_column= share->columndef + share->base.fields;
4869 column < end_column; column++)
4870 {
4871 enum en_fieldtype type= column->type;
4872 uchar *field_pos= record + column->offset;
4873 /* First check if field is present in record */
4874 if ((record[column->null_pos] & column->null_bit) ||
4875 (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
4876 {
4877 bfill(record + column->offset, column->fill_length,
4878 type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
4879 continue;
4880 }
4881 switch (type) {
4882 case FIELD_NORMAL: /* Fixed length field */
4883 case FIELD_SKIP_PRESPACE:
4884 case FIELD_SKIP_ZERO: /* Fixed length field */
4885 if (data + column->length > end_of_data &&
4886 !(data= read_next_extent(info, &extent, &end_of_data)))
4887 goto err;
4888 memcpy(field_pos, data, column->length);
4889 data+= column->length;
4890 break;
4891 case FIELD_SKIP_ENDSPACE: /* CHAR */
4892 {
4893 /* Char that is space filled */
4894 uint length;
4895 if (column->length <= 255)
4896 length= (uint) (uchar) *field_length_data++;
4897 else
4898 {
4899 length= uint2korr(field_length_data);
4900 field_length_data+= 2;
4901 }
4902 #ifdef SANITY_CHECKS
4903 if (length > column->length)
4904 goto err;
4905 #endif
4906 if (read_long_data(info, field_pos, length, &extent, &data,
4907 &end_of_data))
4908 DBUG_RETURN(my_errno);
4909 bfill(field_pos + length, column->length - length, ' ');
4910 break;
4911 }
4912 case FIELD_VARCHAR:
4913 {
4914 ulong length;
4915 if (column->length <= 256)
4916 {
4917 length= (uint) (uchar) (*field_pos++= *field_length_data++);
4918 }
4919 else
4920 {
4921 length= uint2korr(field_length_data);
4922 field_pos[0]= field_length_data[0];
4923 field_pos[1]= field_length_data[1];
4924 field_pos+= 2;
4925 field_length_data+= 2;
4926 }
4927 #ifdef SANITY_CHECKS
4928 if (length > column->length)
4929 goto err;
4930 #endif
4931 if (read_long_data(info, field_pos, length, &extent, &data,
4932 &end_of_data))
4933 DBUG_RETURN(my_errno);
4934 break;
4935 }
4936 case FIELD_BLOB:
4937 {
4938 uint column_size_length= column->length - portable_sizeof_char_ptr;
4939 ulong blob_length= _ma_calc_blob_length(column_size_length,
4940 field_length_data);
4941
4942 if (!found_blob)
4943 {
4944 /* Calculate total length for all blobs */
4945 ulong blob_lengths= 0;
4946 uchar *length_data= field_length_data;
4947 MARIA_COLUMNDEF *blob_field= column;
4948
4949 found_blob= 1;
4950 for (; blob_field < end_column; blob_field++)
4951 {
4952 uint size_length;
4953 if ((record[blob_field->null_pos] & blob_field->null_bit) ||
4954 (cur_row->empty_bits[blob_field->empty_pos] &
4955 blob_field->empty_bit))
4956 continue;
4957 size_length= blob_field->length - portable_sizeof_char_ptr;
4958 blob_lengths+= _ma_calc_blob_length(size_length, length_data);
4959 length_data+= size_length;
4960 }
4961 cur_row->blob_length= blob_lengths;
4962 DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
4963 if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
4964 blob_lengths, myflag))
4965 DBUG_RETURN(my_errno);
4966 blob_buffer= info->blob_buff;
4967 }
4968
4969 memcpy(field_pos, field_length_data, column_size_length);
4970 memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
4971 sizeof(char*));
4972 field_length_data+= column_size_length;
4973
4974 /*
4975 After we have read one extent, then each blob is in it's own extent
4976 */
4977 if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
4978 end_of_data= data; /* Force read of next extent */
4979
4980 if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
4981 &end_of_data))
4982 DBUG_RETURN(my_errno);
4983 blob_buffer+= blob_length;
4984 break;
4985 }
4986 default:
4987 #ifdef EXTRA_DEBUG
4988 DBUG_ASSERT(0); /* purecov: deadcode */
4989 #endif
4990 goto err;
4991 }
4992 continue;
4993 }
4994
4995 if (row_extents)
4996 {
4997 DBUG_PRINT("info", ("Row read: page_count: %u extent_count: %u",
4998 extent.page_count, extent.extent_count));
4999 *extent.tail_positions= 0; /* End marker */
5000 if (extent.page_count)
5001 goto err;
5002 if (extent.extent_count > 1)
5003 {
5004 if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
5005 (extent.extent_count-1) * ROW_EXTENT_SIZE))
5006 {
5007 DBUG_PRINT("error", ("Data in extent is not zero"));
5008 DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
5009 (extent.extent_count-1) * ROW_EXTENT_SIZE);
5010 goto err;
5011 }
5012 }
5013 }
5014 else
5015 {
5016 DBUG_PRINT("info", ("Row read"));
5017 /*
      data should normally point to end_of_data. The only exception is if
      the row is very short, in which case we allocated 'min_block_length' data
5020 for allowing the row to expand.
5021 */
5022 if (data != end_of_data && (uint) (end_of_data - start_of_data) >
5023 share->base.min_block_length)
5024 goto err;
5025 }
5026 #ifdef EXTRA_DEBUG
5027 if (share->calc_checksum && !info->in_check_table)
5028 {
    /* Ensure that row checksum is correct */
5030 DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
5031 cur_row->checksum);
5032 }
5033 #endif
5034 info->update|= HA_STATE_AKTIV; /* We have an active record */
5035 DBUG_RETURN(0);
5036
5037 err:
5038 DBUG_ASSERT(!maria_assert_if_crashed_table);
5039 /* Something was wrong with data on record */
5040 DBUG_PRINT("error", ("Found record with wrong data"));
5041 _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
5042 DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
5043 }
5044
5045
5046 /** @brief Read positions to tail blocks and full blocks
5047
5048 @fn read_row_extent_info()
5049 @param info Handler
5050
5051 @notes
5052 This function is a simpler version of _ma_read_block_record2()
5053 The data about the used pages is stored in info->cur_row.
5054
5055 @return Status
5056 @retval 0 ok
5057 @retval 1 Error. my_errno contains error number
5058 */
5059
static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  /* Value is unused; read only to advance 'data' over the length prefix */
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  DBUG_ENTER("read_row_extent_info");

  /* Locate the row inside the head page buffer via its directory entry */
  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    /* Grow the extent buffer if it can't hold all extent entries */
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size, myflag))
      DBUG_RETURN(1);
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths looks unused, but get_key_length() has the side effect
    of advancing 'data', which is required as 'data' is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, empty bits and field offsets to reach extent data */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    pgcache_page_no_t page= uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    /* Extents marked with TAIL_BIT point at a tail; remember its rowid */
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                                 /* End marker */
  DBUG_RETURN(0);
}
5146
5147
5148 /*
5149 Read a record based on record position
5150
5151 @fn _ma_read_block_record()
5152 @param info Maria handler
5153 @param record Store record here
5154 @param record_pos Record position
5155
5156 @return Status
5157 @retval 0 ok
5158 @retval # Error number
5159 */
5160
int _ma_read_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page= ma_recordpos_to_page(record_pos);
  uint dir_entry= ma_recordpos_to_dir_entry(record_pos);
  uchar *page_buff, *row_data, *end_of_data;
  DBUG_ENTER("_ma_read_block_record");
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) record_pos, (ulong) page, dir_entry));

  /* Fetch the head page that should contain the row */
  if (!(page_buff= pagecache_read(share->pagecache,
                                  &info->dfile, page, 0,
                                  info->buff, share->page_type,
                                  PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);

  /*
    Unallocated page access can happen if this is an access to a page where
    all rows were deleted as part of this statement.
  */
  DBUG_ASSERT((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE ||
              (page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              UNALLOCATED_PAGE);

  if ((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == UNALLOCATED_PAGE ||
      !(row_data= get_record_position(share, page_buff, dir_entry,
                                      &end_of_data)))
  {
    DBUG_ASSERT(!maria_assert_if_crashed_table);
    DBUG_PRINT("warning", ("Wrong directory entry in data block"));
    my_errno= HA_ERR_RECORD_DELETED;            /* File crashed */
    DBUG_RETURN(HA_ERR_RECORD_DELETED);
  }
  /* Unpack the row data into 'record' */
  DBUG_RETURN(_ma_read_block_record2(info, record, row_data, end_of_data));
}
5200
5201
5202 /* compare unique constraint between stored rows */
5203
_ma_cmp_block_unique(MARIA_HA * info,MARIA_UNIQUEDEF * def,const uchar * record,MARIA_RECORD_POS pos)5204 my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
5205 const uchar *record, MARIA_RECORD_POS pos)
5206 {
5207 uchar *org_rec_buff, *old_record;
5208 size_t org_rec_buff_size;
5209 int error;
5210 my_bool buff_alloced;
5211 DBUG_ENTER("_ma_cmp_block_unique");
5212
5213 alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced,
5214 info->s->base.reclength);
5215 if (!old_record)
5216 DBUG_RETURN(1);
5217
5218 /* Don't let the compare destroy blobs that may be in use */
5219 org_rec_buff= info->rec_buff;
5220 org_rec_buff_size= info->rec_buff_size;
5221 if (info->s->base.blobs)
5222 {
5223 /* Force realloc of record buffer*/
5224 info->rec_buff= 0;
5225 info->rec_buff_size= 0;
5226 }
5227 error= _ma_read_block_record(info, old_record, pos);
5228 if (!error)
5229 error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
5230 if (info->s->base.blobs)
5231 {
5232 my_free(info->rec_buff);
5233 info->rec_buff= org_rec_buff;
5234 info->rec_buff_size= org_rec_buff_size;
5235 }
5236 DBUG_PRINT("exit", ("result: %d", error));
5237 stack_alloc_free(old_record, buff_alloced);
5238 DBUG_RETURN(error != 0);
5239 }
5240
5241
5242 /****************************************************************************
5243 Table scan
5244 ****************************************************************************/
5245
5246 /*
5247 Allocate buffers for table scan
5248
5249 SYNOPSIS
5250 _ma_scan_init_block_record(MARIA_HA *info)
5251
5252 IMPLEMENTATION
5253 We allocate one buffer for the current bitmap and one buffer for the
5254 current page
5255
5256 RETURN
5257 0 ok
5258 1 error (couldn't allocate memory or disk error)
5259 */
5260
_ma_scan_init_block_record(MARIA_HA * info)5261 my_bool _ma_scan_init_block_record(MARIA_HA *info)
5262 {
5263 MARIA_SHARE *share= info->s;
5264 myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
5265 DBUG_ENTER("_ma_scan_init_block_record");
5266 DBUG_ASSERT(info->dfile.file == share->bitmap.file.file);
5267
5268 /*
5269 bitmap_buff may already be allocated if this is the second call to
5270 rnd_init() without a rnd_end() in between, see sql/handler.h
5271 */
5272 if (!(info->scan.bitmap_buff ||
5273 ((info->scan.bitmap_buff=
5274 (uchar *) my_malloc(PSI_INSTRUMENT_ME, share->block_size * 2,
5275 flag)))))
5276 DBUG_RETURN(1);
5277 info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
5278 info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size;
5279
5280 /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
5281 info->scan.number_of_rows= 0;
5282 info->scan.bitmap_pos= info->scan.bitmap_end;
5283 info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
5284 info->scan.max_page= share->state.state.data_file_length / share->block_size;
5285 /*
5286 We need to flush what's in memory (bitmap.map) to page cache otherwise, as
5287 we are going to read bitmaps from page cache in table scan (see
5288 _ma_scan_block_record()), we may miss recently inserted rows (bitmap page
5289 in page cache would be too old).
5290 */
5291 DBUG_RETURN(_ma_bitmap_flush(info->s));
5292 }
5293
5294
5295 /* Free buffers allocated by _ma_scan_block_init() */
5296
void _ma_scan_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_end_block_record");
  /* Release the combined bitmap+page buffer from _ma_scan_init_block_record */
  my_free(info->scan.bitmap_buff);
  info->scan.bitmap_buff= NULL;
  if (info->scan_save != NULL)
  {
    /* Also drop any remembered scan position */
    my_free(info->scan_save);
    info->scan_save= NULL;
  }
  DBUG_VOID_RETURN;
}
5309
5310
5311 /**
5312 @brief Save current scan position
5313
5314 @note
5315 For the moment we can only remember one position, but this is
5316 good enough for MySQL usage
5317
5318 @return
5319 @retval 0 ok
5320 @retval HA_ERR_WRONG_IN_RECORD Could not allocate memory to hold position
5321 */
5322
int _ma_scan_remember_block_record(MARIA_HA *info,
                                   MARIA_RECORD_POS *lastpos)
{
  uchar *saved_bitmap;
  DBUG_ENTER("_ma_scan_remember_block_record");

  if (info->scan_save == NULL)
  {
    /* First call: allocate save area plus room for bitmap and page buffers */
    if (!(info->scan_save= my_malloc(PSI_INSTRUMENT_ME,
                                     ALIGN_SIZE(sizeof(*info->scan_save)) +
                                     info->s->block_size * 2,
                                     MYF(MY_WME))))
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    /* The buffer copies live directly after the save structure */
    info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
                                   ALIGN_SIZE(sizeof(*info->scan_save)));
  }
  /* For checking if pages have changed since we last read them */
  info->scan.row_changes= info->row_changes;

  /* Remember used bitmap and used head page */
  saved_bitmap= info->scan_save->bitmap_buff;
  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
  info->scan_save->bitmap_buff= saved_bitmap;
  memcpy(saved_bitmap, info->scan.bitmap_buff, info->s->block_size * 2);

  /* Point to the last read row */
  *lastpos= info->cur_row.nextpos - 1;
  info->scan_save->dir+= DIR_ENTRY_SIZE;
  DBUG_RETURN(0);
}
5352
5353
5354 /**
  @brief Restore scan block to its original values
5356
5357 @return
5358 0 ok
5359 # error
5360
5361 @note
5362 In theory we could swap bitmap buffers instead of copy them.
5363 For the moment we don't do that because there are variables pointing
5364 inside the buffers and it's a bit of hassle to either make them relative
5365 or repoint them.
5366
5367 If the data file has changed, we will re-read the new block record
5368 to ensure that when we continue scanning we can ignore any deleted rows.
5369 */
5370
int _ma_scan_restore_block_record(MARIA_HA *info,
                                  MARIA_RECORD_POS lastpos)
{
  uchar *own_bitmap;
  DBUG_ENTER("_ma_scan_restore_block_record");

  info->cur_row.nextpos= lastpos;
  /* Copy the saved state back, but keep our own bitmap buffer pointer */
  own_bitmap= info->scan.bitmap_buff;
  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
  info->scan.bitmap_buff= own_bitmap;
  memcpy(own_bitmap, info->scan_save->bitmap_buff, info->s->block_size * 2);

  if (info->scan.row_changes == info->row_changes)
    DBUG_RETURN(0);

  /*
    Table has been changed. We have to re-read the current page block as
    data may have changed on it that we have to see.
  */
  if (!(pagecache_read(info->s->pagecache,
                       &info->dfile,
                       ma_recordpos_to_page(info->scan.row_base_page),
                       0, info->scan.page_buff,
                       info->s->page_type,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  info->scan.number_of_rows=
    (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET];
  info->scan.dir_end= (info->scan.page_buff + info->s->block_size -
                       PAGE_SUFFIX_SIZE -
                       info->scan.number_of_rows * DIR_ENTRY_SIZE);
  DBUG_RETURN(0);
}
5404
5405
5406 /*
5407 Read next record while scanning table
5408
5409 SYNOPSIS
5410 _ma_scan_block_record()
5411 info Maria handler
5412 record Store found here
5413 record_pos Value stored in info->cur_row.next_pos after last call
5414 This is offset inside the current pagebuff
5415 skip_deleted
5416
5417 NOTES
5418 - One must have called mi_scan() before this
    - In this version, we don't actually need record_pos; we could just as
      easily use a variable in info->scan
5421
5422 IMPLEMENTATION
5423 Current code uses a lot of goto's to separate the different kind of
5424 states we may be in. This gives us a minimum of executed if's for
5425 the normal cases. I tried several different ways to code this, but
5426 the current one was in the end the most readable and fastest.
5427
5428 RETURN
5429 0 ok
5430 # Error code (Normally HA_ERR_END_OF_FILE)
5431 */
5432
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record");

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /* Ensure that scan.dir and record_pos are in sync */
    DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff,
                                                share->block_size,
                                                (uint) record_pos));

    /* Search for a valid directory entry (not 0) */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      /* Entry is 0 -> row slot is unused; step to the next entry */
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        goto err;
      }
#endif
    }
    /*
      This should always be true as the directory should always start with
      a valid entry.
    */
    DBUG_ASSERT(info->scan.dir >= info->scan.dir_end);

    /* found row */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;          /* Point to next row to process */
#ifdef SANITY_CHECKS
    /* Separate asserts so a debug build pinpoints which check failed */
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE(share) ||
        length < share->base.min_block_length)
    {
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share)));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* Rows not visible to this transaction are silently skipped */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data= info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos= info->scan.bit_pos;

    do
    {
      /* Walk the 3-bit page patterns of the current 6-byte bitmap word */
      while (likely(bits))
      {
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /* Each 6-byte bitmap word covers 16 pages (16 x 3-bit entries) */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info", ("Found end of file"));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning",
                       ("Found page of type %d when expecting head page",
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            continue;
          }
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error", ("Wrong page header"));
            _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
            DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
          }
          DBUG_PRINT("info", ("Page %lu has %u rows",
                              (ulong) page, info->scan.number_of_rows));
          /* Directory grows downwards from the end of the page */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != 07777777777777777LL)
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  if (unlikely(info->scan.bitmap_page >= info->scan.max_page))
  {
    DBUG_PRINT("info", ("Found end of file"));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info", ("Reading bitmap at %lu",
                      (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /* Skip scanning 'bits' in bitmap scan code */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("Wrong data on page"));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5603
5604
5605 /*
5606 Compare a row against a stored one
5607
5608 NOTES
5609 Not implemented, as block record is not supposed to be used in a shared
5610 global environment
5611 */
5612
my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
                                 const uchar *record __attribute__ ((unused)))
{
  return 0;                 /* Always "equal"; intentionally not implemented */
}
5618
5619
5620 /*
5621 Store an integer with simple packing
5622
5623 SYNOPSIS
    ma_store_length()
5625 to Store the packed integer here
5626 nr Integer to store
5627
5628 NOTES
5629 This is mostly used to store field numbers and lengths of strings.
    We have to cast the result for the LL() because of a bug in Forte CC
    compiler.
5632
5633 Packing used is:
5634 nr < 251 is stored as is (in 1 byte)
5635 Numbers that require 1-4 bytes are stored as char(250+byte_length), data
5636 Bigger numbers are stored as 255, data as ulonglong (not yet done).
5637
5638 RETURN
5639 Position in 'to' after the packed length
5640 */
5641
uchar *ma_store_length(uchar *to, ulong nr)
{
  /* Values below 251 are stored as-is in a single byte */
  if (nr < 251)
  {
    to[0]= (uchar) nr;
    return to + 1;
  }
  /* 251..255: marker byte 251 followed by one data byte */
  if (nr <= 255)
  {
    to[0]= (uchar) 251;
    to[1]= (uchar) nr;
    return to + 2;
  }
  /* Marker 252 + 2 bytes */
  if (nr < 65536)
  {
    to[0]= (uchar) 252;
    int2store(to + 1, nr);
    return to + 3;
  }
  /* Marker 253 + 3 bytes */
  if (nr < 16777216)
  {
    to[0]= (uchar) 253;
    int3store(to + 1, nr);
    return to + 4;
  }
  /* Marker 254 + 4 bytes */
  to[0]= (uchar) 254;
  int4store(to + 1, nr);
  return to + 5;
}
5671
5672
5673 /* Calculate how many bytes needed to store a number */
5674
ma_calc_length_for_store_length(ulong nr)5675 uint ma_calc_length_for_store_length(ulong nr)
5676 {
5677 if (nr < 251)
5678 return 1;
5679 if (nr < 65536)
5680 {
5681 if (nr <= 255)
5682 return 2;
5683 return 3;
5684 }
5685 if (nr < 16777216)
5686 return 4;
5687 return 5;
5688 }
5689
5690
/* Retrieve a stored number */
5692
ma_get_length(const uchar ** packet)5693 static ulong ma_get_length(const uchar **packet)
5694 {
5695 reg1 const uchar *pos= *packet;
5696 if (*pos < 251)
5697 {
5698 (*packet)++;
5699 return (ulong) *pos;
5700 }
5701 if (*pos == 251)
5702 {
5703 (*packet)+= 2;
5704 return (ulong) pos[1];
5705 }
5706 if (*pos == 252)
5707 {
5708 (*packet)+= 3;
5709 return (ulong) uint2korr(pos+1);
5710 }
5711 if (*pos == 253)
5712 {
5713 (*packet)+= 4;
5714 return (ulong) uint3korr(pos+1);
5715 }
5716 DBUG_ASSERT(*pos == 254);
5717 (*packet)+= 5;
5718 return (ulong) uint4korr(pos+1);
5719 }
5720
5721
5722 /*
5723 Fill array with pointers to field parts to be stored in log for insert
5724
5725 SYNOPSIS
5726 fill_insert_undo_parts()
5727 info Maria handler
5728 record Inserted row
5729 log_parts Store pointers to changed memory areas here
5730 log_parts_count See RETURN
5731
5732 NOTES
5733 We have information in info->cur_row about the read row.
5734
5735 RETURN
5736 length of data in log_parts.
5737 log_parts_count contains number of used log_parts
5738 */
5739
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  uchar *field_lengths= info->cur_row.field_lengths;
  size_t row_length;
  MARIA_ROW *cur_row= &info->cur_row;
  LEX_CUSTRING *start_log_parts;
  DBUG_ENTER("fill_insert_undo_parts");

  start_log_parts= log_parts;

  /* Store null bits */
  log_parts->str= record;
  log_parts->length= share->base.null_bytes;
  row_length= log_parts->length;
  log_parts++;

  /* Stored bitmap over packed (zero length or all-zero fields) */
  log_parts->str= info->cur_row.empty_bits;
  log_parts->length= share->base.pack_bytes;
  row_length+= log_parts->length;
  log_parts++;

  if (share->base.max_field_lengths)
  {
    /* Store length of all not empty char, varchar and blob fields */
    /* The two bytes before field_lengths hold the total lengths size */
    log_parts->str= field_lengths - 2;
    log_parts->length= info->cur_row.field_lengths_length+2;
    int2store(log_parts->str, info->cur_row.field_lengths_length);
    row_length+= log_parts->length;
    log_parts++;
  }

  if (share->base.blobs)
  {
    /*
      Store total blob length to make buffer allocation easier during UNDO
    */
    log_parts->str= info->length_buff;
    log_parts->length= (uint) (ma_store_length(info->length_buff,
                                               info->cur_row.blob_length) -
                               (uchar*) log_parts->str);
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    log_parts->str= record + column->offset;
    log_parts->length= column->length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column;
       column++)
  {
    const uchar *column_pos;
    size_t column_length;
    /* NULL or empty fields are not logged; they are implied by the bits */
    if ((record[column->null_pos] & column->null_bit) ||
        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
      continue;

    column_pos= record+ column->offset;
    column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Log only the significant prefix; its length is in field_lengths */
      if (column->length <= 255)
        column_length= *field_lengths++;
      else
      {
        column_length= uint2korr(field_lengths);
        field_lengths+= 2;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      /* Length prefix is part of the record itself for VARCHAR */
      if (column->fill_length == 1)
        column_length= *field_lengths;
      else
        column_length= uint2korr(field_lengths);
      field_lengths+= column->fill_length;
      column_pos+= column->fill_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
    log_parts->str= column_pos;
    log_parts->length= column_length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Add blobs */
  for (end_column+= share->base.blobs; column < end_column; column++)
  {
    const uchar *field_pos= record + column->offset;
    uint size_length= column->length - portable_sizeof_char_ptr;
    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

    /*
      We don't have to check for null, as blob_length is guaranteed to be 0
      if the blob is null
    */
    if (blob_length)
    {
      uchar *blob_pos;
      /* The blob data pointer is stored after the length bytes */
      memcpy(&blob_pos, record + column->offset + size_length,
             sizeof(blob_pos));
      log_parts->str= blob_pos;
      log_parts->length= blob_length;
      row_length+= log_parts->length;
      log_parts++;
    }
  }
  *log_parts_count= (uint) (log_parts - start_log_parts);
  DBUG_RETURN(row_length);
}
5877
5878
5879 /*
5880 Fill array with pointers to field parts to be stored in log for update
5881
5882 SYNOPSIS
5883 fill_update_undo_parts()
5884 info Maria handler
5885 oldrec Original row
5886 newrec New row
5887 log_parts Store pointers to changed memory areas here
5888 log_parts_count See RETURN
5889
5890 IMPLEMENTATION
5891 Format of undo record:
5892
5893 Fields are stored in same order as the field array.
5894
5895 Offset to changed field data (packed)
5896
5897 For each changed field
5898 Fieldnumber (packed)
5899 Length, if variable length field (packed)
5900
5901 For each changed field
5902 Data
5903
5904 Packing is using ma_store_integer()
5905
5906 The reason we store field numbers & length separated from data (ie, not
5907 after each other) is to get better cpu caching when we loop over
5908 fields (as we probably don't have to access data for each field when we
    want to read an old row through the undo log record).
5910
5911 As a special case, we use '255' for the field number of the null bitmap.
5912
5913 RETURN
5914 length of data in log_parts.
5915 log_parts_count contains number of used log_parts
5916 */
5917
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
  uchar *field_data, *start_field_data, *length_str;
  uchar *old_field_lengths= old_row->field_lengths;
  uchar *new_field_lengths= new_row->field_lengths;
  size_t row_length= 0;
  uint field_lengths;
  LEX_CUSTRING *start_log_parts;
  my_bool new_column_is_empty;
  DBUG_ENTER("fill_update_undo_parts");

  start_log_parts= log_parts;

  /*
    First log part is for number of fields, field numbers and lengths
    The +4 is to reserve place for the number of changed fields.
  */
  start_field_data= field_data= info->update_field_data + 4;
  log_parts++;

  if (memcmp(oldrec, newrec, share->base.null_bytes))
  {
    /* Store changed null bits; 255 is the reserved pseudo field number */
    *field_data++= (uchar) 255;           /* Special case */
    log_parts->str= oldrec;
    log_parts->length= share->base.null_bytes;
    row_length= log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    if (memcmp(oldrec + column->offset, newrec + column->offset,
               column->length))
    {
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      log_parts->str= oldrec + column->offset;
      log_parts->length= column->length;
      row_length+= column->length;
      log_parts++;
    }
  }

  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++)
  {
    const uchar *new_column_pos, *old_column_pos;
    size_t new_column_length, old_column_length;

    /* First check if old column is null or empty */
    if (oldrec[column->null_pos] & column->null_bit)
    {
      /*
        It's safe to skip this one as either the new column is also null
        (no change) or the new_column is not null, in which case the null-bit
        maps differed and we have already stored the null bitmap.
      */
      continue;
    }
    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
        continue;                             /* Both are empty; skip */

      /* Store null length column */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      field_data= ma_store_length(field_data, 0);
      continue;
    }
    /*
      Remember if the 'new' value is empty (as in this case we must always
      log the original value)
    */
    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
                          (new_row->empty_bits[column->empty_pos] &
                           column->empty_bit));

    old_column_pos= oldrec + column->offset;
    new_column_pos= newrec + column->offset;
    old_column_length= new_column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_VARCHAR:
      new_column_length--;                      /* Skip length prefix */
      old_column_pos+= column->fill_length;
      new_column_pos+= column->fill_length;
      /* Fall through */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Length prefix is 1 byte for short columns, 2 bytes otherwise */
      if (new_column_length <= 255)
      {
        old_column_length= *old_field_lengths++;
        if (!new_column_is_empty)
          new_column_length= *new_field_lengths++;
      }
      else
      {
        old_column_length= uint2korr(old_field_lengths);
        old_field_lengths+= 2;
        if (!new_column_is_empty)
        {
          new_column_length= uint2korr(new_field_lengths);
          new_field_lengths+= 2;
        }
      }
      break;
    }
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
      memcpy((void*) &old_column_pos, oldrec + column->offset + size_length,
             sizeof(old_column_pos));
      if (!new_column_is_empty)
      {
        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
        /*
          Fixed: use sizeof of the destination object (new_column_pos);
          previously sizeof(old_column_pos) was used, which only worked
          because both are pointers of the same size.
        */
        memcpy((void*) &new_column_pos, newrec + column->offset + size_length,
               sizeof(new_column_pos));
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }

    /* Log the old value if the column changed (or the new value is empty) */
    if (new_column_is_empty || new_column_length != old_column_length ||
        memcmp(old_column_pos, new_column_pos, new_column_length))
    {
      field_data= ma_store_length(field_data,
                                  (ulong) (column - share->columndef));
      field_data= ma_store_length(field_data, (ulong) old_column_length);

      log_parts->str= old_column_pos;
      log_parts->length= old_column_length;
      row_length+= old_column_length;
      log_parts++;
    }
  }

  *log_parts_count= (uint) (log_parts - start_log_parts);

  /* Store length of field length data before the field/field_lengths */
  field_lengths= (uint) (field_data - start_field_data);
  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
  start_log_parts->str= length_str;
  ma_store_length(length_str, field_lengths);
  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
  row_length+= start_log_parts->length;
  DBUG_RETURN(row_length);
}
6088
6089 /***************************************************************************
6090 In-write hooks called under log's lock when log record is written
6091 ***************************************************************************/
6092
6093 /**
6094 @brief Sets transaction's rec_lsn if needed
6095
6096 A transaction sometimes writes a REDO even before the page is in the
6097 pagecache (example: brand new head or tail pages; full pages). So, if
6098 Checkpoint happens just after the REDO write, it needs to know that the
6099 REDO phase must start before this REDO. Scanning the pagecache cannot
6100 tell that as the page is not in the cache. So, transaction sets its rec_lsn
6101 to the REDO's LSN or somewhere before, and Checkpoint reads the
6102 transaction's rec_lsn.
6103
6104 @return Operation status, always 0 (success)
6105 */
6106
write_hook_for_redo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6107 my_bool write_hook_for_redo(enum translog_record_type type
6108 __attribute__ ((unused)),
6109 TRN *trn, MARIA_HA *tbl_info
6110 __attribute__ ((unused)),
6111 LSN *lsn, void *hook_arg
6112 __attribute__ ((unused)))
6113 {
6114 /*
6115 Users of dummy_transaction_object must keep this TRN clean as it
6116 is used by many threads (like those manipulating non-transactional
6117 tables). It might be dangerous if one user sets rec_lsn or some other
6118 member and it is picked up by another user (like putting this rec_lsn into
6119 a page of a non-transactional table); it's safer if all members stay 0. So
6120 non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
6121 call this hook; we trust them but verify ;)
6122 */
6123 DBUG_ASSERT(trn->trid != 0);
6124 /*
6125 If the hook stays so simple, it would be faster to pass
6126 !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
6127 to translog_write_record(), like Monty did in his original code, and not
6128 have a hook. For now we keep it like this.
6129 */
6130 if (trn->rec_lsn == 0)
6131 trn->rec_lsn= *lsn;
6132 return 0;
6133 }
6134
6135
6136 /**
6137 @brief Sets transaction's undo_lsn, first_undo_lsn if needed
6138
6139 @return Operation status, always 0 (success)
6140 */
6141
write_hook_for_undo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6142 my_bool write_hook_for_undo(enum translog_record_type type
6143 __attribute__ ((unused)),
6144 TRN *trn, MARIA_HA *tbl_info
6145 __attribute__ ((unused)),
6146 LSN *lsn, void *hook_arg
6147 __attribute__ ((unused)))
6148 {
6149 DBUG_ASSERT(trn->trid != 0);
6150 trn->undo_lsn= *lsn;
6151 if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
6152 trn->first_undo_lsn=
6153 trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
6154 return 0;
6155 /*
6156 when we implement purging, we will specialize this hook: UNDO_PURGE
6157 records will additionally set trn->undo_purge_lsn
6158 */
6159 }
6160
6161
6162 /**
6163 @brief Sets the table's records count and checksum and others to 0, then
6164 calls the generic REDO hook.
6165
6166 @return Operation status, always 0 (success)
6167 */
6168
write_hook_for_redo_delete_all(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6169 my_bool write_hook_for_redo_delete_all(enum translog_record_type type
6170 __attribute__ ((unused)),
6171 TRN *trn, MARIA_HA *tbl_info
6172 __attribute__ ((unused)),
6173 LSN *lsn, void *hook_arg)
6174 {
6175 _ma_reset_status(tbl_info);
6176 return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
6177 }
6178
6179
6180 /**
6181 @brief Updates "records" and "checksum" and calls the generic UNDO hook
6182
6183 @return Operation status, always 0 (success)
6184 */
6185
write_hook_for_undo_row_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6186 my_bool write_hook_for_undo_row_insert(enum translog_record_type type
6187 __attribute__ ((unused)),
6188 TRN *trn, MARIA_HA *tbl_info,
6189 LSN *lsn, void *hook_arg)
6190 {
6191 MARIA_SHARE *share= tbl_info->s;
6192 share->state.state.records++;
6193 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6194 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6195 }
6196
6197
6198 /**
6199 @brief Updates "records" and calls the generic UNDO hook
6200
6201 @return Operation status, always 0 (success)
6202 */
6203
write_hook_for_undo_row_delete(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6204 my_bool write_hook_for_undo_row_delete(enum translog_record_type type
6205 __attribute__ ((unused)),
6206 TRN *trn, MARIA_HA *tbl_info,
6207 LSN *lsn, void *hook_arg)
6208 {
6209 MARIA_SHARE *share= tbl_info->s;
6210 share->state.state.records--;
6211 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6212 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6213 }
6214
6215
6216 /**
  @brief Updates "records" and "checksum" and calls the generic UNDO hook
6218
6219 @return Operation status, always 0 (success)
6220 */
6221
write_hook_for_undo_row_update(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6222 my_bool write_hook_for_undo_row_update(enum translog_record_type type
6223 __attribute__ ((unused)),
6224 TRN *trn, MARIA_HA *tbl_info,
6225 LSN *lsn, void *hook_arg)
6226 {
6227 MARIA_SHARE *share= tbl_info->s;
6228 share->state.state.checksum+= *(ha_checksum *)hook_arg;
6229 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6230 }
6231
6232
write_hook_for_undo_bulk_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6233 my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
6234 __attribute__ ((unused)),
6235 TRN *trn, MARIA_HA *tbl_info,
6236 LSN *lsn, void *hook_arg)
6237 {
6238 /*
6239 We are going to call maria_delete_all_rows(), but without logging and
6240 syncing, as an optimization (if we crash before commit, the UNDO will
6241 empty; if we crash after commit, we have flushed and forced the files).
6242 Status still needs to be reset under log mutex, in case of a concurrent
6243 checkpoint.
6244 */
6245 _ma_reset_status(tbl_info);
6246 return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6247 }
6248
6249
6250 /**
6251 @brief Updates table's lsn_of_file_id.
6252
6253 @return Operation status, always 0 (success)
6254 */
6255
write_hook_for_file_id(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6256 my_bool write_hook_for_file_id(enum translog_record_type type
6257 __attribute__ ((unused)),
6258 TRN *trn
6259 __attribute__ ((unused)),
6260 MARIA_HA *tbl_info,
6261 LSN *lsn,
6262 void *hook_arg
6263 __attribute__ ((unused)))
6264 {
6265 DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
6266 tbl_info->s->lsn_of_file_id= *lsn;
6267 return 0;
6268 }
6269
6270
6271 /**
6272 Updates transaction's rec_lsn when committing.
6273
6274 A transaction writes its commit record before being committed in trnman, so
6275 if Checkpoint happens just between the COMMIT record log write and the
6276 commit in trnman, it will record that transaction is not committed. Assume
6277 the transaction (trn1) did an INSERT; after the checkpoint, a second
6278 transaction (trn2) does a DELETE of what trn1 has inserted. Then crash,
6279 Checkpoint record says that trn1 was not committed, and REDO phase starts
6280 from Checkpoint record's LSN. So it will not find the COMMIT record of
6281 trn1, will want to roll back trn1, which will fail because the row/key
6282 which it wants to delete does not exist anymore.
6283 To avoid this, Checkpoint needs to know that the REDO phase must start
6284 before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
6285 record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
6286 will know.
6287
6288 @note so after commit trn->rec_lsn is a "commit LSN", which could be of
6289 use later.
6290
6291 @return Operation status, always 0 (success)
6292 */
6293
my_bool write_hook_for_commit(enum translog_record_type type
                              __attribute__ ((unused)),
                              TRN *trn,
                              MARIA_HA *tbl_info
                              __attribute__ ((unused)),
                              LSN *lsn,
                              void *hook_arg
                              __attribute__ ((unused)))
{
  /*
    Record the COMMIT record's LSN so that a concurrent Checkpoint knows the
    REDO phase must start before this record (see block comment above).
    After commit, rec_lsn thus serves as a "commit LSN".
  */
  trn->rec_lsn= *lsn;
  return 0;
}
6306
6307
6308 /***************************************************************************
6309 Applying of REDO log records
6310 ***************************************************************************/
6311
6312 /*
6313 Apply changes to head and tail pages
6314
6315 SYNOPSIS
6316 _ma_apply_redo_insert_row_head_or_tail()
6317 info Maria handler
6318 lsn LSN to put on page
6319 page_type HEAD_PAGE or TAIL_PAGE
6320 new_page True if this is first entry on page
6321 header Header (without FILEID)
6322 data Data to be put on page
6323 data_length Length of data
6324
6325 NOTE
6326 Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
6327 LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
6328
6329 RETURN
6330 0 ok
6331 # Error number
6332 */
6333
uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                            uint page_type,
                                            my_bool new_page,
                                            const uchar *header,
                                            const uchar *data,
                                            size_t data_length)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uint block_size= share->block_size;
  uint rec_offset;
  uchar *buff, *dir;
  uint result;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_method;
  enum pagecache_page_pin pin_method;
  my_off_t end_of_page;
  uint error;
  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");

  /* Decode target page number and directory position from the REDO header */
  page= page_korr(header);
  rownr= dirpos_korr(header + PAGE_STORE_SIZE);

  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u data_length: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr, (uint) data_length));

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  end_of_page= (page + 1) * share->block_size;
  if (end_of_page > share->state.state.data_file_length)
  {
    DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                        (ulong) share->state.state.data_file_length,
                        (ulong) end_of_page));
    /*
      New page at end of file. Note that the test above is also positive if
      data_file_length is not a multiple of block_size (system crashed while
      writing the last page): in this case we just extend the last page and
      fill it entirely with zeroes, then the REDO will put correct data on
      it.
    */
    lock_method= PAGECACHE_LOCK_WRITE;
    pin_method= PAGECACHE_PIN;

    /* A page past EOF must be a brand-new page, and rows start at slot 0 */
    DBUG_ASSERT(rownr == 0 && new_page);
    if (rownr != 0 || !new_page)
      goto crashed_file;

    /* Build the page in keyread_buff; it is written to the cache below */
    buff= info->keyread_buff;
    info->keyread_buff_used= 1;
    make_empty_page(info, buff, page_type, 1);
    empty_space= (block_size - PAGE_OVERHEAD_SIZE(share));
    rec_offset= PAGE_HEADER_SIZE(share);
    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
  }
  else
  {
    /* Page exists inside the file: read it (already locked/pinned for us) */
    lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    pin_method= PAGECACHE_PIN_LEFT_PINNED;

    /* Suppress error reporting: read failures may be expected for new pages */
    share->pagecache->readwrite_flags&= ~MY_WME;
    share->silence_encryption_errors= 1;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         page, 0, 0,
                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                         &page_link.link);
    share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
    share->silence_encryption_errors= 0;
    if (!buff)
    {
      /* Skip errors when reading outside of file and uninitialized pages */
      if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
                        my_errno != HA_ERR_WRONG_CRC &&
                        my_errno != HA_ERR_DECRYPTION_FAILED))
      {
        DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
        goto err;
      }
      /* Create new page */
      buff= pagecache_block_link_to_buffer(page_link.link);
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
    }
    else if (lsn_korr(buff) >= lsn) /* Test if already applied */
    {
      /* Page already carries this (or a later) LSN: REDO is idempotent */
      check_skipped_lsn(info, lsn_korr(buff), 1, page);
      /* Fix bitmap, just in case */
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0; /* Page is full */

      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
        goto err;
      pagecache_unlock_by_link(share->pagecache, page_link.link,
                               PAGECACHE_LOCK_WRITE_UNLOCK,
                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                               LSN_IMPOSSIBLE, 0, FALSE);
      DBUG_RETURN(0);
    }

    if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
    {
      /*
        This is a page that has been freed before and now should be
        changed to new type.
      */
      if (!new_page)
      {
        DBUG_PRINT("error",
                   ("Found page of wrong type: %u, should have been %u",
                    (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
                    page_type));
        goto crashed_file;
      }
      make_empty_page(info, buff, page_type, 0);
      empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
      (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space,
                              page_type == HEAD_PAGE);
      rec_offset= PAGE_HEADER_SIZE(share);
      dir= dir_entry_pos(buff, block_size, rownr);
      empty_space+= uint2korr(dir+2);
    }
    else
    {
      /* Existing page of the right type: make room for the row at rownr */
      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
      uint length;

      DBUG_ASSERT(!new_page);
      dir= dir_entry_pos(buff, block_size, rownr);
      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

      if (max_entry <= rownr)
      {
        /* Add directory entry first in directory and data last on page */
        if (extend_directory(info, buff, block_size, max_entry, rownr,
                             &empty_space, page_type == HEAD_PAGE))
          goto crashed_file;
      }
      if (extend_area_on_page(info, buff, dir, rownr,
                              (uint) data_length, &empty_space,
                              &rec_offset, &length, page_type == HEAD_PAGE))
        goto crashed_file;
    }
  }
  /* Copy data */
  int2store(dir+2, data_length);
  memcpy(buff + rec_offset, data, data_length);
  empty_space-= (uint) data_length;
  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);

  /* Fix bitmap */
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0; /* Page is full */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    goto err;

  /*
    If page was not read before, write it but keep it pinned.
    We don't update its LSN. When we have processed all REDOs for this page
    in the current REDO's group, we will stamp the page with the UNDO's LSN
    (if we stamped it now, a next REDO, in
    this group, for this page, would be skipped) and unpin then.
  */
  result= 0;
  if (lock_method == PAGECACHE_LOCK_WRITE &&
      pagecache_write(share->pagecache,
                      &info->dfile, page, 0,
                      buff, PAGECACHE_PLAIN_PAGE,
                      lock_method, pin_method,
                      PAGECACHE_WRITE_DELAY, &page_link.link,
                      LSN_IMPOSSIBLE))
    result= my_errno;

  /* Remember the pinned page so the caller can unlock the whole group */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  /*
    Data page and bitmap page are in place, we can update data_file_length in
    case we extended the file. We could not do it earlier: bitmap code tests
    data_file_length to know if it has to create a new page or not.
  */
  set_if_bigger(share->state.state.data_file_length, end_of_page);
  DBUG_RETURN(result);

crashed_file:
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
err:
  error= my_errno;
  /* Only unlock if we did not take the lock ourselves in this function */
  if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */
  DBUG_RETURN((my_errno= error));
}
6534
6535
6536 /*
6537 Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
6538
6539 SYNOPSIS
6540 _ma_apply_redo_purge_row_head_or_tail()
6541 info Maria handler
6542 lsn LSN to put on page
6543 page_type HEAD_PAGE or TAIL_PAGE
6544 header Header (without FILEID)
6545
6546 NOTES
6547 This function is very similar to delete_head_or_tail()
6548
6549 RETURN
6550 0 ok
6551 # Error number
6552 */
6553
uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                           uint page_type,
                                           const uchar *header)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint rownr, empty_space;
  uchar *buff;
  int result;
  uint error;
  MARIA_PINNED_PAGE page_link;
  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");

  /* Decode page number and directory slot to purge from the REDO header */
  page= page_korr(header);
  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
    goto err;

  if (lsn_korr(buff) >= lsn)
  {
    /*
      Already applied
      Note that in case the page is not anymore a head or tail page
      a future redo will fix the bitmap.
    */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
    {
      /* Resync the bitmap with the page's current free space */
      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0; /* Page is full */
      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
                         empty_space))
        goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    DBUG_RETURN(0);
  }

  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);

  /* Remove the directory entry; empty_space gets the resulting free space */
  if (delete_dir_entry(share, buff, rownr, &empty_space) < 0)
  {
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /* Keep the page pinned; caller stamps the LSN and unpins the group later */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  result= 0;
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0; /* Page is full */
  /* This will work even if the page was marked as UNALLOCATED_PAGE */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    result= my_errno;

  DBUG_RETURN(result);

err:
  error= my_errno;
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN((my_errno= error));

}
6638
6639
6640 /**
6641 @brief Apply LOGREC_REDO_FREE_BLOCKS
6642
6643 @param info Maria handler
6644 @param header Header (without FILEID)
6645
6646 Mark the pages free in the bitmap.
6647
6648 We have to check against _ma_redo_not_needed_for_page()
6649 to guard against the case where we first clear a block and after
6650 that insert new data into the blocks. If we would unconditionally
6651 clear the bitmap here, future changes would be ignored for the page
6652 if it's not in the dirty list (ie, it would be flushed).
6653
6654 @return Operation status
6655 @retval 0 OK
6656 @retval 1 Error
6657 */
6658
uint _ma_apply_redo_free_blocks(MARIA_HA *info,
                                LSN lsn __attribute__((unused)),
                                LSN redo_lsn,
                                const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uint ranges;
  uint16 sid;
  DBUG_ENTER("_ma_apply_redo_free_blocks");

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Header layout: file id, number of page ranges, then the ranges */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges > 0);

  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  while (ranges--)
  {
    my_bool res;
    uint page_range;
    pgcache_page_no_t page, start_page;

    start_page= page= page_korr(header);
    header+= PAGE_STORE_SIZE;
    /* Page range may have this bit set to indicate a tail page */
    page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
    DBUG_ASSERT(page_range > 0);

    header+= PAGERANGE_STORE_SIZE;

    DBUG_PRINT("info", ("page: %lu pages: %u", (long) page, page_range));

    /* Clear the bitmap bits page by page within the range */
    for ( ; page_range-- ; start_page++)
    {
      /*
        Guard against clear-then-reuse: if a later REDO already covers this
        page we must not wipe its bitmap bits (see function comment above).
      */
      if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE))
        continue;
      res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
                                           1);
      if (res)
      {
        /* Release the bitmap lock before marking the table crashed */
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        _ma_mark_file_crashed(share);
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        DBUG_RETURN(res);
      }
    }
  }
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  DBUG_RETURN(0);
}
6714
6715
6716 /**
6717 @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
6718
6719 @param info Maria handler
6720 @param header Header (without FILEID)
6721
6722 @note It marks the page free in the bitmap, and sets the directory's count
6723 to 0.
6724
6725 @return Operation status
6726 @retval 0 OK
6727 @retval 1 Error
6728 */
6729
uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                      const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uchar *buff;
  pgcache_page_no_t page;
  MARIA_PINNED_PAGE page_link;
  my_bool res;
  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");

  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  page= page_korr(header);

  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE,
                             PAGECACHE_LOCK_WRITE, &page_link.link)))
  {
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    goto err;
  }
  if (lsn_korr(buff) >= lsn)
  {
    /* Already applied */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  }
  else
  {
    /* Mark the page as free; its directory becomes irrelevant */
    buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
    {
      /* Zero out the directory so recovered pages compare byte-identical */
      uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
      uchar *dir= dir_entry_pos(buff, share->block_size,
                                number_of_records-1);
      buff[DIR_FREE_OFFSET]= END_OF_DIR_FREE_LIST;
      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
    }
#endif

    /* Keep the page pinned; caller stamps the LSN and unpins later */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= 1;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  if (res)
    goto err;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6796
6797
6798 /**
6799 @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
6800
6801 @param info Maria handler
  @param lsn         LSN to put on pages
6803 @param header Header (with FILEID)
6804 @param redo_lsn REDO record's LSN
6805 @param[out] number_of_blobs Number of blobs found in log record
6806 @param[out] number_of_ranges Number of ranges found
6807 @param[out] first_page First page touched
6808 @param[out] last_page Last page touched
6809
6810 @note Write full pages (full head & blob pages)
6811
6812 @return Operation status
6813 @retval 0 OK
6814 @retval !=0 Error
6815 */
6816
uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                     LSN lsn, const uchar *header,
                                     LSN redo_lsn,
                                     uint * const number_of_blobs,
                                     uint * const number_of_ranges,
                                     pgcache_page_no_t * const first_page,
                                     pgcache_page_no_t * const last_page)
{
  MARIA_SHARE *share= info->s;
  const uchar *data;
  uint data_size= FULL_PAGE_SIZE(share);
  uint blob_count, ranges;
  uint16 sid;
  /* Running min/max of pages touched; copied to the [out] args on success */
  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
  DBUG_ENTER("_ma_apply_redo_insert_row_blobs");

  /* The table is being modified by recovery; it is no longer zerofilled */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /*
    Parse the fixed part of the log record header:
    file id, total number of page ranges, then number of blobs
  */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  *number_of_ranges= ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  *number_of_blobs= blob_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges >= blob_count);

  /* Page payload data follows the extent and sub-range descriptors */
  data= (header + ranges * ROW_EXTENT_SIZE +
         blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));

  while (blob_count--)
  {
    uint sub_ranges, empty_space;

    /* Per-blob descriptor: sub-range count and empty space on last page */
    sub_ranges= uint2korr(header);
    header+= SUB_RANGE_SIZE;
    empty_space= uint2korr(header);
    header+= BLOCK_FILLER_SIZE;
    DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
    ranges-= sub_ranges;

    while (sub_ranges--)
    {
      uint i;
      uint res;
      uint page_range;
      pgcache_page_no_t page;
      uchar *buff;
      uint data_on_page= data_size;

      /* A sub-range is a run of consecutive pages starting at 'page' */
      page= page_korr(header);
      header+= PAGE_STORE_SIZE;
      page_range= pagerange_korr(header);
      header+= PAGERANGE_STORE_SIZE;

      for (i= page_range; i-- > 0 ; page++, data+= data_on_page)
      {
        MARIA_PINNED_PAGE page_link;
        enum pagecache_page_lock unlock_method;
        enum pagecache_page_pin unpin_method;

        set_if_smaller(first_page2, page);
        set_if_bigger(last_page2, page);
        if (i == 0 && sub_ranges == 0)
          data_on_page= data_size - empty_space; /* data on last page */
        /* Skip the page if its content is already covered by a later state */
        if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
          continue;

        if (((page + 1) * share->block_size) >
            share->state.state.data_file_length)
        {
          /* New page or half written page at end of file */
          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                              (ulong) share->state.state.data_file_length,
                              (ulong) ((page + 1 ) * share->block_size)));
          share->state.state.data_file_length= (page + 1) * share->block_size;
          buff= info->keyread_buff;
          info->keyread_buff_used= 1;
          make_empty_page(info, buff, BLOB_PAGE, 0);
          /* Page was never in the cache; nothing to unlock or unpin later */
          unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
          unpin_method= PAGECACHE_PIN_LEFT_UNPINNED;
        }
        else
        {
          /*
            Suppress warnings/encryption errors for this read: a short or
            garbled page is an expected situation during recovery and is
            handled below.
          */
          share->pagecache->readwrite_flags&= ~MY_WME;
          share->silence_encryption_errors= 1;
          buff= pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, 0,
                               PAGECACHE_PLAIN_PAGE,
                               PAGECACHE_LOCK_WRITE, &page_link.link);
          share->pagecache->readwrite_flags= share->pagecache->
            org_readwrite_flags;
          share->silence_encryption_errors= 0;
          if (!buff)
          {
            if (my_errno != HA_ERR_FILE_TOO_SHORT &&
                my_errno != HA_ERR_WRONG_CRC &&
                my_errno != HA_ERR_DECRYPTION_FAILED)
            {
              /* If not read outside of file */
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              goto err;
            }
            /*
              Physical file was too short, create new page. It can be that
              recovery started with a file with N pages, wrote page N+2 into
              pagecache (increased data_file_length but not physical file
              length), now reads page N+1: the read fails.
            */
            buff= pagecache_block_link_to_buffer(page_link.link);
            make_empty_page(info, buff, BLOB_PAGE, 0);
          }
          else
          {
#ifdef DBUG_ASSERT_EXISTS
            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
#endif
            if (lsn_korr(buff) >= lsn)
            {
              /* Already applied */
              check_skipped_lsn(info, lsn_korr(buff), 1, page);
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              /* Still ensure the bitmap marks this page as full */
              goto fix_bitmap;
            }
            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
                        (found_page_type == (uchar) UNALLOCATED_PAGE));
          }
          unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
          unpin_method= PAGECACHE_UNPIN;
        }

        /*
          Blob pages are never updated twice in same redo-undo chain, so
          it's safe to update lsn for them here
        */
        lsn_store(buff, lsn);
        buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
        bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
              FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));

        if (data_on_page != data_size)
        {
          /*
            Last page may be only partly filled. We zero the rest, like
            write_full_pages() does.
          */
          bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
                empty_space);
        }
        memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page);
        if (pagecache_write(share->pagecache,
                            &info->dfile, page, 0,
                            buff, PAGECACHE_PLAIN_PAGE,
                            unlock_method, unpin_method,
                            PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
          goto err;

    fix_bitmap:
      /** @todo leave bitmap lock to the bitmap code... */
        mysql_mutex_lock(&share->bitmap.bitmap_lock);
        res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page,
                                           1);
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        if (res)
          goto err;
      }
    }
  }
  *first_page= first_page2;
  *last_page= last_page2;
  DBUG_RETURN(0);

err:
  /* Recovery failed: mark the table as needing repair */
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
7001
7002
7003 /****************************************************************************
7004 Applying of UNDO entries
7005 ****************************************************************************/
7006
7007 /** Execute undo of a row insert (delete the inserted row) */
7008
my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header)
{
  pgcache_page_no_t page;
  uint rownr;
  uchar *buff;
  my_bool res;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum checksum;
  LSN lsn;
  DBUG_ENTER("_ma_apply_undo_row_insert");

  /* Header layout: page number, then directory position within the page */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* Read and write-lock the head page of the row to be deleted */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0,
                       0, share->page_type,
                       PAGECACHE_LOCK_WRITE,
                       &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  /* Register the pin even on read failure so the unpin-all below releases it */
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  /* Collect the row's extents/tails into info->cur_row for deletion */
  if (read_row_extent_info(info, buff, rownr))
    goto err;

  _ma_bitmap_flushable(info, 1);
  /* Remove the head entry and all tail parts of the row */
  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  /* Free any full pages the row occupied through extents */
  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  /* Negate the stored checksum so the delete cancels the insert's delta */
  checksum= 0;
  if (share->calc_checksum)
    checksum= (ha_checksum) 0 - ha_checksum_korr(header);
  info->last_auto_increment= ~ (ulonglong) 0;
  /* Log a CLR so this undo is not redone/undone again */
  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
                    share->calc_checksum != 0, checksum, &lsn, (void*) 0))
    goto err;

  res= 0;
end:
  /* The following is true only if _ma_bitmap_flushable() was called earlier */
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  res= 1;
  _ma_mark_file_crashed(share);
  /*
    Don't write a new LSN on the used pages. Not important as the file is
    marked as crashed and need to be repaired before it can be used.
  */
  lsn= LSN_IMPOSSIBLE;
  goto end;
}
7079
7080
7081 /** Execute undo of a row delete (insert the row back where it was) */
7082
my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header, size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_ROW row;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_BITMAP_BLOCKS *blocks;
  struct st_row_pos_info row_pos;
  uchar *record;
  const uchar *null_bits, *field_length_data, *extent_info;
  pgcache_page_no_t page;
  ulong *blob_lengths;
  uint *null_field_lengths, extent_count, rownr, length_on_head_page;
  DBUG_ENTER("_ma_apply_undo_row_delete");

  /*
    Use cur row as a base; We need to make a copy as we will change
    some buffers to point directly to 'header'
  */
  memcpy(&row, &info->cur_row, sizeof(row));

  /* Parse fixed part of the undo header: rowid, head length, extent count */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  length_on_head_page= uint2korr(header);
  header+= 2;
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  if (share->calc_checksum)
  {
    /*
      We extract the checksum delta here, saving a recomputation in
      allocate_and_write_block_record(). It's only an optimization.
    */
    row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  null_field_lengths= row.null_field_lengths;
  blob_lengths= row.blob_lengths;

  /*
    Fill in info->cur_row with information about the row, like in
    calc_record_size(), to be used by write_block_record()
  */

  row.normal_length= row.char_length= row.varchar_length=
    row.blob_length= row.extents_count= row.field_lengths_length= 0;

  /* Null bitmap and empty bits are stored next in the header */
  null_bits= header;
  header+= share->base.null_bytes;
  /* This will not be changed */
  row.empty_bits= (uchar*) header;
  header+= share->base.pack_bytes;
  if (share->base.max_field_lengths)
  {
    row.field_lengths_length= uint2korr(header);
    row.field_lengths= (uchar*) header + 2 ;
    header+= 2 + row.field_lengths_length;
  }
  if (share->base.blobs)
    row.blob_length= ma_get_length(&header);

  /* We need to build up a record (without blobs) in rec_buff */
  if (!(record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength,
                          MYF(MY_WME))))
    DBUG_RETURN(1);

  memcpy(record, null_bits, share->base.null_bytes);

  /* Copy field information from header to record */

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
         end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    memcpy(record + column->offset, header, column->length);
    header+= column->length;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  field_length_data= row.field_lengths;
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++, null_field_lengths++)
  {
    /* NULL or empty field: nothing stored in the header for it */
    if ((record[column->null_pos] & column->null_bit) ||
        row.empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      if (share->calc_checksum)
        bfill(record + column->offset, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      row.normal_length+= column->length;
      *null_field_lengths= column->length;
      memcpy(record + column->offset, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint length;
      /* Length is stored as 1 or 2 bytes depending on max column length */
      if (column->length <= 255)
        length= (uint) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
      row.char_length+= length;
      *null_field_lengths= length;
      memcpy(record + column->offset, header, length);
      if (share->calc_checksum)
        bfill(record + column->offset + length, (column->length - length),
              ' ');
      header+= length;
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length;
      uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      if (column->fill_length == 1)
      {
        field_pos[0]= *field_length_data;
        length= (uint) *field_length_data;
      }
      else
      {
        field_pos[0]= field_length_data[0];
        field_pos[1]= field_length_data[1];
        length= uint2korr(field_length_data);
      }
      field_length_data+= column->fill_length;
      field_pos+= column->fill_length;
      row.varchar_length+= length;
      *null_field_lengths= length;
      memcpy(field_pos, header, length);
      header+= length;
      break;
    }
    case FIELD_BLOB:
    {
      /* Copy length of blob and pointer to blob data to record */
      uchar *field_pos= record + column->offset;
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);

      memcpy(field_pos, field_length_data, size_length);
      field_length_data+= size_length;
      /* Blob data stays in 'header'; store a pointer to it in the record */
      memcpy(field_pos + size_length, &header, sizeof(header));
      header+= blob_length;
      *blob_lengths++= blob_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  /* Recompute row lengths the same way calc_record_size() would */
  row.head_length= (info->row_base_length +
                    share->base.fixed_not_null_fields_length +
                    row.field_lengths_length +
                    size_to_store_key_length(row.field_lengths_length) +
                    row.normal_length +
                    row.char_length + row.varchar_length);
  row.total_length= (row.head_length + row.blob_length);
  if (row.total_length < share->base.min_block_length)
    row.total_length= share->base.min_block_length;

  /*
    Row is now generated. Now we need to insert record on the original
    pages with original size on each page.
  */

  _ma_bitmap_flushable(info, 1);
  /* Change extent information to be usable by write_block_record() */
  blocks= &row.insert_blocks;
  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
    goto err;
  blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
                                                            &share->bitmap,
                                                            page);
  blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;

  /* Read head page and allocate data for rowid */
  if (get_rowpos_in_head_or_tail_page(info, blocks->block,
                                      info->buff,
                                      length_on_head_page,
                                      HEAD_PAGE, PAGECACHE_LOCK_WRITE,
                                      rownr, &row_pos))
    goto err;

  if (share->calc_checksum)
  {
    /* Verify that the rebuilt record matches the stored checksum delta */
    DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
  }
  /* Store same amount of data on head page as on original page */
  row_pos.length= (length_on_head_page -
                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
  set_if_bigger(row_pos.length, share->base.min_block_length);
  if (write_block_record(info, (uchar*) 0, record, &row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
    goto err;

  my_free(record);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_mark_file_crashed(share);
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_free(record);
  DBUG_RETURN(1);
}
7322
7323
7324 /**
7325 Execute undo of a row update
7326
7327 @fn _ma_apply_undo_row_update()
7328
7329 @return Operation status
7330 @retval 0 OK
7331 @retval 1 Error
7332 */
7333
my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header,
                                  size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_RECORD_POS record_pos;
  const uchar *field_length_data, *field_length_data_end, *extent_info;
  uchar *current_record, *orig_record;
  pgcache_page_no_t page;
  ha_checksum UNINIT_VAR(checksum_delta);
  uint rownr, field_length_header, extent_count, length_on_head_page;
  int error;
  DBUG_ENTER("_ma_apply_undo_row_update");

  /* Parse fixed part of the undo header: rowid of the updated row */
  page= page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;

  record_pos= ma_recordpos(page, rownr);
  DBUG_PRINT("enter", ("rowid: %lu page: %lu rownr: %u",
                       (ulong) record_pos, (ulong) page, rownr));

  if (share->calc_checksum)
  {
    checksum_delta= ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  length_on_head_page= uint2korr(header);
  set_if_bigger(length_on_head_page, share->base.min_block_length);
  header+= 2;
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  /*
    Set header to point to old field values, generated by
    fill_update_undo_parts()
  */
  field_length_header= ma_get_length(&header);
  field_length_data= (uchar*) header;
  header+= field_length_header;
  field_length_data_end= header;

  /* Allocate buffer for current row & original row */
  if (!(current_record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength * 2,
                                  MYF(MY_WME))))
    DBUG_RETURN(1);
  orig_record= current_record+ share->base.reclength;

  /* Read current record */
  if (_ma_read_block_record(info, current_record, record_pos))
    goto err;

  /* 255 is a marker meaning the null bitmap changed and is stored in full */
  if (*field_length_data == 255)
  {
    /* Bitmap changed */
    field_length_data++;
    memcpy(orig_record, header, share->base.null_bytes);
    header+= share->base.null_bytes;
  }
  else
    memcpy(orig_record, current_record, share->base.null_bytes);
  bitmap_clear_all(&info->changed_fields);

  /* Restore each changed field's old value into orig_record */
  while (field_length_data < field_length_data_end)
  {
    uint field_nr= ma_get_length(&field_length_data), field_length;
    MARIA_COLUMNDEF *column= share->columndef + field_nr;
    uchar *orig_field_pos= orig_record + column->offset;

    bitmap_set_bit(&info->changed_fields, field_nr);
    if (field_nr >= share->base.fixed_not_null_fields)
    {
      if (!(field_length= ma_get_length(&field_length_data)))
      {
        /* Null field or empty field */
        bfill(orig_field_pos, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
        continue;
      }
    }
    else
      field_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      memcpy(orig_field_pos, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Number */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Stored value may be shorter than the column; pad with space or 0 */
      uint diff;
      memcpy(orig_field_pos, header, field_length);
      if ((diff= (column->length - field_length)))
        bfill(orig_field_pos + column->length - diff, diff,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      header+= field_length;
    }
    break;
    case FIELD_VARCHAR:
      /* Write the 1- or 2-byte length prefix, then the data */
      if (column->length <= 256)
      {
        *orig_field_pos++= (uchar) field_length;
      }
      else
      {
        int2store(orig_field_pos, field_length);
        orig_field_pos+= 2;
      }
      memcpy(orig_field_pos, header, field_length);
      header+= field_length;
      break;
    case FIELD_BLOB:
    {
      /* Store blob length and a pointer into 'header' where the data lives */
      uint size_length= column->length - portable_sizeof_char_ptr;
      _ma_store_blob_length(orig_field_pos, size_length, field_length);
      memcpy(orig_field_pos + size_length, &header, sizeof(header));
      header+= field_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  /* Fields not in the undo record keep their current values */
  copy_not_changed_fields(info, &info->changed_fields,
                          orig_record, current_record);

  if (share->calc_checksum)
  {
    info->new_row.checksum= checksum_delta +
      (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
    /* verify that record's content is sane */
    DBUG_ASSERT(info->new_row.checksum ==
                (*share->calc_checksum)(info, current_record));
  }

  info->last_auto_increment= ~ (ulonglong) 0;
  /* Now records are up to date, execute the update to original values */
  if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
                                   extent_count, extent_info,
                                   current_record, orig_record, undo_lsn))
    goto err;

  error= 0;
end:
  my_free(current_record);
  DBUG_RETURN(error);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  error= 1;
  _ma_mark_file_crashed(share);
  goto end;
}
7495
7496
7497 /**
7498 Execute undo of a bulk insert which used repair
7499
7500 @return Operation status
7501 @retval 0 OK
7502 @retval 1 Error
7503 */
7504
_ma_apply_undo_bulk_insert(MARIA_HA * info,LSN undo_lsn)7505 my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
7506 {
7507 my_bool error;
7508 LSN lsn;
7509 DBUG_ENTER("_ma_apply_undo_bulk_insert");
7510 /*
7511 We delete all rows, re-enable indices as bulk insert had disabled
7512 non-unique ones.
7513 */
7514 error= (maria_delete_all_rows(info) ||
7515 maria_enable_indexes(info) ||
7516 /* we enabled indices so need '2' below */
7517 _ma_state_info_write(info->s,
7518 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
7519 MA_STATE_INFO_WRITE_FULL_INFO |
7520 MA_STATE_INFO_WRITE_LOCK) ||
7521 _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
7522 FALSE, 0, &lsn, NULL));
7523 DBUG_RETURN(error);
7524 }
7525
7526
7527 /**
7528 @brief Get the TRANSLOG_ADDRESS to flush up to
7529
7530 @param page Page's content
7531 @param page_no Page's number (<offset>/<page length>)
7532 @param data_ptr Callback data pointer (pointer to MARIA_SHARE)
7533
7534 @note
7535 Usable for data (non-bitmap) and index pages
7536
7537 @retval LSN to flush up to
7538 */
7539
7540 TRANSLOG_ADDRESS
maria_page_get_lsn(uchar * page,pgcache_page_no_t page_no,uchar * data_ptr)7541 maria_page_get_lsn(uchar *page,
7542 pgcache_page_no_t page_no __attribute__((unused)),
7543 uchar* data_ptr __attribute__((unused)))
7544 {
7545 #ifndef DBUG_OFF
7546 const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
7547 DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
7548 share->now_transactional);
7549 #endif
7550 return lsn_korr(page);
7551 }
7552
7553
7554 /**
7555 @brief Enable reading of all rows, ignoring versioning
7556
7557 @note
7558 This is mainly useful in single user applications, like maria_pack,
7559 where we want to be able to read all rows without having to read the
7560 transaction id from the control file
7561 */
7562
void maria_ignore_trids(MARIA_HA *info)
{
  if (!info->s->base.born_transactional)
    return;                                     /* Non-transactional: no-op */

  /* Make sure a transaction object exists before touching it */
  if (info->trn == NULL)
    _ma_set_tmp_trn_for_table(info, &dummy_transaction_object);
  /* Ignore transaction id when row is read */
  info->trn->min_read_from= ~(TrID) 0;
}
7573
7574
7575 #ifndef DBUG_OFF
7576
7577 /* The following functions are useful to call from debugger */
7578
/* Dump a data page's header fields and directory; debugger convenience */
void _ma_print_block_info(MARIA_SHARE *share, uchar *buff)
{
  LSN page_lsn= lsn_korr(buff);
  uint dir_entries= (uint) buff[DIR_COUNT_OFFSET];

  printf("LSN: " LSN_FMT " type: %u dir_entries: %u dir_free: %u empty_space: %u\n",
         LSN_IN_PARTS(page_lsn),
         (uint) buff[PAGE_TYPE_OFFSET],
         dir_entries,
         (uint) buff[DIR_FREE_OFFSET],
         (uint) uint2korr(buff + EMPTY_SPACE_OFFSET));
  /* The directory grows backwards from the end of the page */
  printf("Start of directory: %lu\n",
         maria_block_size - PAGE_SUFFIX_SIZE - dir_entries * DIR_ENTRY_SIZE);
  _ma_print_directory(share, stdout, buff, maria_block_size);
}
7594 #endif
7595