1 /* Copyright (C) 2007-2008 Michael Widenius
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /*
17   Storage of records in block
18 
19   Some clarifications about the abbrev used:
20 
  NULL fields      -> Fields that may contain a NULL value.
22   Not null fields  -> Fields that may not contain a NULL value.
23   Critical fields  -> Fields that can't be null and can't be dropped without
24 		      causing a table reorganization.
25 
26 
27   Maria will have a LSN at start of each page (excluding the bitmap pages)
28 
29   The different page types that are in a data file are:
30 
31   Bitmap pages     Map of free pages in the next extent (8192 page size
32                    gives us 256M of mapped pages / bitmap)
33   Head page        Start of rows are stored on this page.
34                    A rowid always points to a head page
35   Blob page        This page is totally filled with data from one blob or by
36                    a set of long VARCHAR/CHAR fields
37   Tail page        This contains the last part from different rows, blobs
38                    or varchar fields.
39 
40   The data file starts with a bitmap page, followed by as many data
41   pages as the bitmap can cover. After this there is a new bitmap page
42   and more data pages etc.
43 
44   For information about the bitmap page, see ma_bitmap.c
45 
46   Structure of data and tail page:
47 
48   The page has a row directory at end of page to allow us to do deletes
49   without having to reorganize the page.  It also allows us to later store
50   some more bytes after each row to allow them to grow without having to move
51   around other rows.
52 
53   Page header:
54 
55   LSN            7 bytes   Log position for last page change
56   PAGE_TYPE      1 uchar   0 unalloced / 1 for head / 2 for tail / 3 for blob
57   DIR_COUNT      1 uchar   Number of row/tail entries on page
58   FREE_DIR_LINK  1 uchar   Pointer to first free director entry or 255 if no
59   empty space    2 bytes   Bytes of empty space on page
60 
61   The most significant bit in PAGE_TYPE is set to 1 if the data on the page
62   can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
63 
64   Row data
65 
66   Row directory of NO entries, that consist of the following for each row
67   (in reverse order; i.e., first record is stored last):
68 
69   Position     2 bytes Position of row on page
70   Length       2 bytes Length of entry
71 
72   For Position and Length, the 1 most significant bit of the position and
73   the 1 most significant bit of the length could be used for some states of
74   the row (in other words, we should try to keep these reserved)
75 
76   Position is 0 if the entry is not used.  In this case length[0] points
77   to a previous free entry (255 if no previous entry) and length[1]
78   to the next free entry (or 255 if last free entry). This works because
79   the directory entry 255 can never be marked free (if the first directory
  entry is freed, the directory is shrunk).
81 
82   checksum     4 bytes  Reserved for full page read testing and live backup.
83 
84   ----------------
85 
86   Structure of blob pages:
87 
88   LSN          7 bytes  Log position for last page change
89   PAGE_TYPE    1 uchar   3
90 
91   data
92 
93   -----------------
94 
95   Row data structure:
96 
97   Flag                          1 uchar   Marker of which header field exists
98   TRANSID                       6 bytes  TRANSID of changing transaction
99                                          (optional, added on insert and first
100                                          update/delete)
101   VER_PTR                       7 bytes  Pointer to older version in log
102                                          (undo record)
103                                          (optional, added after first
104                                          update/delete)
105   DELETE_TRANSID                6 bytes  (optional). TRANSID of original row.
106                                          Added on delete.
107   Nulls_extended                1 uchar   To allow us to add new DEFAULT NULL
108                                          fields (optional, added after first
109                                          change of row after alter table)
110   Number of ROW_EXTENT's        1-3 uchar Length encoded, optional
111                                          This is the number of extents the
112                                          row is split into
113   First row_extent              7 uchar  Pointer to first row extent (optional)
114 
115   Total length of length array  1-3 uchar Only used if we have
116                                          char/varchar/blob fields.
117   Row checksum		        1 uchar   Only if table created with checksums
118   Null_bits             ..      One bit for each NULL field (a field that may
119 				have the value NULL)
120   Empty_bits            ..      One bit for each field that may be 'empty'.
121 				(Both for null and not null fields).
122                                 This bit is 1 if the value for the field is
123                                 0 or empty string.
124 
125   field_offsets                 2 byte/offset
126                                   For each 32'th field, there is one offset
127                                   that points to where the field information
128                                   starts in the block. This is to provide
129                                   fast access to later field in the row
130                                   when we only need to return a small
131                                   set of fields.
132                                   TODO: Implement this.
133 
134   Things marked above as 'optional' will only be present if the
135   corresponding bit is set in 'Flag' field.  Flag gives us a way to
136   get more space on a page when doing page compaction as we don't need
137   to store TRANSID that have committed before the smallest running
138   transaction we have in memory.
139 
140   Data in the following order:
141   (Field order is precalculated when table is created)
142 
143   Critical fixed length, not null, fields. (Note, these can't be dropped)
144   Fixed length, null fields
145 
146   Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
147   Number of bytes used in length array per entry is depending on max length
148   for field.
149 
150   ROW_EXTENT's
151   CHAR data (space stripped)
152   VARCHAR data
153   BLOB data
154 
155   Fields marked in null_bits or empty_bits are not stored in data part or
156   length array.
157 
158   If row doesn't fit into the given block, then the first EXTENT will be
159   stored last on the row. This is done so that we don't break any field
160   data in the middle.
161 
162   We first try to store the full row into one block. If that's not possible
163   we move out each big blob into their own extents. If this is not enough we
164   move out a concatenation of all varchars to their own extent.
165 
166   Each blob and the concatenated char/varchar fields are stored the following
167   way:
168   - Store the parts in as many full-contiguous pages as possible.
169   - The last part, that doesn't fill a full page, is stored in tail page.
170 
171   When doing an insert of a new row, we don't have to have
172   VER_PTR in the row. This will make rows that are not changed stored
173   efficiently. On update and delete we would add TRANSID (if it was an old
174   committed row) and VER_PTR to
175   the row. On row page compaction we can easily detect rows where
176   TRANSID was committed before the longest running transaction
177   started and we can then delete TRANSID and VER_PTR from the row to
178   gain more space.
179 
180   If a row is deleted in Maria, we change TRANSID to the deleting
181   transaction's id, change VER_PTR to point to the undo record for the delete,
182   and add DELETE_TRANSID (the id of the transaction which last
183   inserted/updated the row before its deletion). DELETE_TRANSID allows an old
184   transaction to avoid reading the log to know if it can see the last version
185   before delete (in other words it reduces the probability of having to follow
186   VER_PTR). TODO: depending on a compilation option, evaluate the performance
187   impact of not storing DELETE_TRANSID (which would make the row smaller).
188 
189   Description of the different parts:
190 
191   Flag is coded as:
192 
193   Description           bit
194   TRANS_ID_exists       0
195   VER_PTR_exists        1
196   Row is deleted        2       (Means that DELETE_TRANSID exists)
197   Nulls_extended_exists 3
198   Row is split          7       This means that 'Number_of_row_extents' exists
199 
200   Nulls_extended is the number of new DEFAULT NULL fields in the row
201   compared to the number of DEFAULT NULL fields when the first version
202   of the table was created.  If Nulls_extended doesn't exist in the row,
203   we know it's 0 as this must be one of the original rows from when the
  table was created first time.  This coding allows us to add 255*8 =
  2040 new fields without requiring a full alter table.
206 
207   Empty_bits is used to allow us to store 0, 0.0, empty string, empty
208   varstring and empty blob efficiently. (This is very good for data
209   warehousing where NULL's are often regarded as evil). Having this
210   bitmap also allows us to drop information of a field during a future
211   delete if field was deleted with ALTER TABLE DROP COLUMN.  To be able
212   to handle DROP COLUMN, we must store in the index header the fields
213   that has been dropped. When unpacking a row we will ignore dropped
214   fields. When storing a row, we will mark a dropped field either with a
215   null in the null bit map or in the empty_bits and not store any data
216   for it.
217   TODO: Add code for handling dropped fields.
218 
219 
220   A ROW EXTENT is range of pages. One ROW_EXTENT is coded as:
221 
222   START_PAGE            5 bytes
223   PAGE_COUNT            2 bytes.  Bit 16 is set if this is a tail page.
224                                   Bit 15 is to set if this is start of a new
225                                   blob extent.
226 
227   With 8K pages, we can cover 256M in one extent. This coding gives us a
228   maximum file size of 2^40*8192 = 8192 tera
229 
230   As an example of ROW_EXTENT handling, assume a row with one integer
231   field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
232   big BLOB fields that we have updated.
233 
234   The record format for storing this into an empty file would be:
235 
236   Page 1:
237 
238   00 00 00 00 00 00 00          LSN
239   01                            Only one row in page
240   FF                            No free dir entry
241   xx xx                         Empty space on page
242 
243   10                            Flag: row split, VER_PTR exists
244   01 00 00 00 00 00             TRANSID 1
245   00 00 00 00 00 01 00          VER_PTR to first block in LOG file 1
246   5                             Number of row extents
247   02 00 00 00 00 03 00          VARCHAR's are stored in full pages 2,3,4
248   0                             No null fields
249   0                             No empty fields
250   05 00 00 00 00 00 80          Tail page for VARCHAR, rowid 0
251   06 00 00 00 00 80 00          First blob, stored at page 6-133
252   05 00 00 00 00 01 80          Tail of first blob (896 bytes) at page 5
253   86 00 00 00 00 80 00          Second blob, stored at page 134-262
254   05 00 00 00 00 02 80          Tail of second blob (896 bytes) at page 5
255   05 00                         5 integer
256   FA                            Length of first varchar field (size 250)
257   00 60                         Length of second varchar field (size 8192*3)
258   00 60 10                      First medium BLOB, 1M
259   01 00 10 00                   Second BLOB, 1M
260   xx xx xx xx xx xx             Varchars are stored here until end of page
261 
262   ..... until end of page
263 
264   09 00 F4 1F                   Start position 9, length 8180
265   xx xx xx xx			Checksum
266 
267   A data page is allowed to have a wrong CRC and header as long as it is
268   marked empty in the bitmap and its directory's count is 0.
269 */
270 
271 #include "maria_def.h"
272 #include "ma_blockrec.h"
273 #include "trnman.h"
274 #include "ma_trnman.h"
275 #include "ma_key_recover.h"
276 #include "ma_recovery_util.h"
277 #include <lf.h>
278 
/*
  Struct for having a cursor over a set of extents.
  This is used to loop over all extents for a row when reading
  the row data. It's also used to store the tail positions for
  a read row to be used by a later update/delete command.
*/

typedef struct st_maria_extent_cursor
{
  /*
    Pointer to packed uchar array of extents for the row.
    Format is described above in the header (one ROW_EXTENT is
    START_PAGE + PAGE_COUNT).
  */
  uchar *extent;
  /* Where data starts on page; Only for debugging */
  uchar *data_start;
  /* Position to all tails in the row. Updated when reading a row */
  MARIA_RECORD_POS *tail_positions;
  /* Current page */
  pgcache_page_no_t page;
  /* How many pages remain in the current page region */
  uint page_count;
  /* What kind of lock to use for tail pages */
  enum pagecache_page_lock lock_for_tail_pages;
  /* Total number of extents (i.e., entries in the 'extent' slot) */
  uint extent_count;
  /* <> 0 if current extent is a tail page; Set while using cursor */
  uint tail;
  /* Position (row number) for tail on tail page */
  uint tail_row_nr;
  /*
    == 1 if we are working on the first extent (i.e., the one that is stored in
    the row header, not an extent that is stored as part of the row data).
  */
  my_bool first_extent;
} MARIA_EXTENT_CURSOR;
315 
316 
317 /**
318    @brief Structure for passing down info to write_hook_for_clr_end().
   This hook needs to know the variation of the live checksum caused by the
320    current operation to update state.checksum under log's mutex,
321    needs to know the transaction's previous undo_lsn to set
322    trn->undo_lsn under log mutex, and needs to know the type of UNDO being
323    undone now to modify state.records under log mutex.
324 */
325 
/** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */
/*
  If the table uses checksums (S->calc_checksum set), evaluate expression E,
  store the result at P and add the stored size to L.  D is always reset to 0
  first so the caller can use it unconditionally afterwards.
*/
#define store_checksum_in_rec(S,D,E,P,L)        do      \
  {                                                     \
    D= 0;                                               \
    if ((S)->calc_checksum != NULL)                     \
    {                                                   \
      D= (E);                                           \
      ha_checksum_store(P, D);                          \
      L+= HA_CHECKSUM_STORE_SIZE;                       \
    }                                                   \
  } while (0)
337 
338 
339 static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
340 static my_bool delete_head_or_tail(MARIA_HA *info,
341                                    pgcache_page_no_t page, uint record_number,
342                                    my_bool head, my_bool from_update);
343 #ifndef DBUG_OFF
344 static void _ma_print_directory(MARIA_SHARE *share,
345                                 FILE *file, uchar *buff, uint block_size);
346 #endif
347 static uchar *store_page_range(MARIA_SHARE *share,
348                                uchar *to, MARIA_BITMAP_BLOCK *block,
349                                ulong length,
350                                uint *tot_ranges);
351 static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
352                                      LEX_CUSTRING *log_parts,
353                                      uint *log_parts_count);
354 static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
355                                      const uchar *newrec,
356                                      LEX_CUSTRING *log_parts,
357                                      uint *log_parts_count);
358 
359 /****************************************************************************
360   Initialization
361 ****************************************************************************/
362 
363 /*
364   Initialize data needed for block structures
365 */
366 
367 
/*
  Size of the different optional header elements for a row.
  Index i corresponds to bit i of the row 'Flag' uchar (see file header).
*/

static uchar header_sizes[]=
{
  TRANSID_SIZE,                                 /* Bit 0: TRANSID */
  VERPTR_SIZE,                                  /* Bit 1: VER_PTR */
  TRANSID_SIZE,                                 /* Bit 2: Delete transid */
  1                                             /* Bit 3: Null extends */
};
377 
378 /*
379   Calculate array of all used headers
380 
381   Used to speed up:
382 
383   size= 1;
384   if (flag & 1)
385     size+= TRANSID_SIZE;
386   if (flag & 2)
387     size+= VERPTR_SIZE;
388   if (flag & 4)
389     size+= TRANSID_SIZE
390   if (flag & 8)
391     size+= 1;
392 
393    NOTES
394      This is called only once at startup of Maria
395 */
396 
/*
  Lookup table mapping each combination of the optional-header flag bits to
  the total row header size.  Filled in once at startup by
  _ma_init_block_record_data().
*/
static uchar total_header_size[1 << array_elements(header_sizes)];
/* Mask to extract the bits of 'Flag' that are covered by the table above */
#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
399 
_ma_init_block_record_data(void)400 void _ma_init_block_record_data(void)
401 {
402   uint i;
403   bzero(total_header_size, sizeof(total_header_size));
404   total_header_size[0]= FLAG_SIZE;              /* Flag uchar */
405   for (i= 1; i < array_elements(total_header_size); i++)
406   {
407     uint size= FLAG_SIZE, j, bit;
408     for (j= 0; (bit= (1 << j)) <= i; j++)
409     {
410       if (i & bit)
411         size+= header_sizes[j];
412     }
413     total_header_size[i]= size;
414   }
415 }
416 
417 
_ma_once_init_block_record(MARIA_SHARE * share,File data_file)418 my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
419 {
420   my_bool res;
421   pgcache_page_no_t last_page;
422 
423   /*
424     First calculate the max file length with can have with a pointer of size
425     rec_reflength.
426 
427     The 'rec_reflength - 1' is because one byte is used for row
428     position withing the page.
429     The /2 comes from _ma_transaction_recpos_to_keypos() where we use
430     the lowest bit to mark if there is a transid following the rownr.
431   */
432   last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2;
433   if (!last_page)                                  /* Overflow; set max size */
434     last_page= ~(pgcache_page_no_t) 0;
435 
436   res= _ma_bitmap_init(share, data_file, &last_page);
437   share->base.max_data_file_length= _ma_safe_mul(last_page + 1,
438                                                  share->block_size);
439 #if SIZEOF_OFF_T == 4
440   set_if_smaller(share->base.max_data_file_length, INT_MAX32);
441 #endif
442   return res;
443 }
444 
445 
/*
  One-time (per share) cleanup of block record handling.

  Ends the bitmap, flushes and (for non-S3 tables) syncs and closes the
  bitmap file, and de-assigns the share's translog id.

  @return 0 ok, 1 on any flush/sync/close error
*/

my_bool _ma_once_end_block_record(MARIA_SHARE *share)
{
  int res= _ma_bitmap_end(share);
  if (share->bitmap.file.file >= 0)
  {
    /* When deleting the table we can throw away changed pages */
    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
                       share->deleting ? FLUSH_IGNORE_CHANGED : FLUSH_RELEASE))
      res= 1;
    /*
      File must be synced as it is going out of the maria_open_list and so
      becoming unknown to Checkpoint.
    */
    if (!share->s3_path)
    {
      if (share->now_transactional &&
          mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
        res= 1;
      if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
        res= 1;
    }
    /*
      Trivial assignment to guard against multiple invocations
      (May happen if file are closed but we want to keep the maria object
      around a bit longer)
    */
    share->bitmap.file.file= -1;
  }
  if (share->id != 0)
  {
    /*
      We de-assign the id even though index has not been flushed, this is ok
      as close_lock serializes us with a Checkpoint looking at our share.
    */
    translog_deassign_id_from_share(share);
  }
  return res;
}
483 
484 
/*
  Init info->cur_row structure

  Allocates, in one my_multi_malloc() block, all per-handler row buffers
  for both cur_row and new_row, plus log/update scratch areas, and the
  initial extents buffer and bitmap_blocks dynamic array.

  @return 0 ok, 1 allocation failure (already-allocated parts are freed)
*/

my_bool _ma_init_block_record(MARIA_HA *info)
{
  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
  MARIA_SHARE *share= info->s;
  myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  uint default_extents;
  DBUG_ENTER("_ma_init_block_record");

  /*
    One allocation for all row buffers; row->empty_bits is the head pointer
    of the group.  The '+ 2' on field_lengths leaves room for the length
    prefix used when logging (see below).
  */
  if (!my_multi_malloc(PSI_INSTRUMENT_ME, flag,
                       &row->empty_bits, share->base.pack_bytes,
                       &row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
                       &row->null_field_lengths, (sizeof(uint) *
                                                  (share->base.fields -
                                                   share->base.blobs +
                                                   EXTRA_LENGTH_FIELDS)),
                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
                                              (share->base.blobs + 2)),
                       &new_row->empty_bits, share->base.pack_bytes,
                       &new_row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &new_row->blob_lengths,
                       sizeof(ulong) * share->base.blobs,
                       &new_row->null_field_lengths, (sizeof(uint) *
                                                      (share->base.fields -
                                                       share->base.blobs +
                                                       EXTRA_LENGTH_FIELDS)),
                       &info->log_row_parts,
                       sizeof(*info->log_row_parts) *
                       (TRANSLOG_INTERNAL_PARTS + 3 +
                        share->base.fields + 3),
                       &info->update_field_data,
                       (share->base.fields * 4 +
                        share->base.max_field_lengths + 1 + 4),
                       NullS, 0))
    DBUG_RETURN(1);
  /* Skip over bytes used to store length of field length for logging */
  row->field_lengths+= 2;
  new_row->field_lengths+= 2;

  /* Reserve some initial space to avoid mallocs during execution */
  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
                    (AVERAGE_BLOB_SIZE /
                     FULL_PAGE_SIZE(share) /
                     BLOB_SEGMENT_MIN_SIZE));

  if (my_init_dynamic_array(PSI_INSTRUMENT_ME, &info->bitmap_blocks,
                            sizeof(MARIA_BITMAP_BLOCK),
                            default_extents, 64, flag))
    goto err;
  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
  if (!(info->cur_row.extents= my_malloc(PSI_INSTRUMENT_ME,
                                         info->cur_row.extents_buffer_length,
                                         flag)))
    goto err;

  info->row_base_length= share->base_length;
  info->row_flag= share->base.default_row_flag;

  /*
    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
  */
  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;

  DBUG_RETURN(0);

err:
  /* Frees everything allocated above that succeeded */
  _ma_end_block_record(info);
  DBUG_RETURN(1);
}
560 
561 
/*
  Free the per-handler buffers allocated by _ma_init_block_record().

  NOTE: cur_row.empty_bits is the first pointer of the my_multi_malloc()
  group allocated in _ma_init_block_record(); freeing it releases the
  whole group.
*/

void _ma_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_end_block_record");
  my_free(info->cur_row.empty_bits);
  delete_dynamic(&info->bitmap_blocks);
  my_free(info->cur_row.extents);
  my_free(info->blob_buff);
  /*
    The data file is closed, when needed, in ma_once_end_block_record().
    The following protects us from doing an extra, not allowed, close
    in maria_close()
  */
  info->dfile.file= -1;
  DBUG_VOID_RETURN;
}
577 
578 
579 /****************************************************************************
580   Helper functions
581 ****************************************************************************/
582 
/*
  Return the next unused position on the page after a directory entry.

  SYNOPSIS
    start_of_next_entry()
    dir		Directory entry to be used. This can not be the
                last entry on the page!

  RETURN
    #   Position in page where next entry starts.
        Everything between the '*dir' and this are free to be used.
*/
595 
start_of_next_entry(uchar * dir)596 static inline uint start_of_next_entry(uchar *dir)
597 {
598   uchar *prev;
599   /*
600      Find previous used entry. (There is always a previous entry as
601      the directory never starts with a deleted entry)
602   */
603   for (prev= dir - DIR_ENTRY_SIZE ;
604        prev[0] == 0 && prev[1] == 0 ;
605        prev-= DIR_ENTRY_SIZE)
606   {}
607   return (uint) uint2korr(prev);
608 }
609 
610 
611 /*
612   Return the offset where the previous entry ends (before on page)
613 
614   SYNOPSIS
615     end_of_previous_entry()
616     dir		Address for current directory entry
617     end         Address to last directory entry
618 
619   RETURN
620     #   Position where previous entry ends (smallest address on page)
621         Everything between # and current entry are free to be used.
622 */
623 
624 
end_of_previous_entry(MARIA_SHARE * share,uchar * dir,uchar * end)625 static inline uint end_of_previous_entry(MARIA_SHARE *share,
626                                          uchar *dir, uchar *end)
627 {
628   uchar *pos;
629   for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
630   {
631     uint offset;
632     if ((offset= uint2korr(pos)))
633       return offset + uint2korr(pos+2);
634   }
635   return PAGE_HEADER_SIZE(share);
636 }
637 
638 
639 #ifndef DBUG_OFF
640 
/*
  Debug helper: dump the page's row directory (position:length pairs) to
  'file'.  Also asserts that used entries appear in increasing page order.
  Only compiled when DBUG is enabled.
*/

static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size)
{
  uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uchar *dir, *end;

  /* 'dir' is the last entry (lowest address); 'end' the first entry */
  dir= dir_entry_pos(buff, block_size, max_entry-1);
  end= dir_entry_pos(buff, block_size, 0);

  DBUG_LOCK_FILE;                               /* If using DBUG_FILE */
  fprintf(file,"Directory dump (pos:length):\n");

  for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    /* offset == 0 marks a deleted entry; print length 0 for those */
    fprintf(file, "   %4u:%4u", offset, offset ? length : 0);
    if (!(row % (80/12)))
      fputc('\n', file);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      end_of_prev_row= offset + length;
    }
  }
  fputc('\n', file);
  fflush(file);
  DBUG_UNLOCK_FILE;
}
671 
672 
/*
  Debug check of a page's row directory (DBUG builds only).

  Verifies that:
  - used rows appear in increasing order without overlaps,
  - computed empty space matches the page header (or 'real_empty_size'
    when it is not (uint) -1),
  - the free-entry linked list is consistent and covers exactly the
    entries with position == 0.
*/

static void check_directory(MARIA_SHARE *share,
                            uchar *buff, uint block_size, uint min_row_length,
                            uint real_empty_size)
{
  uchar *dir, *end;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint start_of_dir, deleted;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint empty_size_on_page;
  uint empty_size;
  uchar free_entry, prev_free_entry;

  dir= dir_entry_pos(buff, block_size, max_entry-1);
  start_of_dir= (uint) (dir - buff);
  end= dir_entry_pos(buff, block_size, 0);
  deleted= empty_size= 0;

  empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size :
                       uint2korr(buff + EMPTY_SPACE_OFFSET));

  /* Ensure that all rows are in increasing order and no overlaps */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      DBUG_ASSERT(!length || length >= min_row_length);
      /* Gap between previous row's end and this row's start is empty */
      empty_size+= offset - end_of_prev_row;
      end_of_prev_row= offset + length;
    }
    else
      deleted++;                                /* Unused directory entry */
  }
  /* Space between last row and start of directory is also empty */
  empty_size+= start_of_dir - end_of_prev_row;
  DBUG_ASSERT(end_of_prev_row <= start_of_dir);
  DBUG_ASSERT(empty_size == empty_size_on_page);

  /* check free links */
  free_entry= buff[DIR_FREE_OFFSET];
  prev_free_entry= END_OF_DIR_FREE_LIST;
  while (free_entry != END_OF_DIR_FREE_LIST)
  {
    /* NOTE: this 'dir' intentionally shadows the outer one */
    uchar *dir= dir_entry_pos(buff, block_size, free_entry);
    DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);    /* Entry must be unused */
    DBUG_ASSERT(dir[2] == prev_free_entry);     /* Back link correct */
    prev_free_entry= free_entry;
    free_entry= dir[3];                         /* Follow forward link */
    deleted--;
  }
  /* Every deleted entry must be on the free list exactly once */
  DBUG_ASSERT(deleted == 0);
}
726 #else
727 #define check_directory(A,B,C,D,E)
728 #endif /* DBUG_OFF */
729 
730 
731 /**
732    @brief Calculate if there is enough entries on the page
733 */
734 
enough_free_entries(uchar * buff,uint block_size,uint wanted_entries)735 static my_bool enough_free_entries(uchar *buff, uint block_size,
736                                    uint wanted_entries)
737 {
738   uint entries= (uint) buff[DIR_COUNT_OFFSET];
739   uint needed_free_entries, free_entry;
740 
741   if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
742     return 1;
743 
744   /* Check if enough free entries in free list */
745   needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
746 
747   free_entry= (uint) buff[DIR_FREE_OFFSET];
748   while (free_entry != END_OF_DIR_FREE_LIST)
749   {
750     uchar *dir;
751     if (!--needed_free_entries)
752       return 1;
753     dir= dir_entry_pos(buff, block_size, free_entry);
754     free_entry= dir[3];
755   }
756   return 0;                                     /* Not enough entries */
757 }
758 
759 
760 /**
761    @brief Check if there is room for more rows on page
762 
763    @fn enough_free_entries_on_page
764 
765    @return 0    Directory is full
766    @return 1	There is room for more entries on the page
767 */
768 
enough_free_entries_on_page(MARIA_SHARE * share,uchar * page_buff)769 my_bool enough_free_entries_on_page(MARIA_SHARE *share,
770                                     uchar *page_buff)
771 {
772   enum en_page_type page_type;
773   page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
774                                   ~(uchar) PAGE_CAN_BE_COMPACTED);
775 
776   if (page_type == HEAD_PAGE)
777   {
778     uint row_count= (uint) page_buff[DIR_COUNT_OFFSET];
779     return !(row_count == MAX_ROWS_PER_PAGE &&
780              page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
781   }
782   return enough_free_entries(page_buff, share->block_size,
783                              1 + share->base.blobs);
784 }
785 
786 
787 /**
788    @brief Extend a record area to fit a given size block
789 
790    @fn extend_area_on_page()
791    @param info                  Handler
792    @param buff			Page buffer
793    @param dir			Pointer to dir entry in buffer
794    @param rownr			Row number we working on
795    @param block_size		Block size of buffer
796    @param request_length	How much data we want to put at [dir]
797    @param empty_space		Total empty space in buffer
798 			        This is updated with length after dir
799                                 is allocated and current block freed
800    @param head_page		1 if head page, 0 for tail page
801 
802   @implementation
803     The logic is as follows (same as in _ma_update_block_record())
804     - If new data fits in old block, use old block.
805     - Extend block with empty space before block. If enough, use it.
806     - Extend block with empty space after block. If enough, use it.
807     - Use _ma_compact_block_page() to get all empty space at dir.
808 
809   @note
810     The given directory entry is set to rec length.
811     empty_space doesn't include the new directory entry
812 
813 
  @return
  @retval 0   ok
              *ret_offset is set to the offset of the found area
              *ret_length is set to the length of the found area
              [dir] is updated with the new offset and length
  @retval 1   error (wrong info in block)
821 */
822 
static my_bool extend_area_on_page(MARIA_HA *info,
                                   uchar *buff, uchar *dir,
                                   uint rownr,
                                   uint request_length,
                                   uint *empty_space, uint *ret_offset,
                                   uint *ret_length,
                                   my_bool head_page)
{
  uint rec_offset, length, org_rec_length;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  DBUG_ENTER("extend_area_on_page");

  /*
    We can't check for min length here as we may have called
    extend_directory() to create a new (empty) entry just before
  */
  check_directory(share, buff, block_size, 0, *empty_space);

  rec_offset= uint2korr(dir);
  if (rec_offset)
  {
    /* Extending old row;  Mark current space as 'free' */
    length= org_rec_length= uint2korr(dir + 2);
    DBUG_PRINT("info", ("rec_offset: %u  length: %u  request_length: %u  "
                        "empty_space: %u",
                        rec_offset, org_rec_length, request_length,
                        *empty_space));

    *empty_space+= org_rec_length;
  }
  else
  {
    /* Reusing free directory entry; Free it from the directory list */
    if (dir[2] == END_OF_DIR_FREE_LIST)
      buff[DIR_FREE_OFFSET]= dir[3];            /* Was head of free list */
    else
    {
      /* Unlink from previous free entry (dir[2] is the back link) */
      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
      prev_dir[3]= dir[3];
    }
    if (dir[3] != END_OF_DIR_FREE_LIST)
    {
      /* Fix back link of next free entry to skip this one */
      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
      next_dir[2]= dir[2];
    }
    rec_offset= start_of_next_entry(dir);
    length= 0;
  }
  if (length < request_length)
  {
    uint old_rec_offset;
    /*
      New data did not fit in old position.
      Find first possible position where to put new data.
    */
    old_rec_offset= rec_offset;
    rec_offset= end_of_previous_entry(share,
                                      dir, buff + block_size -
                                      PAGE_SUFFIX_SIZE);
    /* Claim the empty space between previous row and this one */
    length+= (uint) (old_rec_offset - rec_offset);
    DBUG_ASSERT(old_rec_offset);
    /*
      'length' is 0 if we are doing an insert into a not allocated block.
      This can only happen during "REDO of INSERT" or "UNDO of DELETE"
    */
    if (length < request_length)
    {
      /*
        Did not fit in current block + empty space. Extend with
        empty space after block.
      */
      if (rownr == max_entry - 1)
      {
        /* Last entry; Everything is free between this and directory */
        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
                 rec_offset);
      }
      else
        length= start_of_next_entry(dir) - rec_offset;
      DBUG_ASSERT((int) length >= 0);
      if (length < request_length)
      {
        /* Not enough continuous space, compact page to get more */
        int2store(dir, rec_offset);
        /* Reset length, as this may be a deleted block */
        int2store(dir+2, 0);
        _ma_compact_block_page(share,
                               buff, rownr, 1,
                               head_page ? info->trn->min_read_from: 0,
                               head_page ? share->base.min_block_length : 0);
        /* After compaction our entry covers all remaining free space */
        rec_offset= uint2korr(dir);
        length=     uint2korr(dir+2);
        if (length < request_length)
        {
          DBUG_PRINT("error", ("Not enough space: "
                               "length: %u  request_length: %u",
                               length, request_length));
          _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
          DBUG_RETURN(1);                       /* Error in block */
        }
        *empty_space= length;                   /* All space is here */
      }
    }
  }
  /* Record the area we decided on in the directory entry and out params */
  int2store(dir, rec_offset);
  int2store(dir + 2, length);
  *ret_offset= rec_offset;
  *ret_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space - length);
  DBUG_RETURN(0);
}
942 
943 
944 /**
945    @brief Copy not changed fields from 'from' to 'to'
946 
947    @notes
948    Assumption is that most fields are not changed!
949    (Which is why we don't test if all bits are set for some bytes in bitmap)
950 */
951 
void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
                             uchar *to, uchar *from)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column= share->columndef;
  MARIA_COLUMNDEF *end_column= column + share->base.fields;
  uchar *bitmap_pos= (uchar*) changed_fields->bitmap;
  uint mask= 1;

  /* One bit per field; a zero bit means the field was not changed */
  for (; column < end_column; column++)
  {
    if (!(*bitmap_pos & mask))
    {
      uint copy_length= column->length;
      /* For VARCHAR, copy only the length prefix plus the actual data */
      if (column->type == FIELD_VARCHAR)
        copy_length= (column->fill_length == 1 ?
                      (uint) from[column->offset] + 1 :
                      uint2korr(from + column->offset) + 2);
      memcpy(to + column->offset, from + column->offset, copy_length);
    }
    mask<<= 1;
    if (mask == 256)                    /* Consumed 8 bits; next byte */
    {
      mask= 1;
      bitmap_pos++;
    }
  }
}
982 
983 #ifdef NOT_YET_NEEDED
984 /* Calculate empty space on a page */
985 
empty_space_on_page(uchar * buff,uint block_size)986 static uint empty_space_on_page(uchar *buff, uint block_size)
987 {
988   enum en_page_type;
989   page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
990                                   ~(uchar) PAGE_CAN_BE_COMPACTED);
991   if (page_type == UNALLOCATED_PAGE)
992     return block_size;
993   if ((uint) page_type <= TAIL_PAGE)
994     return uint2korr(buff+EMPTY_SPACE_OFFSET);
995   return 0;                                     /* Blob page */
996 }
997 #endif
998 
999 
1000 /*
1001   @brief Ensure we have space for new directory entries
1002 
1003   @fn make_space_for_directory()
1004   @param info		Handler
1005   @param buff		Page buffer
1006   @param max_entry	Number of current entries in directory
1007   @param count		Number of new entries to be added to directory
1008   @param first_dir	First directory entry on page
1009   @param empty_space    Total empty space in buffer. It's updated
1010 			to reflect the new empty space
1011   @param first_pos      Store position to last data byte on page here
1012   @param head_page	1 if head page, 0 for tail page.
1013 
1014   @note
1015   This function is inline as the argument passing is the biggest
1016   part of the function
1017 
1018   @return
1019   @retval 0  ok
1020   @retval 1  error (No data on page, fatal error)
1021 */
1022 
static inline my_bool
make_space_for_directory(MARIA_HA *info,
                         uchar *buff, uint max_entry,
                         uint count, uchar *first_dir, uint *empty_space,
                         uint *first_pos,
                         my_bool head_page)
{
  uint length_needed= DIR_ENTRY_SIZE * count;
  MARIA_SHARE *share= info->s;

  /*
    max_entry is 0 only when an UNDO is used to reinsert a row on a
    previously not used page
  */
  if (likely(max_entry))
  {
    /* Check if there is place for the directory entry on the page */
    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);

    if ((uint) (first_dir - buff) < *first_pos + length_needed)
    {
      /* Create place for directory */
      _ma_compact_block_page(share,
                             buff, max_entry - 1, 0,
                             head_page ? info->trn->min_read_from : 0,
                             head_page ? share->base.min_block_length : 0);
      /* Re-read positions; compaction moved the data */
      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (*empty_space < length_needed)
      {
        /*
          We should always have space, as we only come here for
          UNDO of DELETE (in which case we know the row was on the
          page before) or if the bitmap told us there was space on page
        */
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        return(1);
      }
    }
  }
  else
    *first_pos= PAGE_HEADER_SIZE(share);        /* Empty page: data starts after header */

  /* Reduce directory entry size from free space size */
  (*empty_space)-= length_needed;
  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
  return(0);
}
1071 
1072 
1073 /*
1074   Find free position in directory
1075 
1076   SYNOPSIS
1077   find_free_position()
1078     info                Handler
1079     buff                Page
1080     block_size          Size of page
1081     res_rownr           Store index to free position here
1082     res_length		Store length of found segment here
1083     empty_space		Store length of empty space on disk here. This is
1084 		        all empty space, including the found block.
1085   @param head_page	1 if head page, 0 for tail page.
1086 
1087   NOTES
1088     If there is a free directory entry (entry with position == 0),
1089     then use it and change it to be the size of the empty block
1090     after the previous entry. This guarantees that all row entries
1091     are stored on disk in inverse directory order, which makes life easier for
1092     '_ma_compact_block_page()' and to know if there is free space after any
1093     block.
1094 
1095     If there is no free entry (entry with position == 0), then we create
1096     a new one. If there is not space for the directory entry (because
    the last block overlaps with the directory), we compact the page.
1098 
1099     We will update the offset and the length of the found dir entry to
1100     match the position and empty space found.
1101 
1102     buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
1103 
    See start of file for description of how free directory entries are linked
1105 
1106   RETURN
1107     0      Error (directory full or last block goes over directory)
1108     #      Pointer to directory entry on page
1109 */
1110 
static uchar *find_free_position(MARIA_HA *info,
                                 uchar *buff, uint block_size, uint *res_rownr,
                                 uint *res_length, uint *empty_space,
                                 my_bool head_page)
{
  uint max_entry, free_entry;
  uint length, first_pos;
  uchar *dir, *first_dir;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("find_free_position");

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  free_entry= (uint) buff[DIR_FREE_OFFSET];
  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

  DBUG_PRINT("info", ("max_entry: %u  free_entry: %u", max_entry, free_entry));

  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);

  /* Search after first free position */
  if (free_entry != END_OF_DIR_FREE_LIST)
  {
    if (free_entry >= max_entry)
      DBUG_RETURN(0);                           /* Consistency error */
    dir= dir_entry_pos(buff, block_size, free_entry);
    /* Head of free list must have offset 0 and no back link */
    DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
    /* Relink free list */
    if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
    {
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT((uint) next_entry[2] == free_entry &&
                  uint2korr(next_entry) == 0);
      next_entry[2]= END_OF_DIR_FREE_LIST;      /* Backlink */
    }

    /* The reused entry covers the gap between its neighbour rows */
    first_pos= end_of_previous_entry(share,
                                     dir, buff + block_size -
                                     PAGE_SUFFIX_SIZE);
    length= start_of_next_entry(dir) - first_pos;
    int2store(dir, first_pos);                /* Update dir entry */
    int2store(dir + 2, 0);
    *res_rownr= free_entry;
    *res_length= length;

    check_directory(share, buff, block_size,
                    head_page ? share->base.min_block_length : 0, (uint) -1);
    DBUG_RETURN(dir);
  }
  /* No free places in dir; create a new one */

  /* Check if there is place for the directory entry */
  if (max_entry == MAX_ROWS_PER_PAGE)
    DBUG_RETURN(0);

  if (make_space_for_directory(info, buff, max_entry, 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(0);

  /* New entry is added below the current last entry in the directory */
  dir= first_dir - DIR_ENTRY_SIZE;
  length= (uint) (dir - buff - first_pos);
  DBUG_ASSERT(length <= *empty_space);
  int2store(dir, first_pos);
  int2store(dir + 2, 0);                      /* Max length of region */
  *res_rownr= max_entry;
  *res_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space);
  DBUG_RETURN(dir);
}
1183 
1184 
1185 /**
1186    @brief Enlarge page directory to hold more entries
1187 
1188    @fn extend_directory()
1189    @param info          Handler
1190    @param buff		Page buffer
1191    @param block_size	Block size
1192    @param max_entry	Number of directory entries on page
1193    @param new_entry	Position for new entry
1194    @param empty_space	Total empty space in buffer. It's updated
1195 			to reflect the new empty space
1196    @param head_page	1 if head page, 0 for tail page.
1197 
1198    @note
1199    This is only called on UNDO when we want to expand the directory
1200    to be able to re-insert row in a given position
1201 
1202    The new directory entry will be set to cover the maximum possible space
1203 
1204    @return
1205    @retval 0  ok
1206    @retval 1  error (No data on page, fatal error)
1207 */
1208 
static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
                                uint max_entry, uint new_entry,
                                uint *empty_space, my_bool head_page)
{
  uint length, first_pos;
  uchar *dir, *first_dir;
  DBUG_ENTER("extend_directory");

  /*
    Note that if max_entry is 0, then first_dir will point to
    an illegal directory entry. This is ok, as in this case we will
    not access anything through first_dir.
  */
  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;

  if (make_space_for_directory(info, buff, max_entry,
                               new_entry - max_entry + 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(1);

  /* Set the new directory entry to cover the max possible length */
  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
  length= (uint) (dir - buff - first_pos);
  int2store(dir, first_pos);
  int2store(dir+2, length);
  *empty_space-= length;

  if (new_entry-- > max_entry)
  {
    /* Link all row entries between new_entry and max_entry into free list */
    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
    uint prev_entry= END_OF_DIR_FREE_LIST;
    buff[DIR_FREE_OFFSET]= new_entry;
    do
    {
      /* Each freed entry: offset 0, dir[2]= back link, dir[3]= forward link */
      dir+= DIR_ENTRY_SIZE;
      dir[0]= dir[1]= 0;
      dir[2]= (uchar) prev_entry;
      dir[3]= (uchar) new_entry-1;
      prev_entry= new_entry;
    } while (new_entry-- > max_entry);
    /* Last freed entry points to the old head of the free list */
    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= max_entry;
    }
  }

  check_directory(info->s,
                  buff, block_size,
                  head_page ? MY_MIN(info->s->base.min_block_length, length) :
                  0, *empty_space);
  DBUG_RETURN(0);
}
1266 
1267 
1268 /****************************************************************************
1269   Updating records
1270 ****************************************************************************/
1271 
1272 /*
1273   Calculate length of all the different field parts
1274 
1275   SYNOPSIS
1276     calc_record_size()
1277     info	Maria handler
1278     record      Row to store
1279     row		Store statistics about row here
1280 
1281   NOTES
1282     The statistics is used to find out how much space a row will need
1283     and also where we can split a row when we need to split it into several
1284     extents.
1285 */
1286 
static void calc_record_size(MARIA_HA *info, const uchar *record,
                             MARIA_ROW *row)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data;
  MARIA_COLUMNDEF *column, *end_column;
  uint *null_field_lengths= row->null_field_lengths;
  ulong *blob_lengths= row->blob_lengths;
  DBUG_ENTER("calc_record_size");

  row->normal_length= row->char_length= row->varchar_length=
    row->blob_length= row->extents_count= 0;

  /* Create empty bitmap and calculate length of each varlength/char field */
  bzero(row->empty_bits, share->base.pack_bytes);
  field_length_data= row->field_lengths;
  /* Fixed not-null fields come first and need no per-field statistics */
  for (column= share->columndef + share->base.fixed_not_null_fields,
       end_column= share->columndef + share->base.fields;
       column < end_column; column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit))
    {
      /* NULL field; takes no data space */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
      DBUG_ASSERT(column->empty_bit == 0);
      /* fall through */
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      row->normal_length+= column->length;
      *null_field_lengths= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      /* All-zero value is stored as just a bit in empty_bits */
      if (memcmp(record+ column->offset, maria_zero_string,
                 column->length) == 0)
      {
        row->empty_bits[column->empty_pos] |= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        row->normal_length+= column->length;
        *null_field_lengths= column->length;
      }
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Strip trailing spaces; store only the remaining prefix */
      const uchar *pos, *end;
      for (pos= record + column->offset, end= pos + column->length;
           end > pos && end[-1] == ' '; end--)
        ;
      if (pos == end)                           /* If empty string */
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        uint length= (uint) (end - pos);
        /* Length prefix is 1 or 2 bytes depending on max field length */
        if (column->length <= 255)
          *field_length_data++= (uchar) length;
        else
        {
          int2store(field_length_data, length);
          field_length_data+= 2;
        }
        row->char_length+= length;
        *null_field_lengths= length;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length, field_length_data_length;
      const uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      field_length_data[0]= field_pos[0];
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_pos;
        field_length_data_length= 1;
      }
      else
      {
        length= uint2korr(field_pos);
        field_length_data[1]= field_pos[1];
        field_length_data_length= 2;
      }
      *null_field_lengths= length;
      if (!length)
      {
        /* Empty varchar is stored only as a bit in empty_bits */
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        break;
      }
      row->varchar_length+= length;
      *null_field_lengths= length;
      field_length_data+= field_length_data_length;
      break;
    }
    case FIELD_BLOB:
    {
      const uchar *field_pos= record + column->offset;
      /* Blob column stores a length prefix followed by a pointer */
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

      *blob_lengths++= blob_length;
      if (!blob_length)
        row->empty_bits[column->empty_pos]|= column->empty_bit;
      else
      {
        row->blob_length+= blob_length;
        memcpy(field_length_data, field_pos, size_length);
        field_length_data+= size_length;
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
  /*
    - info->row_base_length is base information we must have on a page in first
      extent:
      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
        table_checksum (0 | 1)
    - row->min_length is minimum amount of data we must store on
      a page. bitmap code will ensure we get at least this much +
      total number of extents and one extent information
    - fixed_not_null_fields_length is length of fixed length fields that can't
      be compacted
    - head_length is the amount of data for the head page
     (ie, all fields except blobs)
  */
  row->min_length=   (info->row_base_length +
                      (share->base.max_field_lengths ?
                       size_to_store_key_length(row->field_lengths_length) :
                       0));
  row->head_length= (row->min_length +
                     share->base.fixed_not_null_fields_length +
                     row->field_lengths_length +
                     row->normal_length +
                     row->char_length + row->varchar_length);
  row->total_length= (row->head_length + row->blob_length);
  /* A row always occupies at least the table's minimum block length */
  if (row->total_length < share->base.min_block_length)
    row->total_length= share->base.min_block_length;
  DBUG_PRINT("exit", ("head_length: %lu  total_length: %lu",
                      (ulong) row->head_length, (ulong) row->total_length));
  DBUG_VOID_RETURN;
}
1444 
1445 
1446 /**
1447   Compact page by removing all space between rows
1448 
1449   Moves up all rows to start of page. Moves blocks that are directly after
1450   each other with one memmove.
1451 
1452   @note if rownr is the last row in the page, and extend_block is false,
1453   caller has to make sure to update bitmap page afterwards to reflect freed
1454   space.
1455 
1456   @param  buff          Page to compact
1457   @param  block_size    Size of page
1458   @param  rownr         Put empty data after this row
1459   @param  extend_block	If 1, extend the block at 'rownr' to cover the
1460                         whole block.
1461   @param  min_read_from If <> 0, remove all trid's that are less than this
1462 */
1463 
_ma_compact_block_page(MARIA_SHARE * share,uchar * buff,uint rownr,my_bool extend_block,TrID min_read_from,uint min_row_length)1464 void _ma_compact_block_page(MARIA_SHARE *share,
1465                             uchar *buff, uint rownr,
1466                             my_bool extend_block, TrID min_read_from,
1467                             uint min_row_length)
1468 {
1469   uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
1470   uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
1471   uint freed_size= 0;
1472   uint block_size= share->block_size;
1473   uchar *dir, *end;
1474   DBUG_ENTER("_ma_compact_block_page");
1475   DBUG_PRINT("enter", ("rownr: %u  min_read_from: %lu", rownr,
1476                        (ulong) min_read_from));
1477   DBUG_ASSERT(max_entry > 0 &&
1478               max_entry < (block_size - PAGE_HEADER_SIZE(share) -
1479                            PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
1480 
1481   /* Move all entries before and including rownr up to start of page */
1482   dir= dir_entry_pos(buff, block_size, rownr);
1483   end= dir_entry_pos(buff, block_size, 0);
1484   page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share);
1485   diff= 0;
1486   for (; dir <= end ; end-= DIR_ENTRY_SIZE)
1487   {
1488     uint offset= uint2korr(end);
1489 
1490     if (offset)
1491     {
1492       uint row_length= uint2korr(end + 2);
1493       DBUG_ASSERT(offset >= page_pos);
1494       DBUG_ASSERT(buff + offset + row_length <= dir);
1495       DBUG_ASSERT(row_length >= min_row_length || row_length == 0);
1496 
1497       /* Row length can be zero if row is to be deleted */
1498       if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
1499       {
1500         TrID transid= transid_korr(buff+offset+1);
1501         if (transid < min_read_from)
1502         {
1503           /* Remove transid from row by moving the start point of the row up */
1504           buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1505           offset+= TRANSID_SIZE;
1506           freed_size+= TRANSID_SIZE;
1507           row_length-= TRANSID_SIZE;
1508           int2store(end+2, row_length);
1509         }
1510       }
1511 
1512       if (offset != next_free_pos)
1513       {
1514         uint length= (next_free_pos - start_of_found_block);
1515         /*
1516           There was empty space before this and prev block
1517           Check if we have to move previous block up to page start
1518         */
1519         if (page_pos != start_of_found_block)
1520         {
1521           /* move up previous block */
1522           memmove(buff + page_pos, buff + start_of_found_block, length);
1523         }
1524         page_pos+= length;
1525         /* next continuous block starts here */
1526         start_of_found_block= offset;
1527         diff= offset - page_pos;
1528       }
1529       int2store(end, offset - diff);            /* correct current pos */
1530       next_free_pos= offset + row_length;
1531 
1532       if (unlikely(row_length < min_row_length) && row_length)
1533       {
1534         /*
1535           This can only happen in the case we compacted transid and
1536           the row become 'too short'
1537 
1538           Move the current row down to it's right place and extend it
1539           with 0.
1540         */
1541         uint row_diff= min_row_length - row_length;
1542         uint length= (next_free_pos - start_of_found_block);
1543 
1544         DBUG_ASSERT(page_pos != start_of_found_block);
1545         bmove(buff + page_pos, buff + start_of_found_block, length);
1546         bzero(buff+ page_pos + length, row_diff);
1547         page_pos+= min_row_length;
1548         int2store(end+2, min_row_length);
1549         freed_size-= row_diff;
1550         next_free_pos= start_of_found_block= page_pos;
1551         diff= 0;
1552       }
1553     }
1554   }
1555   if (page_pos != start_of_found_block)
1556   {
1557     uint length= (next_free_pos - start_of_found_block);
1558     memmove(buff + page_pos, buff + start_of_found_block, length);
1559   }
1560   start_of_found_block= uint2korr(dir);
1561 
1562   if (rownr != max_entry - 1)
1563   {
1564     /* Move all entries after rownr to end of page */
1565     uint rownr_length;
1566 
1567     DBUG_ASSERT(extend_block);                  /* Should always be true */
1568     next_free_pos= end_of_found_block= page_pos=
1569       block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
1570     diff= 0;
1571     /* End points to entry before 'rownr' */
1572     for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
1573     {
1574       uint offset= uint2korr(dir);
1575       uint row_length;
1576       uint row_end;
1577       if (!offset)
1578         continue;
1579       row_length= uint2korr(dir + 2);
1580       row_end= offset + row_length;
1581       DBUG_ASSERT(offset >= start_of_found_block &&
1582                   row_end <= next_free_pos && row_length >= min_row_length);
1583 
1584       if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
1585       {
1586         TrID transid= transid_korr(buff + offset+1);
1587         if (transid < min_read_from)
1588         {
1589           /* Remove transid from row */
1590           buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1591           offset+= TRANSID_SIZE;
1592           row_length-= TRANSID_SIZE;
1593           int2store(dir+2, row_length);
1594         }
1595         if (unlikely(row_length < min_row_length))
1596         {
1597           /*
1598             This can only happen in the case we compacted transid and
1599             the row become 'too short'
1600           */
1601           uint row_diff= min_row_length - row_length;
1602           if (next_free_pos < row_end + row_diff)
1603           {
1604             /*
1605               Not enough space for extending next block with enough
1606               end 0's. Move current data down to get place for them
1607             */
1608             uint move_down= row_diff - (next_free_pos - row_end);
1609             bmove(buff + offset - move_down, buff + offset, row_length);
1610             offset-= move_down;
1611           }
1612           /*
1613             Extend the next block with 0, which will be part of current
1614             row when the blocks are joined together later
1615           */
1616           bzero(buff + next_free_pos - row_diff, row_diff);
1617           next_free_pos-= row_diff;
1618           int2store(dir+2, min_row_length);
1619         }
1620         row_end= offset + row_length;
1621       }
1622 
1623       if (row_end != next_free_pos)
1624       {
1625         uint length= (end_of_found_block - next_free_pos);
1626         if (page_pos != end_of_found_block)
1627         {
1628           /* move next block down */
1629           memmove(buff + page_pos - length, buff + next_free_pos, length);
1630         }
1631         page_pos-= length;
1632         /* next continuous block starts here */
1633         end_of_found_block= row_end;
1634         diff= page_pos - row_end;
1635       }
1636       int2store(dir, offset + diff);            /* correct current pos */
1637       next_free_pos= offset;
1638     }
1639     if (page_pos != end_of_found_block)
1640     {
1641       uint length= (end_of_found_block - next_free_pos);
1642       memmove(buff + page_pos - length, buff + next_free_pos, length);
1643       next_free_pos= page_pos- length;
1644     }
1645 
1646     /* Extend rownr block to cover hole */
1647     rownr_length= next_free_pos - start_of_found_block;
1648     int2store(dir+2, rownr_length);
1649     DBUG_ASSERT(rownr_length >= min_row_length);
1650   }
1651   else
1652   {
1653     if (extend_block)
1654     {
1655       /* Extend last block to cover whole page */
1656       uint length= ((uint) (dir - buff) - start_of_found_block);
1657       int2store(dir+2, length);
1658       DBUG_ASSERT(length >= min_row_length);
1659     }
1660     else
1661     {
1662       /* Add length gained from freed transaction id's to this page */
1663       uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
1664       int2store(buff + EMPTY_SPACE_OFFSET, length);
1665     }
1666     buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
1667   }
1668   check_directory(share, buff, block_size, min_row_length,
1669                   extend_block ? 0 : (uint) -1);
1670   DBUG_EXECUTE("directory", _ma_print_directory(share,
1671                                                 DBUG_FILE, buff, block_size););
1672   DBUG_VOID_RETURN;
1673 }
1674 
1675 
1676 /*
1677   Create an empty tail or head page
1678 
1679   SYNOPSIS
1680     make_empty_page()
1681     buff		Page buffer
1682     block_size		Block size
1683     page_type		HEAD_PAGE or TAIL_PAGE
    create_dir_entry	TRUE if we should create a directory entry
1685 
1686   NOTES
1687     EMPTY_SPACE is not updated
1688 */
1689 
static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
                            my_bool create_dir_entry)
{
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint header_size= PAGE_HEADER_SIZE(share);
  DBUG_ENTER("make_empty_page");

  /* Clear the page header */
  bzero(buff, header_size);

#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
  /*
    Zero the rest of the block to avoid writing old memory content to
    disk and to let the file compress better if archived.
    The code does not assume the block is zeroed.
  */
  if (page_type != BLOB_PAGE)
    bzero(buff + header_size, block_size - header_size);
#endif

  buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
  buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
  buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;

  if (create_dir_entry)
  {
    /* Create a directory entry pointing to start of page, with size 0 */
    uchar *dir= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
    int2store(dir, header_size);
    int2store(dir + 2, 0);
  }
  DBUG_VOID_RETURN;
}
1720 
1721 
1722 /*
1723   Read or initialize new head or tail page
1724 
1725   SYNOPSIS
1726     get_head_or_tail_page()
1727     info                        Maria handler
1728     block                       Block to read
1729     buff                        Suggest this buffer to key cache
1730     length                      Minimum space needed
1731     page_type			HEAD_PAGE || TAIL_PAGE
1732     res                         Store result position here
1733 
1734   NOTES
    We don't decrement buff[EMPTY_SPACE_OFFSET] by the allocated data
    length, as we don't know how much data the caller will actually use.
1737 
1738     res->empty_space is set to length of empty space
1739 
1740   RETURN
1741     0  ok     All slots in 'res' are updated
1742     1  error  my_errno is set
1743 */
1744 
/*
  Describes where a row (or row part) is to be placed on a head or tail
  page; filled in by get_head_or_tail_page() and
  get_rowpos_in_head_or_tail_page().
*/
struct st_row_pos_info
{
  uchar *buff;                                  /* page buffer */
  uchar *data;                                  /* Place for data */
  uchar *dir;                                   /* Directory entry for the row */
  uint length;                                  /* Length for data */
  uint rownr;                                   /* Offset in directory */
  uint empty_space;                             /* Space left on page */
};
1754 
1755 
static my_bool get_head_or_tail_page(MARIA_HA *info,
                                     const MARIA_BITMAP_BLOCK *block,
                                     uchar *buff, uint length, uint page_type,
                                     enum pagecache_page_lock lock,
                                     struct st_row_pos_info *res)
{
  uint block_size;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("get_head_or_tail_page");
  DBUG_PRINT("enter", ("page_type: %u  length: %u", page_type, length));

  block_size= share->block_size;
  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; initialize it with one directory entry of size 0 */
    make_empty_page(info, buff, page_type, 1);
    res->buff= buff;
    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share));
    res->data= (buff + PAGE_HEADER_SIZE(share));
    res->dir= res->data + res->length;
    res->rownr= 0;
    DBUG_ASSERT(length <= res->length);
  }
  else
  {
    uchar *dir;
    /* Read old page; the pinned page is tracked in info->pinned_pages */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    res->buff= pagecache_read(share->pagecache, &info->dfile,
                              block->page, 0, 0, share->page_type,
                              lock, &page_link.link);
    page_link.changed= res->buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)
      goto crashed;                             /* Read error */

    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                page_type);
    if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr,
                                  &res->length, &res->empty_space,
                                  page_type == HEAD_PAGE)))
      goto crashed;

    if (res->length < length)
    {
      /*
        The free slot we found is too short, but the page as a whole has
        enough free space: compact the page to collect all empty space
        after the current position.
      */
      if (res->empty_space + res->length >= length)
      {
        _ma_compact_block_page(share,
                               res->buff, res->rownr, 1,
                               (page_type == HEAD_PAGE ?
                                info->trn->min_read_from : 0),
                               (page_type == HEAD_PAGE ?
                                share->base.min_block_length :
                                0));
        /* All empty space are now after current position */
        dir= dir_entry_pos(res->buff, block_size, res->rownr);
        res->length= res->empty_space= uint2korr(dir+2);
      }
      if (res->length < length)
      {
        DBUG_PRINT("error", ("length: %u  res->length: %u  empty_space: %u",
                             length, res->length, res->empty_space));
        goto crashed;                         /* Wrong bitmap information */
      }
    }
    res->dir= dir;
    res->data= res->buff + uint2korr(dir);
  }
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1832 
1833 
1834 /*
1835   @brief Create room for a head or tail row on a given page at given position
1836 
1837   @fn get_rowpos_in_head_or_tail_page()
1838   @param info                        Maria handler
1839   @param block                       Block to read
1840   @param buff                        Suggest this buffer to key cache
1841   @param length                      Minimum space needed
1842   @param page_type	             HEAD_PAGE || TAIL_PAGE
1843   @param rownr			     Rownr to use
1844   @param res                         Store result position here
1845 
1846   @note
    This is essentially the same as get_head_or_tail_page, with the
    difference that the caller specifies at what position the row should
    be put. This is used when restoring a row to its original position as
    part of UNDO DELETE or UNDO UPDATE
1851 
1852   @return
1853   @retval 0  ok     All slots in 'res' are updated
1854   @retval 1  error  my_errno is set
1855 */
1856 
static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
                                               const MARIA_BITMAP_BLOCK *block,
                                               uchar *buff, uint length,
                                               uint page_type,
                                               enum pagecache_page_lock lock,
                                               uint rownr,
                                               struct st_row_pos_info *res)
{
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *dir;
  uint block_size= share->block_size;
  uint max_entry, max_length, rec_offset;
  DBUG_ENTER("get_rowpos_in_head_or_tail_page");

  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; no directory entry created here (done below if needed) */
    make_empty_page(info, buff, page_type, 0);
    res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
  }
  else
  {
    /* Read existing page; pinned page is tracked in info->pinned_pages */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         block->page, 0, 0, share->page_type,
                         lock, &page_link.link);
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)                     /* Read error */
      goto err;
    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                (uchar) page_type);
    /* Runtime re-check of the assert above: treat mismatch as corruption */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
      goto err;
    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  }

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  if (max_entry <= rownr)
  {
    /* Directory is too small to address rownr; grow it */
    if (extend_directory(info, buff, block_size,
                         max_entry, rownr, &res->empty_space,
                         page_type == HEAD_PAGE))
      goto err;
  }

  /*
    The following dir entry is unused in case of insert / update but
    not in case of undo_update / undo_delete
  */
  dir= dir_entry_pos(buff, block_size, rownr);

  if (extend_area_on_page(info, buff, dir, rownr, length,
                          &res->empty_space, &rec_offset, &max_length,
                          page_type == HEAD_PAGE))
    goto err;

  res->buff= buff;
  res->rownr= rownr;
  res->dir= dir;
  res->data= buff + rec_offset;
  res->length= length;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1927 
1928 
1929 /*
1930   Write tail for head data or blob
1931 
1932   SYNOPSIS
1933     write_tail()
1934     info                Maria handler
1935     block               Block to tail page
1936     row_part            Data to write to page
1937     length              Length of data
1938 
1939   NOTES
1940     block->page_count is updated to the directory offset for the tail
1941     so that we can store the position in the row extent information
1942 
1943   RETURN
1944     0  ok
1945        block->page_count is set to point (dir entry + TAIL_BIT)
1946 
1947     1  error; In this case my_errno is set to the error
1948 */
1949 
static my_bool write_tail(MARIA_HA *info,
                          MARIA_BITMAP_BLOCK *block,
                          uchar *row_part, uint org_length)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  uint block_size= share->block_size, empty_space, length= org_length;
  struct st_row_pos_info row_pos;
  my_off_t position;
  my_bool res, block_is_read;
  DBUG_ENTER("write_tail");
  DBUG_PRINT("enter", ("page: %lu  length: %u",
                       (ulong) block->page, length));

  info->keyread_buff_used= 1;
  /*
    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
    some place to grow in the future)
  */
  if (length < MIN_TAIL_SIZE)
    length= MIN_TAIL_SIZE;

  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
  {
    /*
      Create new tail
      page will be pinned & locked by get_head_or_tail_page
    */
    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
                              &row_pos))
      DBUG_RETURN(1);
  }
  else
  {
    /* Write tail on predefined row position */
    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
                                        length, TAIL_PAGE,
                                        PAGECACHE_LOCK_WRITE,
                                        block->page_count & ~TAIL_BIT,
                                        &row_pos))
      DBUG_RETURN(1);
  }
  DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
                      (ulong) ma_recordpos(block->page, row_pos.rownr),
                      (ulong) block->page, row_pos.rownr));

  block_is_read= block->org_bitmap_value != 0;

  /* Copy the tail data; only org_length bytes are real row data */
  memcpy(row_pos.data, row_part, org_length);

  if (share->now_transactional)
  {
    /* Log changes in tail block */
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
    LSN lsn;

    /*
      Log REDO changes of tail page
      Note that we have to log length, not org_length, to be sure that
      REDO, which doesn't use write_tail, also creates a block of at least
      MIN_TAIL_SIZE
     */
    page_store(log_data + FILEID_STORE_SIZE, block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos.rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos.data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
    if (translog_write_record(&lsn,
                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
                               LOGREC_REDO_NEW_ROW_TAIL),
                              info->trn, info,
                              (translog_size_t) (sizeof(log_data) + length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      DBUG_RETURN(1);
  }

  /* Update directory entry length and the page's free-space counter */
  int2store(row_pos.dir + 2, length);
  empty_space= row_pos.empty_space - length;
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
  block->page_count= row_pos.rownr + TAIL_BIT;
  /*
    If there are fewer directory entries free than the number of possible
    tails we can write for a row, we mark the page full to ensure that we
    don't during _ma_bitmap_find_place() allocate more entries on the tail
    page than it can hold
  */
  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
                                           1 + share->base.blobs) ?
                       empty_space : 0);
  /* Keep BLOCKUSED_USE_ORG_BITMAP */
  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;

  if (block_is_read)
  {
    /* Current page link is last element in pinned_pages */
    MARIA_PINNED_PAGE *page_link;
    page_link= dynamic_element(&info->pinned_pages,
                               info->pinned_pages.elements-1,
                               MARIA_PINNED_PAGE*);
    /* Downgrade the write lock to a read lock; page stays pinned */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    DBUG_ASSERT(page_link->changed);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    res= 0;
  }
  else
  {
    if (!(res= pagecache_write(share->pagecache,
                               &info->dfile, block->page, 0,
                               row_pos.buff,share->page_type,
                               PAGECACHE_LOCK_READ,
                               PAGECACHE_PIN,
                               PAGECACHE_WRITE_DELAY, &page_link.link,
                               LSN_IMPOSSIBLE)))
    {
      DBUG_ASSERT(page_link.link);
      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
      page_link.changed= 1;
      push_dynamic(&info->pinned_pages, (void*) &page_link);
    }

    /* Increase data file size, if extended */
    position= (my_off_t) block->page * block_size;
    if (share->state.state.data_file_length <= position)
    {
      /*
        We are modifying a state member before writing the UNDO; this is a WAL
        violation. But for data_file_length this is ok, as long as we change
        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
        collect_tables()).
      */
      _ma_set_share_data_file_length(share, position + block_size);
    }
  }
  DBUG_RETURN(res);
}
2093 
2094 
2095 /*
2096   Write full pages
2097 
2098   SYNOPSIS
2099     write_full_pages()
2100     info                Maria handler
2101     lsn			LSN for the undo record
2102     block               Where to write data
2103     data                Data to write
2104     length              Length of data
2105 
2106   NOTES
2107     Logging of the changes to the full pages are done in the caller
2108     write_block_record().
2109 
2110   RETURN
2111     0  ok
2112     1  error on write
2113 */
2114 
static my_bool write_full_pages(MARIA_HA *info,
                                LSN lsn,
                                MARIA_BITMAP_BLOCK *block,
                                uchar *data, ulong length)
{
  pgcache_page_no_t page;
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint data_size= FULL_PAGE_SIZE(share);
  uchar *buff= info->keyread_buff;
  uint page_count, sub_blocks;
  my_off_t position, max_position;
  DBUG_ENTER("write_full_pages");
  DBUG_PRINT("enter", ("length: %lu  page: %lu  page_count: %lu",
                       (ulong) length, (ulong) block->page,
                       (ulong) block->page_count));
  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);

  info->keyread_buff_used= 1;
  page=       block->page;
  page_count= block->page_count;
  sub_blocks= block->sub_blocks;

  /* Track highest file position written, to extend the file at the end */
  max_position= (my_off_t) (page + page_count) * block_size;

  /* Increase data file size, if extended */

  for (; length; data+= data_size)
  {
    uint copy_length;
    if (!page_count--)
    {
      /* Current extent exhausted; advance to the next sub block */
      if (!--sub_blocks)
      {
        /* More data than allocated extents: corrupted extent info */
        _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
        DBUG_RETURN(1);
      }

      block++;
      page= block->page;
      page_count= block->page_count - 1;
      DBUG_PRINT("info", ("page: %lu  page_count: %lu",
                          (ulong) block->page, (ulong) block->page_count));

      position= (page + page_count + 1) * block_size;
      set_if_bigger(max_position, position);
    }
    lsn_store(buff, lsn);
    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
    bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
          FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));
    copy_length= MY_MIN(data_size, length);
    memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length);
    length-= copy_length;

    /*
      Zero out old information from the block. This removes possible
      sensitive information from the block and also makes the file
      easier to compress and easier to compare after recovery.
    */
    if (copy_length != data_size)
      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
            (data_size - copy_length) + PAGE_SUFFIX_SIZE);

    if (pagecache_write(share->pagecache,
                        &info->dfile, page, 0,
                        buff, share->page_type,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        PAGECACHE_PIN_LEFT_UNPINNED,
                        PAGECACHE_WRITE_DELAY,
                        0, info->trn->rec_lsn))
      DBUG_RETURN(1);
    page++;
    DBUG_ASSERT(block->used & BLOCKUSED_USED);
  }
  /* Increase data file size, if we wrote past the current end of file */
  if (share->state.state.data_file_length < max_position)
    _ma_set_share_data_file_length(share, max_position);
  DBUG_RETURN(0);
}
2194 
2195 
2196 /*
2197   Store ranges of full pages in compact format for logging
2198 
2199   SYNOPSIS
2200     store_page_range()
2201     to		Store data here
2202     block       Where pages are to be written
2203     length	Length of data to be written
2204 		Normally this is full pages, except for the last
2205                 tail block that may only partly fit the last page.
2206     tot_ranges  Add here the number of ranges used
2207 
2208   NOTES
2209     The format of one entry is:
2210 
2211      Ranges				 SUB_RANGE_SIZE
2212      Empty bytes at end of last byte     BLOCK_FILLER_SIZE
2213      For each range
2214        Page number                       PAGE_STORE_SIZE
2215        Number of pages			 PAGERANGE_STORE_SIZE
2216 
2217   RETURN
2218     #  end position for 'to'
2219 */
2220 
store_page_range(MARIA_SHARE * share,uchar * to,MARIA_BITMAP_BLOCK * block,ulong length,uint * tot_ranges)2221 static uchar *store_page_range(MARIA_SHARE *share,
2222                                uchar *to, MARIA_BITMAP_BLOCK *block,
2223                                ulong length,
2224                                uint *tot_ranges)
2225 {
2226   uint data_size= FULL_PAGE_SIZE(share);
2227   ulong pages_left= (length + data_size -1) / data_size;
2228   uint page_count, ranges, empty_space;
2229   uchar *to_start;
2230   DBUG_ENTER("store_page_range");
2231 
2232   to_start= to;
2233   to+= SUB_RANGE_SIZE;
2234 
2235   /* Store number of unused bytes at last page */
2236   empty_space= (uint) (pages_left * data_size - length);
2237   int2store(to, empty_space);
2238   to+= BLOCK_FILLER_SIZE;
2239 
2240   ranges= 0;
2241   do
2242   {
2243     pgcache_page_no_t page;
2244     page=       block->page;
2245     page_count= block->page_count;
2246     block++;
2247     if (page_count > pages_left)
2248       page_count= pages_left;
2249 
2250     page_store(to, page);
2251     to+= PAGE_STORE_SIZE;
2252     pagerange_store(to, page_count);
2253     to+= PAGERANGE_STORE_SIZE;
2254     ranges++;
2255   } while ((pages_left-= page_count));
2256   /* Store number of ranges for this block */
2257   int2store(to_start, ranges);
2258   (*tot_ranges)+= ranges;
2259 
2260   DBUG_RETURN(to);
2261 }
2262 
2263 
2264 /*
2265   Store packed extent data
2266 
2267   SYNOPSIS
2268    store_extent_info()
2269    to				Store first packed data here
2270    row_extents_second_part	Store rest here
2271    first_block		        First block to store
2272    count			Number of blocks
2273 
2274   NOTES
2275     We don't have to store the position for the head block
2276 
2277     We have to set the START_EXTENT_BIT for every extent where the
2278     blob will be stored on a page of it's own. We need this in the
2279     UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
2280     undo-update.
2281 */
2282 
static void store_extent_info(uchar *to,
                              uchar *row_extents_second_part,
                              MARIA_BITMAP_BLOCK *first_block,
                              uint count)
{
  MARIA_BITMAP_BLOCK *block, *end_block;
  uint copy_length;
  my_bool first_found= 0;
  DBUG_ENTER("store_extent_info");
  DBUG_PRINT("enter", ("count: %u", count));

  for (block= first_block, end_block= first_block+count ;
       block < end_block; block++)
  {
    /* The following is only false for marker (unused) blocks */
    if (likely(block->used & BLOCKUSED_USED))
    {
      uint page_count= block->page_count;
      DBUG_ASSERT(page_count != 0);
      page_store(to, block->page);
      if (block->sub_blocks)
      {
        /*
          Set a bit so that we later know that this was the first block
          for a blob
        */
        page_count|= START_EXTENT_BIT;
      }
      pagerange_store(to + PAGE_STORE_SIZE, page_count);
      DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
      to+= ROW_EXTENT_SIZE;
      if (!first_found)
      {
        /* First extent is stored on its own; the rest go to second part */
        first_found= 1;
        to= row_extents_second_part;
      }
    }
  }
  copy_length= (count - 1) * ROW_EXTENT_SIZE;
  /*
    In some unlikely cases we have allocated too many blocks. Clear this
    data.
  */
  bzero(to, (size_t) (row_extents_second_part + copy_length - to));
  DBUG_VOID_RETURN;
}
2329 
2330 
2331 /**
2332    @brief
2333    Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
2334    for write_block_record
2335 
2336    @note
2337    In case of blobs, this function marks all the blob pages in the bitmap
2338    as full pages. The bitmap bits for other pages will be marked
2339    when write_block_record() calls _ma_bitmap_release_unused().
2340 
2341    This function will be removed in Maria 2.0 when we instead of delete rows
2342    mark them as deleted and only remove them after commit.
2343 
2344    @return
2345    @retval 0  ok
2346    @retval 1  Error (out of memory or disk error changing bitmap) or
2347               wrong information in extent information
2348 */
2349 
static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
                                       MARIA_BITMAP_BLOCKS *blocks,
                                       pgcache_page_no_t head_page,
                                       uint extent_count,
                                       const uchar *extent_info)
{
  MARIA_BITMAP_BLOCK *block, *start_block;
  MARIA_SHARE *share= info->s;
  uint i, tail_page;
  DBUG_ENTER("extent_to_bitmap_blocks");

  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
    DBUG_RETURN(1);
  block= blocks->block=  dynamic_element(&info->bitmap_blocks, 0,
                                        MARIA_BITMAP_BLOCK*);
  blocks->count= extent_count + 1;
  blocks->tail_page_skipped= blocks->page_skipped= 0;
  /* First block always describes the head page */
  block->page= head_page;
  block->page_count= 1;
  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
  /* Impossible value, will force storage of real value */
  block->org_bitmap_value= 255;

  start_block= block++;
  for (i=0 ;
       i++ < extent_count ;
       block++, extent_info+= ROW_EXTENT_SIZE)
  {
    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
    if (page_count & START_EXTENT_BIT)
    {
      /* This extent starts a blob; close the previous sub-block group */
      page_count&= ~START_EXTENT_BIT;
      start_block->sub_blocks= (uint) (block - start_block);
      start_block= block;
    }
    block->page= page_korr(extent_info);
    block->page_count= page_count;
    block->sub_blocks= 0;
    if (block->page_count == 0)
    {
      /* Extent was allocated but not used by write_block_record() */
      DBUG_ASSERT(block->page == 0);
      /* This is the last block */
      blocks->count= i;
      break;
    }
    if ((tail_page= page_count & TAIL_BIT))
      page_count= 1;

    /* Check if wrong data */
    if (block->page == 0 || page_count == 0 ||
        (block->page + page_count) * share->block_size >
         share->state.state.data_file_length)
    {
      DBUG_PRINT("error", ("page: %lu  page_count: %u  tail: %u  length: %ld  data_length: %ld",
                           (ulong) block->page,
                           (block->page_count & ~TAIL_BIT),
                           (uint) MY_TEST(block->page_count & TAIL_BIT),
                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
                                    share->block_size),
                           (ulong) share->state.state.data_file_length));
      DBUG_RETURN(1);
    }
    if (tail_page)
    {
      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
                                                        block->page);
      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
                    BLOCKUSED_USE_ORG_BITMAP);
    }
    else
    {
      my_bool res;
      /* Mark all pages of the extent as full in the bitmap */
      mysql_mutex_lock(&share->bitmap.bitmap_lock);
      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
                                         block->page, page_count);
      mysql_mutex_unlock(&share->bitmap.bitmap_lock);
      if (res)
        DBUG_RETURN(1);
      block->used= BLOCKUSED_USED;
    }
  }
  start_block->sub_blocks= (uint) (block - start_block);
  DBUG_RETURN(0);
}
2435 
2436 
2437 /*
2438   Free regions of pages with logging
2439 
2440   NOTES
2441     We are removing filler events and tail page events from
2442     row->extents to get smaller log.
2443 
2444   RETURN
2445     0   ok
2446     1   error
2447 */
2448 
free_full_pages(MARIA_HA * info,MARIA_ROW * row)2449 static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
2450 {
2451   uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
2452   LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
2453   LSN lsn;
2454   size_t extents_length;
2455   uchar *extents= row->extents;
2456   DBUG_ENTER("free_full_pages");
2457 
2458   if (info->s->now_transactional)
2459   {
2460     /* Compact events by removing filler and tail events */
2461     uchar *new_block= 0;
2462     uchar *end, *to, *compact_extent_info;
2463     my_bool res, buff_alloced;
2464     uint extents_count;
2465 
2466     alloc_on_stack(*info->stack_end_ptr, compact_extent_info, buff_alloced,
2467                    row->extents_count * ROW_EXTENT_SIZE);
2468     if (!compact_extent_info)
2469       DBUG_RETURN(1);
2470 
2471     to= compact_extent_info;
2472     for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
2473          extents < end ;
2474          extents+= ROW_EXTENT_SIZE)
2475     {
2476       uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
2477       page_count&= ~START_EXTENT_BIT;
2478       if (! (page_count & TAIL_BIT) && page_count != 0)
2479       {
2480         /* Found correct extent */
2481         if (!new_block)
2482           new_block= extents;                   /* First extent in range */
2483         continue;
2484       }
2485       /* Found extent to remove, copy everything found so far */
2486       if (new_block)
2487       {
2488         size_t length= (size_t) (extents - new_block);
2489         memcpy(to, new_block, length);
2490         to+= length;
2491         new_block= 0;
2492       }
2493     }
2494     if (new_block)
2495     {
2496       size_t length= (size_t) (extents - new_block);
2497       memcpy(to, new_block, length);
2498       to+= length;
2499     }
2500 
2501     if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
2502     {
2503       /*
2504         No ranges. This happens in the rear case when we have a allocated
2505         place for a blob on a tail page but it did fit into the main page.
2506       */
2507       stack_alloc_free(compact_extent_info, buff_alloced);
2508       DBUG_RETURN(0);
2509     }
2510     extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
2511     pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
2512     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
2513     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2514     log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    compact_extent_info;
2515     log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
2516     res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
2517                                info,
2518                                (translog_size_t) (sizeof(log_data) +
2519                                                   extents_length),
2520                                TRANSLOG_INTERNAL_PARTS + 2, log_array,
2521                                log_data, NULL);
2522     stack_alloc_free(compact_extent_info, buff_alloced);
2523     if (res)
2524       DBUG_RETURN(1);
2525   }
2526 
2527   DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
2528                                          row->extents_count));
2529 }
2530 
2531 
2532 /*
2533   Free one page range
2534 
2535   NOTES
2536     This is very similar to free_full_pages()
2537 
2538   RETURN
2539     0   ok
2540     1   error
2541 */
2542 
free_full_page_range(MARIA_HA * info,pgcache_page_no_t page,uint count)2543 static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
2544                                     uint count)
2545 {
2546   my_bool res= 0;
2547   uint delete_count;
2548   MARIA_SHARE *share= info->s;
2549   DBUG_ENTER("free_full_page_range");
2550 
2551   delete_count= count;
2552   if (share->state.state.data_file_length ==
2553       (page + count) * share->block_size)
2554   {
2555     /*
2556       Don't delete last page from pagecache as this will make the file
2557       shorter than expected if the last operation extended the file
2558     */
2559     delete_count--;
2560   }
2561   if (delete_count &&
2562       pagecache_delete_pages(share->pagecache, &info->dfile,
2563                              page, delete_count, PAGECACHE_LOCK_WRITE, 1))
2564     res= 1;
2565 
2566   if (share->now_transactional)
2567   {
2568     LSN lsn;
2569     /** @todo unify log_data's shape with delete_head_or_tail() */
2570     uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2571                    ROW_EXTENT_SIZE];
2572     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
2573     DBUG_ASSERT(info->trn->rec_lsn);
2574     pagerange_store(log_data + FILEID_STORE_SIZE, 1);
2575     page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
2576               page);
2577     int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2578               PAGE_STORE_SIZE, count);
2579     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
2580     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2581 
2582     if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
2583                               info->trn, info,
2584                               (translog_size_t) sizeof(log_data),
2585                               TRANSLOG_INTERNAL_PARTS + 1, log_array,
2586                               log_data, NULL))
2587       res= 1;
2588   }
2589   mysql_mutex_lock(&share->bitmap.bitmap_lock);
2590   if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
2591     res= 1;
2592   mysql_mutex_unlock(&share->bitmap.bitmap_lock);
2593   DBUG_RETURN(res);
2594 }
2595 
2596 
/**
   @brief Write a record to a (set of) pages

   @fn     write_block_record()
   @param  info            Maria handler
   @param  old_record      Original record in case of update; NULL in case of
                           insert
   @param  record          Record we should write
   @param  row             Statistics about record (calculated by
                           calc_record_size())
   @param  bitmap_blocks   On which pages the record should be stored
   @param  head_block_is_read  1 if head block existed. 0 if new block.
   @param  row_pos         Position on head page where to put head part of
                           record
   @param  undo_lsn	   <> LSN_ERROR if we are executing an UNDO
   @param  old_record_checksum Checksum of old_record: ignored if table does
                               not have live checksum; otherwise if
                               old_record==NULL it must be 0.

   @note
     On return all pinned pages are released.

     [page_buff + EMPTY_SPACE_OFFSET] is set to
     row_pos->empty_space - head_length

   @return Operation status
   @retval 0      OK
   @retval 1      Error
*/
2626 
write_block_record(MARIA_HA * info,const uchar * old_record,const uchar * record,MARIA_ROW * row,MARIA_BITMAP_BLOCKS * bitmap_blocks,my_bool head_block_is_read,struct st_row_pos_info * row_pos,LSN undo_lsn,ha_checksum old_record_checksum)2627 static my_bool write_block_record(MARIA_HA *info,
2628                                   const uchar *old_record,
2629                                   const uchar *record,
2630                                   MARIA_ROW *row,
2631                                   MARIA_BITMAP_BLOCKS *bitmap_blocks,
2632                                   my_bool head_block_is_read,
2633                                   struct st_row_pos_info *row_pos,
2634                                   LSN undo_lsn,
2635                                   ha_checksum old_record_checksum)
2636 {
2637   uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
2638   uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part);
2639   uchar *field_length_data;
2640   uchar *page_buff;
2641   MARIA_BITMAP_BLOCK *block, *head_block;
2642   MARIA_SHARE *share= info->s;
2643   MARIA_COLUMNDEF *column, *end_column;
2644   MARIA_PINNED_PAGE page_link;
2645   uint block_size, flag, head_length;
2646   ulong *blob_lengths;
2647   my_bool row_extents_in_use, blob_full_pages_exists;
2648   LSN lsn;
2649   my_off_t position;
2650   uint save_my_errno;
2651   myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
2652   DBUG_ENTER("write_block_record");
2653 
2654   head_block= bitmap_blocks->block;
2655   block_size= share->block_size;
2656 
2657   page_buff= row_pos->buff;
2658   /* Position on head page where we should store the head part */
2659   data= row_pos->data;
2660   end_of_data= data + row_pos->length;
2661 
2662   /* Write header */
2663   flag= info->row_flag;
2664   row_extents_in_use= 0;
2665   if (unlikely(row->total_length > row_pos->length))
2666   {
2667     /* Need extent */
2668     DBUG_ASSERT(bitmap_blocks->count > 1);
2669     if (bitmap_blocks->count <= 1)
2670       goto crashed;                             /* Wrong in bitmap */
2671     flag|= ROW_FLAG_EXTENTS;
2672     row_extents_in_use= 1;
2673   }
2674   /* For now we have only a minimum header */
2675   *data++= (uchar) flag;
2676   if (flag & ROW_FLAG_TRANSID)
2677   {
2678     transid_store(data, info->trn->trid);
2679     data+= TRANSID_SIZE;
2680   }
2681 
2682   if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
2683     *data++= (uchar) (share->base.null_bytes -
2684                       share->base.original_null_bytes);
2685   if (row_extents_in_use)
2686   {
2687     /* Store first extent in header */
2688     store_key_length_inc(data, bitmap_blocks->count - 1);
2689     row_extents_first_part= data;
2690     data+= ROW_EXTENT_SIZE;
2691   }
2692   if (share->base.max_field_lengths)
2693     store_key_length_inc(data, row->field_lengths_length);
2694   if (share->calc_checksum)
2695   {
2696     *(data++)= (uchar) (row->checksum); /* store least significant byte */
2697     DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
2698   }
2699   memcpy(data, record, share->base.null_bytes);
2700   data+= share->base.null_bytes;
2701   memcpy(data, row->empty_bits, share->base.pack_bytes);
2702   data+= share->base.pack_bytes;
2703 
2704   DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2705               (uint) (data - row_pos->data) == row->min_length);
2706 
2707   /*
2708     Allocate a buffer of rest of data (except blobs)
2709 
2710     To avoid double copying of data, we copy as many columns that fits into
2711     the page. The rest goes into info->packed_row.
2712 
2713     Using an extra buffer, instead of doing continuous writes to different
2714     pages, uses less code and we don't need to have to do a complex call
2715     for every data segment we want to store.
2716   */
2717   if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
2718                        row->head_length, myflag))
2719     DBUG_RETURN(1);
2720 
2721   tmp_data_used= 0;                 /* Either 0 or last used uchar in 'data' */
2722   tmp_data= data;
2723 
2724   if (row_extents_in_use)
2725   {
2726     uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
2727     if (!tmp_data_used && tmp_data + copy_length > end_of_data)
2728     {
2729       tmp_data_used= tmp_data;
2730       tmp_data= info->rec_buff;
2731     }
2732     row_extents_second_part= tmp_data;
2733     /*
2734        We will copy the extents here when we have figured out the tail
2735        positions.
2736     */
2737     tmp_data+= copy_length;
2738   }
2739 
2740   /* Copy fields that has fixed lengths (primary key etc) */
2741   for (column= share->columndef,
2742          end_column= column + share->base.fixed_not_null_fields;
2743        column < end_column; column++)
2744   {
2745     if (!tmp_data_used && tmp_data + column->length > end_of_data)
2746     {
2747       tmp_data_used= tmp_data;
2748       tmp_data= info->rec_buff;
2749     }
2750     memcpy(tmp_data, record + column->offset, column->length);
2751     tmp_data+= column->length;
2752   }
2753 
2754   /* Copy length of data for variable length fields */
2755   if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
2756   {
2757     tmp_data_used= tmp_data;
2758     tmp_data= info->rec_buff;
2759   }
2760   field_length_data= row->field_lengths;
2761   memcpy(tmp_data, field_length_data, row->field_lengths_length);
2762   tmp_data+= row->field_lengths_length;
2763 
2764   DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2765               (uint) (tmp_data - row_pos->data) == row->min_length +
2766               share->base.fixed_not_null_fields_length +
2767               row->field_lengths_length);
2768 
2769   /* Copy variable length fields and fields with null/zero */
2770   for (end_column= share->columndef + share->base.fields - share->base.blobs;
2771        column < end_column ;
2772        column++)
2773   {
2774     const uchar *field_pos;
2775     ulong length;
2776     if ((record[column->null_pos] & column->null_bit) ||
2777         (row->empty_bits[column->empty_pos] & column->empty_bit))
2778       continue;
2779 
2780     field_pos= record + column->offset;
2781     switch (column->type) {
2782     case FIELD_NORMAL:                          /* Fixed length field */
2783     case FIELD_SKIP_PRESPACE:
2784     case FIELD_SKIP_ZERO:                       /* Fixed length field */
2785       length= column->length;
2786       break;
2787     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
2788       /* Char that is space filled */
2789       if (column->length <= 255)
2790         length= (uint) (uchar) *field_length_data++;
2791       else
2792       {
2793         length= uint2korr(field_length_data);
2794         field_length_data+= 2;
2795       }
2796       break;
2797     case FIELD_VARCHAR:
2798       if (column->length <= 256)
2799       {
2800         length= (uint) (uchar) *field_length_data++;
2801         field_pos++;                            /* Skip length uchar */
2802       }
2803       else
2804       {
2805         length= uint2korr(field_length_data);
2806         field_length_data+= 2;
2807         field_pos+= 2;
2808       }
2809       DBUG_ASSERT(length <= column->length);
2810       break;
2811     default:                                    /* Wrong data */
2812       DBUG_ASSERT(!maria_assert_if_crashed_table);
2813       length=0;
2814       break;
2815     }
2816     if (!tmp_data_used && tmp_data + length > end_of_data)
2817     {
2818       /* Data didn't fit in page; Change to use tmp buffer */
2819       tmp_data_used= tmp_data;
2820       tmp_data= info->rec_buff;
2821     }
2822     memcpy((char*) tmp_data, field_pos, length);
2823     tmp_data+= length;
2824   }
2825 
2826   block= head_block + head_block->sub_blocks;   /* Point to first blob data */
2827 
2828   end_column= column + share->base.blobs;
2829   blob_lengths= row->blob_lengths;
2830   if (!tmp_data_used)
2831   {
2832     /* Still room on page; Copy as many blobs we can into this page */
2833     data= tmp_data;
2834     for (; column < end_column &&
2835            *blob_lengths <= (ulong)(end_of_data - data);
2836          column++, blob_lengths++)
2837     {
2838       uchar *tmp_pos;
2839       uint length;
2840       if (!*blob_lengths)                       /* Null or "" */
2841         continue;
2842       length= column->length - portable_sizeof_char_ptr;
2843       memcpy(&tmp_pos, record + column->offset + length, sizeof(char*));
2844       memcpy(data, tmp_pos, *blob_lengths);
2845       data+= *blob_lengths;
2846       /*
2847         The following is not true when we want to insert data into original
2848         place. In this case we don't have any extra blocks allocated
2849       */
2850       if (likely(undo_lsn == LSN_ERROR))
2851       {
2852         /* Skip over tail page that was prepared for storing blob */
2853         block++;
2854         bitmap_blocks->tail_page_skipped= 1;
2855       }
2856     }
2857     if (head_block->sub_blocks > 1)
2858     {
2859       /* We have allocated pages that where not used */
2860       bitmap_blocks->page_skipped= 1;
2861     }
2862   }
2863   else
2864     data= tmp_data_used;                        /* Get last used on page */
2865 
2866   /* Update page directory */
2867   head_length= (uint) (data - row_pos->data);
2868   DBUG_PRINT("info", ("Used head length on page: %u  header_length: %u",
2869                       head_length,
2870                       (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
2871   if (head_length < share->base.min_block_length)
2872   {
2873     /* Extend row to be of size min_block_length */
2874     uint diff_length= share->base.min_block_length - head_length;
2875     bzero(data, diff_length);
2876     data+= diff_length;
2877     head_length= share->base.min_block_length;
2878   }
2879   DBUG_ASSERT(data <= end_of_data);
2880   /*
2881     If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
2882     written exactly head_length bytes (same as original record).
2883   */
2884   DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
2885   int2store(row_pos->dir + 2, head_length);
2886   /* update empty space at start of block */
2887   row_pos->empty_space-= head_length;
2888   int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
2889   /* Mark in bitmaps how the current page was actually used */
2890   head_block->empty_space= row_pos->empty_space;
2891   if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
2892       page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
2893     head_block->empty_space= 0;               /* Page is full */
2894   head_block->used|= BLOCKUSED_USED;
2895 
2896   check_directory(share,
2897                   page_buff, share->block_size, share->base.min_block_length,
2898                   (uint) -1);
2899 
2900   /*
2901      Now we have to write tail pages, as we need to store the position
2902      to them in the row extent header.
2903 
2904      We first write out all blob tails, to be able to store them in
2905      the current page or 'tmp_data'.
2906 
2907      Then we write the tail of the non-blob fields (The position to the
2908      tail page is stored either in row header, the extents in the head
2909      page or in the first full page of the non-blob data. It's never in
2910      the tail page of the non-blob data)
2911   */
2912 
2913   blob_full_pages_exists= 0;
2914   if (row_extents_in_use)
2915   {
2916     if (column != end_column)                   /* If blob fields */
2917     {
2918       MARIA_COLUMNDEF    *save_column=       column;
2919       MARIA_BITMAP_BLOCK *save_block=        block;
2920       MARIA_BITMAP_BLOCK *end_block;
2921       ulong              *save_blob_lengths= blob_lengths;
2922 
2923       for (; column < end_column; column++, blob_lengths++)
2924       {
2925         uchar *blob_pos;
2926         if (!*blob_lengths)                     /* Null or "" */
2927           continue;
2928         if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
2929         {
2930           uint length;
2931           length= column->length - portable_sizeof_char_ptr;
2932           memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
2933           length= *blob_lengths % FULL_PAGE_SIZE(share);   /* tail size */
2934           if (length != *blob_lengths)
2935             blob_full_pages_exists= 1;
2936           if (write_tail(info, block + block->sub_blocks-1,
2937                          blob_pos + *blob_lengths - length,
2938                          length))
2939             goto disk_err;
2940         }
2941         else
2942           blob_full_pages_exists= 1;
2943 
2944         for (end_block= block + block->sub_blocks; block < end_block; block++)
2945         {
2946           /*
2947             Set only a bit, to not cause bitmap code to believe a block is full
2948             when there is still a lot of entries in it.
2949           */
2950           block->used|= BLOCKUSED_USED;
2951         }
2952       }
2953       DBUG_ASSERT((undo_lsn == LSN_ERROR ||
2954                    block == bitmap_blocks->block + bitmap_blocks->count));
2955       column= save_column;
2956       block= save_block;
2957       blob_lengths= save_blob_lengths;
2958     }
2959 
2960     if (tmp_data_used)                          /* non blob data overflows */
2961     {
2962       MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
2963       MARIA_BITMAP_BLOCK *head_tail_block= 0;
2964       ulong length;
2965       ulong data_length= (ulong) (tmp_data - info->rec_buff);
2966 
2967 #ifdef SANITY_CHECKS
2968       DBUG_ASSERT(head_block->sub_blocks != 1);
2969       if (head_block->sub_blocks == 1)
2970         goto crashed;                           /* no reserved full or tails */
2971 #endif
2972       /*
2973         Find out where to write tail for non-blob fields.
2974 
2975         Problem here is that the bitmap code may have allocated more
2976         space than we need. We have to handle the following cases:
2977 
2978         - Bitmap code allocated a tail page we don't need.
2979         - The last full page allocated needs to be changed to a tail page
2980         (Because we where able to put more data on the head page than
2981         the bitmap allocation assumed)
2982 
2983         The reserved pages in bitmap_blocks for the main page has one of
2984         the following allocations:
2985         - Full pages, with following blocks:
2986           # * full pages
2987           empty page  ; To be used if we change last full to tail page. This
2988           has 'count' = 0.
2989           tail page  (optional, if last full page was part full)
2990         - One tail page
2991       */
2992 
2993       cur_block= head_block + 1;
2994       end_block= head_block + head_block->sub_blocks;
2995       /*
2996         Loop until we have find a block bigger than we need or
2997         we find the empty page block.
2998       */
2999       while (data_length >= (length= (cur_block->page_count *
3000                                       FULL_PAGE_SIZE(share))) &&
3001              cur_block->page_count)
3002       {
3003 #ifdef SANITY_CHECKS
3004         DBUG_ASSERT(!((cur_block == end_block) ||
3005                       (cur_block->used & BLOCKUSED_USED)));
3006         if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
3007           goto crashed;
3008 #endif
3009         data_length-= length;
3010         (cur_block++)->used|= BLOCKUSED_USED;
3011       }
3012       last_head_block= cur_block;
3013       if (data_length)
3014       {
3015         if (cur_block->page_count == 0)
3016         {
3017           /* Skip empty filler block */
3018           cur_block++;
3019         }
3020 #ifdef SANITY_CHECKS
3021         DBUG_ASSERT(!(cur_block >= end_block));
3022         if ((cur_block >= end_block))
3023           goto crashed;
3024 #endif
3025         if (cur_block->used & BLOCKUSED_TAIL)
3026         {
3027           DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
3028           /* tail written to tail page */
3029           cur_block->used|= BLOCKUSED_USED;
3030           head_tail_block= cur_block;
3031         }
3032         else if (data_length > length - MAX_TAIL_SIZE(block_size))
3033         {
3034           /* tail written to full page */
3035           cur_block->used|= BLOCKUSED_USED;
3036           if ((cur_block != end_block - 1) &&
3037               (end_block[-1].used & BLOCKUSED_TAIL))
3038             bitmap_blocks->tail_page_skipped= 1;
3039         }
3040         else
3041         {
3042           /*
3043             cur_block is a full block, followed by an empty and optional
3044             tail block. Change cur_block to a tail block or split it
3045             into full blocks and tail blocks.
3046 
3047             TODO:
3048              If there is enough space on the following tail block, use
3049              this instead of creating a new tail block.
3050           */
3051           DBUG_ASSERT(cur_block[1].page_count == 0);
3052           if (cur_block->page_count == 1)
3053           {
3054             /* convert full block to tail block */
3055             cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
3056             head_tail_block= cur_block;
3057           }
3058           else
3059           {
3060             DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share));
3061             DBUG_PRINT("info", ("Splitting blocks into full and tail"));
3062             cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
3063             cur_block[1].page_count= 1;         /* Avoid DBUG_ASSERT */
3064             cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
3065             cur_block->page_count--;
3066             cur_block->used|= BLOCKUSED_USED;
3067             last_head_block= head_tail_block= cur_block+1;
3068           }
3069           if (end_block[-1].used & BLOCKUSED_TAIL)
3070             bitmap_blocks->tail_page_skipped= 1;
3071         }
3072       }
3073       else
3074       {
3075         /* Must be an empty or tail page */
3076         DBUG_ASSERT(cur_block->page_count == 0 ||
3077                     cur_block->used & BLOCKUSED_TAIL);
3078         if (end_block[-1].used & BLOCKUSED_TAIL)
3079           bitmap_blocks->tail_page_skipped= 1;
3080       }
3081 
3082       /*
3083         Write all extents into page or tmp_data
3084 
3085         Note that we still don't have a correct position for the tail
3086         of the non-blob fields.
3087       */
3088       store_extent_info(row_extents_first_part,
3089                         row_extents_second_part,
3090                         head_block+1, bitmap_blocks->count - 1);
3091       if (head_tail_block)
3092       {
3093         ulong block_length= (ulong) (tmp_data - info->rec_buff);
3094         uchar *extent_data;
3095 
3096         length= (uint) (block_length % FULL_PAGE_SIZE(share));
3097         if (write_tail(info, head_tail_block,
3098                        info->rec_buff + block_length - length,
3099                        length))
3100           goto disk_err;
3101         tmp_data-= length;                      /* Remove the tail */
3102         if (tmp_data == info->rec_buff)
3103         {
3104           /* We have no full blocks to write for the head part */
3105           tmp_data_used= 0;
3106         }
3107 
3108         /* Store the tail position for the non-blob fields */
3109         if (head_tail_block == head_block + 1)
3110         {
3111           /*
3112             We had a head block + tail block, which means that the
3113             tail block is the first extent
3114           */
3115           extent_data= row_extents_first_part;
3116         }
3117         else
3118         {
3119           /*
3120             We have a head block + some full blocks + tail block
3121             last_head_block is pointing after the last used extent
3122             for the head block.
3123           */
3124           extent_data= row_extents_second_part +
3125             ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
3126         }
3127         /* Write information for tail block in the reserved space */
3128         page_store(extent_data, head_tail_block->page);
3129         pagerange_store(extent_data + PAGE_STORE_SIZE,
3130                         head_tail_block->page_count);
3131       }
3132     }
3133     else
3134       store_extent_info(row_extents_first_part,
3135                         row_extents_second_part,
3136                         head_block+1, bitmap_blocks->count - 1);
3137   }
3138 
3139   if (share->now_transactional)
3140   {
3141     uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
3142     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
3143 
3144     /* Log REDO changes of head page */
3145     page_store(log_data + FILEID_STORE_SIZE, head_block->page);
3146     dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
3147                  row_pos->rownr);
3148     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3149     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
3150     log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos->data;
3151     log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
3152     if (translog_write_record(&lsn,
3153                               head_block_is_read ?
3154                               LOGREC_REDO_INSERT_ROW_HEAD :
3155                               LOGREC_REDO_NEW_ROW_HEAD,
3156                               info->trn,
3157                               info,
3158                               (translog_size_t) (sizeof(log_data) +
3159                                                  head_length),
3160                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
3161                               log_data, NULL))
3162       goto disk_err;
3163   }
3164 
3165 #ifdef RECOVERY_EXTRA_DEBUG
3166   if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
3167   {
3168     /* Stop right after the REDO; testing incomplete log record groups */
3169     DBUG_EXECUTE_IF("maria_flush_whole_log",
3170                     {
3171                       DBUG_PRINT("maria_flush_whole_log", ("now"));
3172                       translog_flush(translog_get_horizon());
3173                     });
3174     DBUG_EXECUTE_IF("maria_crash",
3175                     { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
3176   }
3177 #endif
3178 
3179   if (head_block_is_read)
3180   {
3181     MARIA_PINNED_PAGE *page_link;
3182     /* Head page is always the first pinned page */
3183     page_link= dynamic_element(&info->pinned_pages, 0,
3184                                MARIA_PINNED_PAGE*);
3185     pagecache_unlock_by_link(share->pagecache, page_link->link,
3186                              PAGECACHE_LOCK_WRITE_TO_READ,
3187                              PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
3188                              LSN_IMPOSSIBLE, 1, FALSE);
3189     page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
3190     page_link->changed= 1;
3191   }
3192   else
3193   {
3194     if (pagecache_write(share->pagecache,
3195                         &info->dfile, head_block->page, 0,
3196                         page_buff, share->page_type,
3197                         head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
3198                         PAGECACHE_LOCK_READ,
3199                         head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
3200                         PAGECACHE_PIN,
3201                         PAGECACHE_WRITE_DELAY, &page_link.link,
3202                         LSN_IMPOSSIBLE))
3203       goto disk_err;
3204     DBUG_ASSERT(page_link.link);
3205     page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
3206     page_link.changed= 1;
3207     push_dynamic(&info->pinned_pages, (void*) &page_link);
3208 
3209     /* Increase data file size, if extended */
3210     position= (my_off_t) head_block->page * block_size;
3211     if (share->state.state.data_file_length <= position)
3212       _ma_set_share_data_file_length(share, position + block_size);
3213   }
3214 
3215   if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
3216   {
3217     /*
3218       Log REDO writes for all full pages (head part and all blobs)
3219       We write all here to be able to generate the UNDO record early
3220       so that we can write the LSN for the UNDO record to all full pages.
3221     */
3222     uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3223                        (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
3224                        ROW_EXTENTS_ON_STACK];
3225     uchar *log_data, *log_pos;
3226     LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
3227                                ROW_EXTENTS_ON_STACK];
3228     LEX_CUSTRING *log_array_pos, *log_array;
3229     int error;
3230     translog_size_t log_entry_length= 0;
3231     uint ext_length, extents= 0, sub_extents= 0;
3232 
3233     /* If few extents, then allocate things on stack to avoid a malloc call */
3234     if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
3235     {
3236       log_array= tmp_log_array;
3237       log_data= tmp_log_data;
3238     }
3239     else
3240     {
3241       if (!my_multi_malloc(PSI_INSTRUMENT_ME, MYF(MY_WME), &log_array,
3242                           (uint) ((bitmap_blocks->count +
3243                                    TRANSLOG_INTERNAL_PARTS + 2) *
3244                                   sizeof(*log_array)),
3245                           &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3246                           bitmap_blocks->count * (ROW_EXTENT_SIZE +
3247                                                   BLOCK_FILLER_SIZE +
3248                                                   SUB_RANGE_SIZE),
3249                           NullS))
3250         goto disk_err;
3251     }
3252     log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
3253     log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
3254 
3255     if (tmp_data_used)
3256     {
3257       /* Full head page */
3258       translog_size_t block_length= (translog_size_t) (tmp_data -
3259                                                        info->rec_buff);
3260       log_pos= store_page_range(share,
3261                                 log_pos, head_block+1,
3262                                 (ulong) block_length, &extents);
3263       log_array_pos->str= info->rec_buff;
3264       log_array_pos->length= block_length;
3265       log_entry_length+= block_length;
3266       log_array_pos++;
3267       sub_extents++;
3268     }
3269     if (blob_full_pages_exists)
3270     {
3271       MARIA_COLUMNDEF *tmp_column= column;
3272       ulong *tmp_blob_lengths= blob_lengths;
3273       MARIA_BITMAP_BLOCK *tmp_block= block;
3274 
3275       /* Full blob pages */
3276       for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
3277       {
3278         ulong blob_length;
3279         uint length;
3280 
3281         if (!*tmp_blob_lengths)                 /* Null or "" */
3282           continue;
3283         blob_length= *tmp_blob_lengths;
3284         length= tmp_column->length - portable_sizeof_char_ptr;
        /*
          If the last part of the blob was stored on a tail page, reduce
          blob_length so that it only covers the full pages logged here.
        */
3289         if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3290           blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3291         if (blob_length)
3292         {
3293           memcpy((void*) &log_array_pos->str,
3294                  record + tmp_column->offset + length,
3295                  sizeof(uchar*));
3296           log_array_pos->length= blob_length;
3297           log_entry_length+= blob_length;
3298           log_array_pos++;
3299           sub_extents++;
3300 
3301           log_pos= store_page_range(share,
3302                                     log_pos, tmp_block,
3303                                     blob_length, &extents);
3304         }
3305         tmp_block+= tmp_block->sub_blocks;
3306       }
3307     }
3308 
3309     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3310     ext_length=  (uint) (log_pos - log_data);
3311     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
3312     pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
3313     pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
3314                     sub_extents);
3315 
3316     log_entry_length+= ext_length;
3317     /* trn->rec_lsn is already set earlier in this function */
3318     error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
3319                                  info->trn, info, log_entry_length,
3320                                  (uint) (log_array_pos - log_array),
3321                                  log_array, log_data, NULL);
3322     if (log_array != tmp_log_array)
3323       my_free(log_array);
3324     if (error)
3325       goto disk_err;
3326   }
3327 
3328   /* Write UNDO or CLR record */
3329   lsn= LSN_IMPOSSIBLE;
3330   if (share->now_transactional)
3331   {
3332     LEX_CUSTRING *log_array= info->log_row_parts;
3333 
3334     if (undo_lsn != LSN_ERROR)
3335     {
3336       /*
3337         Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
3338         in the first case, Recovery, when it sees the CLR_END in the
3339         REDO phase, may decrement the records' count.
3340       */
3341       if (_ma_write_clr(info, undo_lsn,
3342                         old_record ? LOGREC_UNDO_ROW_UPDATE :
3343                         LOGREC_UNDO_ROW_DELETE,
3344                         share->calc_checksum != 0,
3345                         row->checksum - old_record_checksum,
3346                         &lsn, (void*) 0))
3347         goto disk_err;
3348     }
3349     else
3350     {
3351       uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
3352                      PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
3353                      HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
3354                      ROW_EXTENT_SIZE];
3355       uchar *log_pos;
3356       ha_checksum checksum_delta;
3357 
3358       /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
3359       lsn_store(log_data, info->trn->undo_lsn);
3360       page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
3361                  head_block->page);
3362       dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3363                    PAGE_STORE_SIZE,
3364                    row_pos->rownr);
3365       log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3366                 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
3367       store_checksum_in_rec(share, checksum_delta,
3368                             row->checksum - old_record_checksum,
3369                             log_pos, log_pos);
3370       compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
3371 
3372       log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3373       log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
3374                                                              log_data);
3375 
3376       if (!old_record)
3377       {
3378         /* Store undo_lsn in case we are aborting the insert */
3379         row->orig_undo_lsn= info->trn->undo_lsn;
3380         /* Write UNDO log record for the INSERT */
3381         if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
3382                                   info->trn, info,
3383                                   (translog_size_t)
3384                                   log_array[TRANSLOG_INTERNAL_PARTS +
3385                                             0].length,
3386                                   TRANSLOG_INTERNAL_PARTS + 1,
3387                                   log_array,
3388                                   log_data + LSN_STORE_SIZE, &checksum_delta))
3389           goto disk_err;
3390       }
3391       else
3392       {
3393         /* Write UNDO log record for the UPDATE */
3394         size_t row_length, extents_length;
3395         uint row_parts_count, cur_head_length;
3396 
3397         /*
3398           Write head length and extents of the original row so that we
3399           during UNDO can put it back in the original position.
3400           We don't store size for TRANSID, as we don't write this during
3401           UNDO.
3402         */
3403         cur_head_length= (info->cur_row.head_length -
3404                           info->cur_row.header_length);
3405         int2store(log_pos, cur_head_length);
3406         pagerange_store(log_pos + 2, info->cur_row.extents_count);
3407         log_pos+= 2 + PAGERANGE_STORE_SIZE;
3408         log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
3409                                                          PAGERANGE_STORE_SIZE);
3410         info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
3411           info->cur_row.extents;
3412         info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
3413           extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
3414 
3415         row_length= fill_update_undo_parts(info, old_record, record,
3416                                            log_array +
3417                                            TRANSLOG_INTERNAL_PARTS + 2,
3418                                            &row_parts_count);
3419         if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
3420                                   info,
3421                                   (translog_size_t)
3422                                   (log_array[TRANSLOG_INTERNAL_PARTS +
3423                                              0].length + extents_length +
3424                                    row_length),
3425                                   TRANSLOG_INTERNAL_PARTS + 2 +
3426                                   row_parts_count,
3427                                   log_array,
3428                                   log_data + LSN_STORE_SIZE,
3429                                   &checksum_delta))
3430           goto disk_err;
3431       }
3432     }
3433   }
3434   /* Release not used space in used pages */
3435   if (_ma_bitmap_release_unused(info, bitmap_blocks))
3436     goto disk_err;
3437   _ma_unpin_all_pages(info, lsn);
3438 
3439   if (tmp_data_used)
3440   {
3441     /*
3442       Write data stored in info->rec_buff to pages
3443       This is the char/varchar data that didn't fit into the head page.
3444     */
3445     DBUG_ASSERT(bitmap_blocks->count != 0);
3446     if (write_full_pages(info, lsn, head_block + 1,
3447                          info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
3448       goto disk_err;
3449   }
3450 
3451   /* Write rest of blobs (data, but no tails as they are already written) */
3452   for (; column < end_column; column++, blob_lengths++)
3453   {
3454     uchar *blob_pos;
3455     uint length;
3456     ulong blob_length;
3457     if (!*blob_lengths)                         /* Null or "" */
3458       continue;
3459     length= column->length - portable_sizeof_char_ptr;
3460     memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
3461     /* remove tail part */
3462     blob_length= *blob_lengths;
3463     if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3464       blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3465 
3466     if (blob_length && write_full_pages(info, lsn, block,
3467                                          blob_pos, blob_length))
3468       goto disk_err;
3469     block+= block->sub_blocks;
3470   }
3471 
3472   _ma_finalize_row(info);
3473   DBUG_RETURN(0);
3474 
3475 crashed:
3476   DBUG_ASSERT(!maria_assert_if_crashed_table);
3477   /* Something was wrong with data on page */
3478   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3479 
3480 disk_err:
3481   /**
3482      @todo RECOVERY we are going to let dirty pages go to disk while we have
3483      logged UNDO, this violates WAL. We must mark the table corrupted!
3484 
3485      @todo RECOVERY we have written some REDOs without a closing UNDO,
3486      it's possible that a next operation by this transaction succeeds and then
3487      Recovery would glue the "orphan REDOs" to the succeeded operation and
3488      execute the failed REDOs. We need some mark "abort this group" in the
3489      log, or mark the table corrupted (then user will repair it and thus REDOs
3490      will be skipped).
3491 
     @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
     should take a MARIA_HA* in argument, and if it
     fails when flushing a page to disk it should call
     (*the_maria_ha->write_error_func)(the_maria_ha)
3496      and this hook will mark the table corrupted.
3497      Maybe hook should be stored in the pagecache's block structure, or in a
3498      hash "file->maria_ha*".
3499 
3500      @todo RECOVERY we should distinguish below between log write error and
3501      table write error. The former should stop Maria immediately, the latter
3502      should mark the table corrupted.
3503   */
3504   /*
3505     Unpin all pinned pages to not cause problems for disk cache. This is
3506     safe to call even if we already called _ma_unpin_all_pages() above.
3507   */
3508   save_my_errno= my_errno;
3509   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3510   my_errno= save_my_errno;
3511   DBUG_RETURN(1);
3512 }
3513 
3514 
3515 /*
  @brief Allocate space in the data file and write a record to it
3517 
3518   @fn    allocate_and_write_block_record()
3519   @param info                Maria handler
3520   @param record              Record to write
3521   @param row		     Information about fields in 'record'
3522   @param undo_lsn	     <> LSN_ERROR if we are executing an UNDO
3523 
3524   @return
3525   @retval 0	ok
3526   @retval 1	Error
3527 */
3528 
static my_bool allocate_and_write_block_record(MARIA_HA *info,
                                               const uchar *record,
                                               MARIA_ROW *row,
                                               LSN undo_lsn)
{
  struct st_row_pos_info row_pos;
  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
  int save_my_errno;
  DBUG_ENTER("allocate_and_write_block_record");

  /*
    Mark the bitmap as over-allocated / non-flushable while we hold
    reserved-but-unwritten space; undone on the error path below.
  */
  _ma_bitmap_flushable(info, 1);
  if (_ma_bitmap_find_place(info, row, blocks))
    goto err;                         /* Error reading bitmap */

  /*
    Sleep; a checkpoint will happen and should not send this over-allocated
    bitmap to disk but rather wait.
  */
  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););

  /* page will be pinned & locked by get_head_or_tail_page */
  if (get_head_or_tail_page(info, blocks->block, info->buff,
                            MY_MAX(row->space_on_head_page,
                                info->s->base.min_block_length),
                            HEAD_PAGE,
                            PAGECACHE_LOCK_WRITE, &row_pos))
    goto err;
  /* The rowid is the head page number combined with the directory entry */
  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
  if (info->s->calc_checksum)
  {
    if (undo_lsn == LSN_ERROR)
      row->checksum= (info->s->calc_checksum)(info, record);
    else
    {
      /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
      DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
    }
  }
  DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
                      (ulong) ma_recordpos_to_page(row->lastpos),
                      ma_recordpos_to_dir_entry(row->lastpos),
                      row_pos.length));
  if (write_block_record(info, (uchar*) 0, record, row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
    goto err;
  /* Now let checkpoint happen but don't commit */
  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
  DBUG_RETURN(0);

err:
  /* Preserve the original errno over any error set by the cleanup calls */
  save_my_errno= my_errno;
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_errno= save_my_errno;
  DBUG_RETURN(1);
}
3587 
3588 
3589 /*
3590   Write a record and return rowid for it
3591 
3592   SYNOPSIS
3593     _ma_write_init_block_record()
3594     info                Maria handler
3595     record              Record to write
3596 
3597   NOTES
3598     This is done BEFORE we write the keys to the row!
3599 
3600   RETURN
3601     HA_OFFSET_ERROR     Something went wrong
3602     #                   Rowid for row
3603 */
3604 
_ma_write_init_block_record(MARIA_HA * info,const uchar * record)3605 MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
3606                                              const uchar *record)
3607 {
3608   DBUG_ENTER("_ma_write_init_block_record");
3609 
3610   calc_record_size(info, record, &info->cur_row);
3611   if (allocate_and_write_block_record(info, record,
3612                                       &info->cur_row, LSN_ERROR))
3613     DBUG_RETURN(HA_OFFSET_ERROR);
3614   DBUG_RETURN(info->cur_row.lastpos);
3615 }
3616 
3617 
3618 /*
3619   Dummy function for (*info->s->write_record)()
3620 
3621   Nothing to do here, as we already wrote the record in
3622   _ma_write_init_block_record()
3623 */
3624 
_ma_write_block_record(MARIA_HA * info,const uchar * record)3625 my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
3626                                const uchar *record __attribute__ ((unused)))
3627 {
3628   return 0;                                     /* Row already written */
3629 }
3630 
3631 
3632 /**
3633    @brief Remove row written by _ma_write_block_record() and log undo
3634 
3635    @param  info            Maria handler
3636 
3637    @note
3638      This is called in case we got a duplicate unique key while
3639      writing keys.
3640 
3641    @return Operation status
3642      @retval 0      OK
3643      @retval 1      Error
3644 */
3645 
my_bool _ma_write_abort_block_record(MARIA_HA *info)
{
  my_bool res= 0;
  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
  MARIA_BITMAP_BLOCK *block, *end;
  LSN lsn= LSN_IMPOSSIBLE;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_write_abort_block_record");

  _ma_bitmap_lock(share);  /* Lock bitmap from other insert threads */
  /* Delete the head part of the row (directory entry on the head page) */
  if (delete_head_or_tail(info,
                          ma_recordpos_to_page(info->cur_row.lastpos),
                          ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
                          0))
    res= 1;
  /*
    Free the remaining blocks of the row.  The first block is the head
    page, handled above, so start at blocks->block + 1.  We continue on
    errors to free as much as possible; 'res' records any failure.
  */
  for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
       block++)
  {
    if (block->used & BLOCKUSED_USED)
    {
      if (block->used & BLOCKUSED_TAIL)
      {
        /*
          block->page_count is set to the tail directory entry number in
          write_block_record()
        */
        if (delete_head_or_tail(info, block->page,
                                block->page_count & ~TAIL_BIT,
                                0, 0))
          res= 1;
      }
      else
      {
        /* A range of full pages; release them in the bitmap */
        if (free_full_page_range(info, block->page, block->page_count))
          res= 1;
      }
    }
  }
  _ma_bitmap_unlock(share);
  if (share->now_transactional)
  {
    /*
      Write clr to mark end of aborted row insert.
      The above delete_head_or_tail() calls will only log redo, not undo.
      The undo just before the row insert is stored in row->orig_undo_lsn.

      When applying undo's, we can skip all undo records between current
      lsn and row->orig_undo_lsn as logically things are as before the
      attempted insert.
    */
    if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
                      LOGREC_UNDO_ROW_INSERT,
                      share->calc_checksum != 0,
                      (ha_checksum) 0 - info->cur_row.checksum,
                      &lsn, (void*) 0))
      res= 1;
  }
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);
}
3706 
3707 
3708 /*
3709   Update a record
3710 
3711   NOTES
3712     For the moment, we assume that info->curr_row.extents is always updated
3713     when a row is read. In the future we may decide to read this on demand
3714     for rows split into many extents.
3715 */
3716 
static my_bool _ma_update_block_record2(MARIA_HA *info,
                                        MARIA_RECORD_POS record_pos,
                                        const uchar *oldrec,
                                        const uchar *record,
                                        LSN undo_lsn)
{
  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
  uchar *buff;
  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
  MARIA_PINNED_PAGE page_link;
  uint rownr, org_empty_size, head_length;
  uint block_size= info->s->block_size;
  uint errpos __attribute__((unused)) = 0;   /* For DBUG error tracing only */
  uchar *dir;
  pgcache_page_no_t page;
  struct st_row_pos_info row_pos;
  my_bool res;
  ha_checksum old_checksum;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_update_block_record2");
  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));

#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
  DBUG_DUMP("newrec", record, share->base.reclength);
#endif

  /*
    Checksums of new and old rows were computed by callers already; new
    row's was put into cur_row, old row's was put into new_row.
  */
  old_checksum= new_row->checksum;
  new_row->checksum= cur_row->checksum;
  calc_record_size(info, record, new_row);
  page= ma_recordpos_to_page(record_pos);

  /* Protect the bitmap from being flushed while we update (undone on err) */
  _ma_bitmap_flushable(info, 1);
  buff= pagecache_read(share->pagecache,
                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  /*
    Register the page as pinned even on read failure; it is unpinned by
    _ma_unpin_all_pages_and_finalize_row() on the error path.
  */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
  rownr= ma_recordpos_to_dir_entry(record_pos);
  dir= dir_entry_pos(buff, block_size, rownr);

  /*
    We can't use cur_row->head_length as the block may have been compacted
    since we read it.
  */
  head_length= uint2korr(dir + 2);

  /* Fast path: the whole new row fits on the original head page */
  if ((org_empty_size + head_length) >= new_row->total_length)
  {
    uint rec_offset, length;
    MARIA_BITMAP_BLOCK block;

    DBUG_PRINT("info", ("org_empty_size: %u  org_length: %u  new_length: %lu",
                        org_empty_size, head_length,
                        new_row->total_length));

    /*
      We can fit the new row in the same page as the original head part
      of the row
    */
    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
                                                          org_empty_size);
    if (extend_area_on_page(info, buff, dir, rownr,
                            new_row->total_length, &org_empty_size,
                            &rec_offset, &length, 1))
    {
      errpos= 1;
      goto err;
    }

    /* Describe to write_block_record() where on the page the row goes */
    row_pos.buff= buff;
    row_pos.rownr= rownr;
    row_pos.empty_space= org_empty_size;
    row_pos.dir= dir;
    row_pos.data= buff + rec_offset;
    row_pos.length= length;
    blocks->block= &block;
    blocks->count= 1;
    block.page= page;
    block.sub_blocks= 1;
    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
    block.empty_space= row_pos.empty_space;

    /* Remove the old row's tail parts and full (extent/blob) pages */
    if (*cur_row->tail_positions &&
        delete_tails(info, cur_row->tail_positions))
    {
      errpos= 2;
      goto err;
    }
    if (cur_row->extents_count && free_full_pages(info, cur_row))
    {
      errpos= 3;
      goto err;
    }
    res= write_block_record(info, oldrec, record, new_row, blocks,
                            1, &row_pos, undo_lsn, old_checksum);
    /* We can't update or delete this without re-reading it again */
    info->update&= ~HA_STATE_AKTIV;
    DBUG_RETURN(res);
  }
  /* Slow path: row must be relocated. Delete old row */
  if (*cur_row->tail_positions &&
      delete_tails(info, cur_row->tail_positions))
  {
    errpos= 4;
    goto err;
  }
  if (cur_row->extents_count && free_full_pages(info, cur_row))
  {
    errpos= 5;
    goto err;
  }

  /* Re-read current head length from the directory entry */
  head_length= uint2korr(dir + 2);
  if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
                                org_empty_size, blocks))
  {
    errpos= 6;
    goto err;
  }

  /*
    Allocate all size in block for record
    TODO:
    Need to improve this to do compact if we can fit one more blob into
    the head page
  */
  if ((head_length < new_row->space_on_head_page ||
       (new_row->total_length <= head_length &&
        org_empty_size + head_length >= new_row->total_length)))
  {
    /* Compact the page to make all free space contiguous */
    _ma_compact_block_page(share,
                           buff, rownr, 1,
                           info->trn->min_read_from,
                           share->base.min_block_length);
    org_empty_size= 0;
    head_length= uint2korr(dir + 2);
  }

  row_pos.buff= buff;
  row_pos.rownr= rownr;
  row_pos.empty_space= org_empty_size + head_length;
  row_pos.dir= dir;
  row_pos.data= buff + uint2korr(dir);
  row_pos.length= head_length;
  if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
                               &row_pos, undo_lsn, old_checksum)))
  {
    errpos= 7;
    goto err;
  }
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("errpos: %d", errpos));
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
3888 
3889 
3890 /*
3891   @brief Store new row on it's original position
3892 
3893   @note
  This is basically a copy of _ma_update_block_record2().
  When we have a purge thread for deleted rows, we can remove this function
  and use _ma_update_block_record2() instead.
3897 
3898   This is the main reason we don't make a lot of subfunctions that are
3899   common between _ma_update_block_record2() and this function.
3900 
3901   Note: If something goes wrong we mark the file crashed
3902 */
3903 
_ma_update_at_original_place(MARIA_HA * info,pgcache_page_no_t page,uint rownr,uint length_on_head_page,uint extent_count,const uchar * extent_info,const uchar * oldrec,const uchar * record,LSN undo_lsn)3904 static my_bool _ma_update_at_original_place(MARIA_HA *info,
3905                                             pgcache_page_no_t page,
3906                                             uint rownr,
3907                                             uint length_on_head_page,
3908                                             uint extent_count,
3909                                             const uchar *extent_info,
3910                                             const uchar *oldrec,
3911                                             const uchar *record,
3912                                             LSN undo_lsn)
3913 {
3914   MARIA_BITMAP_BLOCKS *blocks;
3915   MARIA_BITMAP_BLOCK *block;
3916   MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
3917   MARIA_PINNED_PAGE page_link;
3918   MARIA_SHARE *share= info->s;
3919   ha_checksum old_checksum;
3920   uint org_empty_size, empty_size;
3921   uint block_size= info->s->block_size;
3922   uchar *dir, *buff;
3923   struct st_row_pos_info row_pos;
3924   my_bool res;
3925   uint rec_offset, length;
3926   DBUG_ENTER("_ma_update_at_original_place");
3927 
3928 #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
3929   DBUG_DUMP("oldrec", oldrec, share->base.reclength);
3930   DBUG_DUMP("newrec", record, share->base.reclength);
3931 #endif
3932 
3933   /*
3934     Checksums of new and old rows were computed by callers already; new
3935     row's was put into cur_row, old row's was put into new_row.
3936   */
3937   old_checksum= new_row->checksum;
3938   new_row->checksum= cur_row->checksum;
3939   calc_record_size(info, record, new_row);
3940 
3941   _ma_bitmap_flushable(info, 1);
3942   buff= pagecache_read(share->pagecache,
3943                        &info->dfile, (pgcache_page_no_t) page, 0, 0,
3944                        share->page_type,
3945                        PAGECACHE_LOCK_WRITE, &page_link.link);
3946   page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3947   page_link.changed= buff != 0;
3948   push_dynamic(&info->pinned_pages, (void*) &page_link);
3949   if (!buff)
3950     goto err;
3951 
3952   org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
3953   dir= dir_entry_pos(buff, block_size, rownr);
3954 
3955   if ((org_empty_size + cur_row->head_length) < length_on_head_page)
3956   {
3957     DBUG_PRINT("error",
3958                ("org_empty_size: %u  head_length: %u  length_on_page: %u",
3959                 org_empty_size, (uint) cur_row->head_length,
3960                 length_on_head_page));
3961     _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3962     goto err;
3963   }
3964 
3965   /*
3966     We can fit the new row in the same page as the original head part
3967     of the row
3968   */
3969   empty_size= org_empty_size;
3970   if (extend_area_on_page(info, buff, dir, rownr,
3971                           length_on_head_page, &empty_size,
3972                           &rec_offset, &length, 1))
3973     goto err;
3974 
3975   row_pos.buff= buff;
3976   row_pos.rownr= rownr;
3977   row_pos.empty_space= empty_size;
3978   row_pos.dir= dir;
3979   row_pos.data= buff + rec_offset;
3980 
3981   /* Delete old row */
3982   if (*cur_row->tail_positions &&
3983       delete_tails(info, cur_row->tail_positions))
3984     goto err;
3985   if (cur_row->extents_count && free_full_pages(info, cur_row))
3986     goto err;
3987 
3988   /* Change extent information to be usable by write_block_record() */
3989   blocks= &cur_row->insert_blocks;
3990   if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
3991     goto err;
3992   block= blocks->block;
3993   block->empty_space= row_pos.empty_space;
3994   block->org_bitmap_value=
3995     _ma_free_size_to_head_pattern(&share->bitmap,
3996                                   (enough_free_entries_on_page(share, buff) ?
3997                                    org_empty_size : 0));
3998 
3999   DBUG_ASSERT(block->org_bitmap_value ==
4000               _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
4001   block->used|= BLOCKUSED_USE_ORG_BITMAP;
4002 
4003   /*
4004     We have to use <= below as the new_row may be smaller than the original
4005     row as the new row doesn't have transaction id
4006   */
4007 
4008   DBUG_ASSERT(blocks->count > 1 ||
4009               MY_MAX(new_row->total_length, share->base.min_block_length) <=
4010               length_on_head_page);
4011 
4012   /* Store same amount of data on head page as on original page */
4013   row_pos.length= (length_on_head_page -
4014                    (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
4015   set_if_bigger(row_pos.length, share->base.min_block_length);
4016   if ((res= write_block_record(info, oldrec, record, new_row, blocks,
4017                                1, &row_pos, undo_lsn, old_checksum)))
4018     goto err;
4019   DBUG_RETURN(0);
4020 
4021 err:
4022   DBUG_ASSERT(!maria_assert_if_crashed_table);
4023   _ma_mark_file_crashed(share);
4024   if (info->non_flushable_state)
4025     _ma_bitmap_flushable(info, -1);
4026   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
4027   DBUG_RETURN(1);
4028 }
4029 
4030 
4031 /* Wrapper for _ma_update_block_record2() used by ma_update() */
4032 
_ma_update_block_record(MARIA_HA * info,MARIA_RECORD_POS record_pos,const uchar * orig_rec,const uchar * new_rec)4033 my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
4034                                 const uchar *orig_rec, const uchar *new_rec)
4035 {
4036   return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
4037                                   LSN_ERROR);
4038 }
4039 
4040 
4041 /*
4042   Delete a directory entry
4043 
4044   SYNOPSIS
4045     delete_dir_entry()
4046     buff		Page buffer
4047     record_number	Record number to delete
4048     empty_space		Empty space on page after delete
4049 
4050   RETURN
4051     -1    Error on page
4052     0     ok
4053     1     Page is now empty
4054 */
4055 
static int delete_dir_entry(MARIA_SHARE *share,
                            uchar *buff, uint record_number,
                            uint *empty_space_res)
{
  uint block_size= share->block_size;
  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
  uint length, empty_space;
  uchar *dir;
  DBUG_ENTER("delete_dir_entry");
  DBUG_PRINT("enter", ("record_number: %u  number_of_records: %u",
                       record_number, number_of_records));

#ifdef SANITY_CHECKS
  /* Guard against a corrupted directory: entry must exist and fit on page */
  if (record_number >= number_of_records ||
      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
  {
    DBUG_PRINT("error", ("record_number: %u  number_of_records: %u",
                         record_number, number_of_records));

    DBUG_RETURN(-1);
  }
#endif

  check_directory(share, buff, block_size, 0, (uint) -1);
  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, record_number);
  length= uint2korr(dir + 2);  /* Length of entry we just deleted */
  /* Entry must be in use (offset != 0) and have a sane length */
  DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size);

  if (record_number == number_of_records - 1)
  {
    /* Delete this entry and all following free directory entries */
    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
    number_of_records--;
    dir+= DIR_ENTRY_SIZE;
    empty_space+= DIR_ENTRY_SIZE;

    /* Unlink and free the next empty ones */
    while (dir < end && dir[0] == 0 && dir[1] == 0)
    {
      /*
        Entry is free (offset bytes are 0).  Unlink it from the page's
        linked list of free entries, where dir[2] is the previous and
        dir[3] the next free entry, then reclaim its space.
      */
      number_of_records--;
      if (dir[2] == END_OF_DIR_FREE_LIST)
        buff[DIR_FREE_OFFSET]= dir[3];          /* Entry was list head */
      else
      {
        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
                    number_of_records);
        prev_entry[3]= dir[3];
      }
      if (dir[3] != END_OF_DIR_FREE_LIST)
      {
        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
                    number_of_records);
        next_entry[2]= dir[2];
      }
      dir+= DIR_ENTRY_SIZE;
      empty_space+= DIR_ENTRY_SIZE;
    }

    if (number_of_records == 0)
    {
      /* All entries on page deleted */
      DBUG_PRINT("info", ("Page marked as unallocated"));
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
      {
        /* Zero out the stale directory so pages compare equal after recovery */
        dir= dir_entry_pos(buff, block_size, record_number);
        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
      }
#endif
      *empty_space_res= block_size;
      DBUG_RETURN(1);
    }
    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
  }
  else
  {
    /*
      Deleted entry is not the last one on the page.  Mark it as free
      and push it on the head of the page's free-entry list.
    */
    dir[0]= dir[1]= 0;                  /* Offset 0 marks entry as free */
    dir[2]= END_OF_DIR_FREE_LIST;       /* New head has no previous entry */
    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= record_number;
    }
    buff[DIR_FREE_OFFSET]= record_number;
  }
  /* Add the deleted row's data length to the page's free space */
  empty_space+= length;

  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
  /* The page now has a hole; note that it may be compacted later */
  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;

  *empty_space_res= empty_space;

  check_directory(share, buff, block_size, 0, empty_space);
  DBUG_RETURN(0);
}
4159 
4160 
4161 /*
  Delete a head or tail part
4163 
4164   SYNOPSIS
4165     delete_head_or_tail()
4166     info                Maria handler
4167     page                Page (not file offset!) on which the row is
4168     head                1 if this is a head page
4169     from_update		1 if we are called from update. In this case we
4170 			leave the page as write locked as we may put
4171                         the new row into the old position.
4172 
4173   RETURN
4174     0  ok
4175     1  error
4176 */
4177 
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update)
{
  MARIA_SHARE *share= info->s;
  uint empty_space;
  int res;
  my_bool page_is_empty;
  uchar *buff;
  LSN lsn;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_at_write, lock_at_unpin;
  DBUG_ENTER("delete_head_or_tail");
  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
                       (ulong) ma_recordpos(page, record_number),
                       (ulong) page, record_number));

  /* Read the page under a write lock and register it as pinned */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    DBUG_RETURN(1);
  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              (head ? HEAD_PAGE : TAIL_PAGE));

  if (from_update)
  {
    /* Caller may put the new row into the old position; keep write lock */
    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
  }
  else
  {
    /* Plain delete: downgrade to a read lock once the page is modified */
    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
  }

  res= delete_dir_entry(share, buff, record_number, &empty_space);
  if (res < 0)
    DBUG_RETURN(1);                     /* Error in page directory */
  if (res == 0) /* after our deletion, page is still not empty */
  {
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    page_is_empty= 0;
    if (share->now_transactional)
    {
      /* Log REDO data */
      page_store(log_data + FILEID_STORE_SIZE, page);
      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                   record_number);

      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
                                       LOGREC_REDO_PURGE_ROW_TAIL),
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
  }
  else /* page is now empty */
  {
    page_is_empty= 1;
    if (share->now_transactional)
    {
      /* Log a REDO record saying the whole head/tail page was freed */
      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
      page_store(log_data + FILEID_STORE_SIZE, page);
      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
    /*
      Mark that this page must be written to disk by page cache, even
      if we could call pagecache_delete() on it.
      This is needed to ensure that repair finds the empty page on disk
      and not old data.
    */
    pagecache_set_write_on_delete_by_link(page_link.link);
    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
  }

  /* Downgrade (or keep) the page lock but leave the page pinned */
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           lock_at_write,
                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 1, FALSE);
  page_link.unlock= lock_at_unpin;
  /* Record the new unlock action in the pinned-pages list */
  set_dynamic(&info->pinned_pages, (void*) &page_link,
              info->pinned_pages.elements-1);

  DBUG_PRINT("info", ("empty_space: %u", empty_space));

  /*
    If there is not enough space for all possible tails, mark the
    page full
  */
  if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size,
                                                      1 + share->base.blobs))
    empty_space= 0;

  /* Finally reflect the page's new free space in the bitmap */
  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
}
4291 
4292 
4293 /*
4294   delete all tails
4295 
4296   SYNOPSIS
4297     delete_tails()
4298     info                Handler
4299     tails               Pointer to vector of tail positions, ending with 0
4300 
4301   RETURN
4302     0  ok
4303     1  error
4304 */
4305 
delete_tails(MARIA_HA * info,MARIA_RECORD_POS * tails)4306 static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
4307 {
4308   my_bool res= 0;
4309   DBUG_ENTER("delete_tails");
4310   for (; *tails; tails++)
4311   {
4312     if (delete_head_or_tail(info,
4313                             ma_recordpos_to_page(*tails),
4314                             ma_recordpos_to_dir_entry(*tails), 0, 1))
4315       res= 1;
4316   }
4317   DBUG_RETURN(res);
4318 }
4319 
4320 
4321 /*
4322   Delete a record
4323 
4324   NOTES
4325    For the moment, we assume that info->cur_row.extents is always updated
4326    when a row is read. In the future we may decide to read this on demand
4327    for rows with many splits.
4328 */
4329 
my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
{
  pgcache_page_no_t page;
  uint record_number;
  MARIA_SHARE *share= info->s;
  LSN lsn= LSN_IMPOSSIBLE;
  DBUG_ENTER("_ma_delete_block_record");

  /* Split the current rowid into head page number and directory entry */
  page=          ma_recordpos_to_page(info->cur_row.lastpos);
  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

  /* Mark bitmap non-flushable while data pages are being changed */
  _ma_bitmap_flushable(info, 1);
  /* Remove the head part, then all tail parts of the row */
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  /* Free any full pages used by the row's extents */
  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  if (share->now_transactional)
  {
    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                   HA_CHECKSUM_STORE_SIZE];
    uchar *log_pos;
    size_t row_length;
    uint row_parts_count, extents_length;
    ha_checksum checksum_delta;

    /* Write UNDO record */
    lsn_store(log_data, info->trn->undo_lsn);
    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
    dirpos_store(log_pos, record_number);
    log_pos+= DIRPOS_STORE_SIZE;
    /* Length of row data on the head page, excluding the row header */
    int2store(log_pos, info->cur_row.head_length -
              info->cur_row.header_length);
    log_pos+= 2;
    pagerange_store(log_pos, info->cur_row.extents_count);
    log_pos+= PAGERANGE_STORE_SIZE;

    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
    /* Negative delta: the deleted row's checksum leaves the table */
    store_checksum_in_rec(share, checksum_delta,
                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                              0].length);
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
      info->cur_row.extents;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

    /* Add the row's data parts to the UNDO record */
    row_length= fill_insert_undo_parts(info, record,
                                       (info->log_row_parts +
                                        TRANSLOG_INTERNAL_PARTS + 2),
                                       &row_parts_count);

    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
                              info,
                              (translog_size_t)
                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                                   0].length + row_length +
                               extents_length),
                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
                              info->log_row_parts,
                              log_data + LSN_STORE_SIZE,
                              &checksum_delta))
      goto err;
  }

  /* Success: allow the bitmap to be flushed again and release pins */
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4413 
4414 
4415 /****************************************************************************
4416   Reading of records
4417 ****************************************************************************/
4418 
4419 /*
4420   Read position to record from record directory at end of page
4421 
4422   SYNOPSIS
4423    get_record_position()
4424    buff                 page buffer
4425    block_size           block size for page
4426    record_number        Record number in index
4427    end_of_data          pointer to end of data for record
4428 
4429   RETURN
4430     0  Error in data
4431     #  Pointer to start of record.
4432        In this case *end_of_data is set.
4433 */
4434 
get_record_position(MARIA_SHARE * share,uchar * buff,uint record_number,uchar ** end_of_data)4435 static uchar *get_record_position(MARIA_SHARE *share, uchar *buff,
4436                                  uint record_number, uchar **end_of_data)
4437 {
4438   uint block_size= share->block_size;
4439   uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
4440   uchar *dir;
4441   uchar *data;
4442   uint offset, length;
4443 
4444 #ifdef SANITY_CHECKS
4445   if (record_number >= number_of_records ||
4446       record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE)
4447                        / DIR_ENTRY_SIZE))
4448   {
4449     DBUG_PRINT("error",
4450                ("Wrong row number: record_number: %u  number_of_records: %u",
4451                 record_number, number_of_records));
4452     return 0;
4453   }
4454 #endif
4455 
4456   dir= dir_entry_pos(buff, block_size, record_number);
4457   offset= uint2korr(dir);
4458   length= uint2korr(dir + 2);
4459 #ifdef SANITY_CHECKS
4460   if (offset < PAGE_HEADER_SIZE(share) ||
4461       offset + length > (block_size -
4462                          number_of_records * DIR_ENTRY_SIZE -
4463                          PAGE_SUFFIX_SIZE))
4464   {
4465     DBUG_PRINT("error",
4466                ("Wrong row position:  record_number: %u  offset: %u  "
4467                 "length: %u  number_of_records: %u",
4468                 record_number, offset, length, number_of_records));
4469     return 0;
4470   }
4471 #endif
4472   data= buff + offset;
4473   *end_of_data= data + length;
4474   return data;
4475 }
4476 
4477 
4478 /*
4479   Init extent
4480 
4481   NOTES
4482     extent is a cursor over which pages to read
4483 */
4484 
static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
                        uint extents, MARIA_RECORD_POS *tail_positions)
{
  /* Position the cursor on the first extent of the row */
  uint count_field;
  extent->extent=       extent_info;
  extent->extent_count= extents;
  extent->page=         page_korr(extent_info);
  count_field= (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
                ~START_EXTENT_BIT);
  extent->tail= count_field & TAIL_BIT;
  if (!extent->tail)
    extent->page_count= count_field;
  else
  {
    /* Tail extent: a single page; remaining bits hold the row number */
    extent->page_count=  1;
    extent->tail_row_nr= count_field & ~TAIL_BIT;
  }
  extent->tail_positions=      tail_positions;
  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
}
4505 
4506 
4507 /*
4508   Read next extent
4509 
4510   SYNOPSIS
4511     read_next_extent()
4512     info                Maria handler
4513     extent              Pointer to current extent (this is updated to point
4514                         to next)
4515     end_of_data         Pointer to end of data in read block (out)
4516 
4517   NOTES
4518     New block is read into info->buff
4519 
4520   RETURN
4521     0   Error;  my_errno is set
4522     #   Pointer to start of data in read block
4523         In this case end_of_data is updated to point to end of data.
4524 */
4525 
static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                              uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent");

  if (!extent->page_count)
  {
    /* Current extent is exhausted; advance to the next extent descriptor */
    uint page_count;
    if (!--extent->extent_count)
      goto crashed;                     /* Row claims no more extents */
    extent->extent+=    ROW_EXTENT_SIZE;
    extent->page=       page_korr(extent->extent);
    page_count=         (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                         ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;                     /* Extent with zero pages is invalid */
    extent->tail=       page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info",("New extent.  Page: %lu  page_count: %u  tail_flag: %d",
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;  /* May be a write lock during UNDO */

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);
  }

  if (!extent->tail)
  {
    /* Full data page */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                             /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;            /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Remember the tail's rowid so later delete/update can locate it */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;                   /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                        /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  /* Extent information is inconsistent: mark table as crashed */
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error", ("wrong extent information"));
  DBUG_RETURN(0);
}
4615 
4616 
4617 /*
4618   Read data that may be split over many blocks
4619 
4620   SYNOPSIS
4621     read_long_data()
4622     info                Maria handler
4623     to                  Store result string here (this is allocated)
4624     extent              Pointer to current extent position
4625     data                Current position in buffer
4626     end_of_data         End of data in buffer
4627 
4628   NOTES
4629     When we have to read a new buffer, it's read into info->buff
4630 
4631     This loop is implemented by goto's instead of a for() loop as
    the code is notably smaller and faster this way (and it's not nice
4633     to jump into a for loop() or into a 'then' clause)
4634 
4635   RETURN
4636     0   ok
4637     1   error
4638 */
4639 
read_long_data2(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4640 static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
4641                               MARIA_EXTENT_CURSOR *extent,
4642                               uchar **data, uchar **end_of_data)
4643 {
4644   uint left_length= (uint) (*end_of_data - *data);
4645   DBUG_ENTER("read_long_data2");
4646   DBUG_PRINT("enter", ("length: %lu  left_length: %u",
4647                        length, left_length));
4648   DBUG_ASSERT(*data <= *end_of_data);
4649 
4650   /*
4651     Fields are never split in middle. This means that if length > rest-of-data
4652     we should start reading from the next extent.  The reason we may have
4653     data left on the page is that if the fixed part of the row was less than
4654     min_block_length the head block was extended to min_block_length.
4655 
4656     This may change in the future, which is why we have the loop written
4657     the way it's written.
4658   */
4659   if (extent->first_extent && length > left_length)
4660   {
4661     *end_of_data= *data;
4662     left_length= 0;
4663   }
4664 
4665   for(;;)
4666   {
4667     if (unlikely(left_length >= length))
4668     {
4669       memcpy(to, *data, length);
4670       (*data)+= length;
4671       DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
4672       DBUG_RETURN(0);
4673     }
4674     memcpy(to, *data, left_length);
4675     to+= left_length;
4676     length-= left_length;
4677     if (!(*data= read_next_extent(info, extent, end_of_data)))
4678       break;
4679     left_length= (uint) (*end_of_data - *data);
4680   }
4681   DBUG_RETURN(1);
4682 }
4683 
read_long_data(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4684 static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
4685                               MARIA_EXTENT_CURSOR *extent,
4686                               uchar **data, uchar **end_of_data)
4687 {
4688   uint left_length= (uint) (*end_of_data - *data);
4689   if (likely(left_length >= length))
4690   {
4691     memcpy(to, *data, length);
4692     (*data)+= length;
4693     return 0;
4694   }
4695   return read_long_data2(info, to, length, extent, data, end_of_data);
4696 }
4697 
4698 
4699 /*
4700   Read a record from page (helper function for _ma_read_block_record())
4701 
4702   SYNOPSIS
4703     _ma_read_block_record2()
4704     info                Maria handler
4705     record              Store record here
4706     data                Start of head data for row
4707     end_of_data         End of data for row
4708 
4709   NOTES
4710     The head page is already read by caller
4711     Following data is update in info->cur_row:
4712 
4713     cur_row.head_length is set to size of entry in head block
4714     cur_row.tail_positions is set to point to all tail blocks
4715     cur_row.extents points to extents data
4716     cur_row.extents_counts contains number of extents
4717     cur_row.empty_bits is set to empty bits
4718     cur_row.field_lengths contains packed length of all fields
4719     cur_row.blob_length contains total length of all blobs
4720     cur_row.checksum contains checksum of read record.
4721 
4722    RETURN
4723      0  ok
4724      #  Error code
4725 */
4726 
_ma_read_block_record2(MARIA_HA * info,uchar * record,uchar * data,uchar * end_of_data)4727 int _ma_read_block_record2(MARIA_HA *info, uchar *record,
4728                            uchar *data, uchar *end_of_data)
4729 {
4730   MARIA_SHARE *share= info->s;
4731   uchar *UNINIT_VAR(field_length_data), *UNINIT_VAR(blob_buffer), *start_of_data;
4732   uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
4733   my_bool found_blob= 0;
4734   MARIA_EXTENT_CURSOR extent;
4735   MARIA_COLUMNDEF *column, *end_column;
4736   MARIA_ROW *cur_row= &info->cur_row;
4737   myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
4738   DBUG_ENTER("_ma_read_block_record2");
4739 
4740   start_of_data= data;
4741   flag= (uint) (uchar) data[0];
4742   cur_null_bytes= share->base.original_null_bytes;
4743   null_bytes=     share->base.null_bytes;
4744   cur_row->head_length= (uint) (end_of_data - data);
4745   cur_row->full_page_count= cur_row->tail_count= 0;
4746   cur_row->blob_length= 0;
4747   /* Number of bytes in header that we don't need to write during undo */
4748   cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
4749 
4750   if (flag & ROW_FLAG_TRANSID)
4751   {
4752     cur_row->trid= transid_korr(data+1);
4753     if (!info->trn)
4754     {
4755       /* File crashed */
4756       DBUG_ASSERT(!maria_assert_if_crashed_table);
4757       _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
4758       DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
4759     }
4760     if (!trnman_can_read_from(info->trn, cur_row->trid))
4761       DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
4762   }
4763 
  /* Skip trans header (for now, until we have MVCC support) */
4765   data+= cur_row->header_length + 1 ;
4766   if (flag & ROW_FLAG_NULLS_EXTENDED)
4767     cur_null_bytes+= data[-1];
4768 
4769   row_extents= 0;
4770   if (flag & ROW_FLAG_EXTENTS)
4771   {
4772     uint row_extent_size;
4773     /*
4774       Record is split over many data pages.
4775       Get number of extents and first extent
4776     */
4777     get_key_length(row_extents, data);
4778     cur_row->extents_count= row_extents;
4779     row_extent_size= row_extents * ROW_EXTENT_SIZE;
4780     if (cur_row->extents_buffer_length < row_extent_size &&
4781         _ma_alloc_buffer(&cur_row->extents,
4782                          &cur_row->extents_buffer_length,
4783                          row_extent_size, myflag))
4784       DBUG_RETURN(my_errno);
4785     memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
4786     data+= ROW_EXTENT_SIZE;
4787     init_extent(&extent, cur_row->extents, row_extents,
4788                 cur_row->tail_positions);
4789   }
4790   else
4791   {
4792     cur_row->extents_count= 0;
4793     (*cur_row->tail_positions)= 0;
4794     extent.page_count= 0;
4795     extent.extent_count= 1;
4796   }
4797   extent.first_extent= 1;
4798 
4799   field_lengths= 0;
4800   if (share->base.max_field_lengths)
4801   {
4802     get_key_length(field_lengths, data);
4803     cur_row->field_lengths_length= field_lengths;
4804 #ifdef SANITY_CHECKS
4805     if (field_lengths > share->base.max_field_lengths)
4806       goto err;
4807 #endif
4808   }
4809 
4810   if (share->calc_checksum)
4811     cur_row->checksum= (uint) (uchar) *data++;
4812   /* data now points on null bits */
4813   memcpy(record, data, cur_null_bytes);
4814   if (unlikely(cur_null_bytes != null_bytes))
4815   {
4816     /*
4817       This only happens if we have added more NULL columns with
4818       ALTER TABLE and are fetching an old, not yet modified old row
4819     */
4820     bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
4821   }
4822   data+= null_bytes;
4823   /* We copy the empty bits to be able to use them for delete/update */
4824   memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
4825   data+= share->base.pack_bytes;
4826 
4827   /* TODO: Use field offsets, instead of just skipping them */
4828   data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4829 
4830   /*
4831     Read row extents (note that first extent was already read into
4832     cur_row->extents above)
4833   */
4834   if (row_extents > 1)
4835   {
4836     if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
4837                        (row_extents - 1) * ROW_EXTENT_SIZE,
4838                        &extent, &data, &end_of_data))
4839       DBUG_RETURN(my_errno);
4840   }
4841 
4842   /*
4843     Data now points to start of fixed length field data that can't be null
4844     or 'empty'. Note that these fields can't be split over blocks.
4845   */
4846   for (column= share->columndef,
4847          end_column= column + share->base.fixed_not_null_fields;
4848        column < end_column; column++)
4849   {
4850     uint column_length= column->length;
4851     if (data + column_length > end_of_data &&
4852         !(data= read_next_extent(info, &extent, &end_of_data)))
4853       goto err;
4854     memcpy(record + column->offset, data, column_length);
4855     data+= column_length;
4856   }
4857 
4858   /* Read array of field lengths. This may be stored in several extents */
4859   if (field_lengths)
4860   {
4861     field_length_data= cur_row->field_lengths;
4862     if (read_long_data(info, field_length_data, field_lengths, &extent,
4863                        &data, &end_of_data))
4864       DBUG_RETURN(my_errno);
4865   }
4866 
4867   /* Read variable length data. Each of these may be split over many extents */
4868   for (end_column= share->columndef + share->base.fields;
4869        column < end_column; column++)
4870   {
4871     enum en_fieldtype type= column->type;
4872     uchar *field_pos= record + column->offset;
4873     /* First check if field is present in record */
4874     if ((record[column->null_pos] & column->null_bit) ||
4875         (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
4876     {
4877       bfill(record + column->offset, column->fill_length,
4878             type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
4879       continue;
4880     }
4881     switch (type) {
4882     case FIELD_NORMAL:                          /* Fixed length field */
4883     case FIELD_SKIP_PRESPACE:
4884     case FIELD_SKIP_ZERO:                       /* Fixed length field */
4885       if (data + column->length > end_of_data &&
4886           !(data= read_next_extent(info, &extent, &end_of_data)))
4887         goto err;
4888       memcpy(field_pos, data, column->length);
4889       data+= column->length;
4890       break;
4891     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
4892     {
4893       /* Char that is space filled */
4894       uint length;
4895       if (column->length <= 255)
4896         length= (uint) (uchar) *field_length_data++;
4897       else
4898       {
4899         length= uint2korr(field_length_data);
4900         field_length_data+= 2;
4901       }
4902 #ifdef SANITY_CHECKS
4903       if (length > column->length)
4904         goto err;
4905 #endif
4906       if (read_long_data(info, field_pos, length, &extent, &data,
4907                          &end_of_data))
4908         DBUG_RETURN(my_errno);
4909       bfill(field_pos + length, column->length - length, ' ');
4910       break;
4911     }
4912     case FIELD_VARCHAR:
4913     {
4914       ulong length;
4915       if (column->length <= 256)
4916       {
4917         length= (uint) (uchar) (*field_pos++= *field_length_data++);
4918       }
4919       else
4920       {
4921         length= uint2korr(field_length_data);
4922         field_pos[0]= field_length_data[0];
4923         field_pos[1]= field_length_data[1];
4924         field_pos+= 2;
4925         field_length_data+= 2;
4926       }
4927 #ifdef SANITY_CHECKS
4928       if (length > column->length)
4929         goto err;
4930 #endif
4931       if (read_long_data(info, field_pos, length, &extent, &data,
4932                          &end_of_data))
4933         DBUG_RETURN(my_errno);
4934       break;
4935     }
4936     case FIELD_BLOB:
4937     {
4938       uint column_size_length= column->length - portable_sizeof_char_ptr;
4939       ulong blob_length= _ma_calc_blob_length(column_size_length,
4940                                               field_length_data);
4941 
4942       if (!found_blob)
4943       {
4944         /* Calculate total length for all blobs */
4945         ulong blob_lengths= 0;
4946         uchar *length_data= field_length_data;
4947         MARIA_COLUMNDEF *blob_field= column;
4948 
4949         found_blob= 1;
4950         for (; blob_field < end_column; blob_field++)
4951         {
4952           uint size_length;
4953           if ((record[blob_field->null_pos] & blob_field->null_bit) ||
4954               (cur_row->empty_bits[blob_field->empty_pos] &
4955                blob_field->empty_bit))
4956             continue;
4957           size_length= blob_field->length - portable_sizeof_char_ptr;
4958           blob_lengths+= _ma_calc_blob_length(size_length, length_data);
4959           length_data+= size_length;
4960         }
4961         cur_row->blob_length= blob_lengths;
4962         DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
4963         if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
4964                              blob_lengths, myflag))
4965           DBUG_RETURN(my_errno);
4966         blob_buffer= info->blob_buff;
4967       }
4968 
4969       memcpy(field_pos, field_length_data, column_size_length);
4970       memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
4971              sizeof(char*));
4972       field_length_data+= column_size_length;
4973 
4974       /*
4975         After we have read one extent, then each blob is in it's own extent
4976       */
4977       if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
4978         end_of_data= data;                      /* Force read of next extent */
4979 
4980       if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
4981                          &end_of_data))
4982         DBUG_RETURN(my_errno);
4983       blob_buffer+= blob_length;
4984       break;
4985     }
4986     default:
4987 #ifdef EXTRA_DEBUG
4988       DBUG_ASSERT(0);                           /* purecov: deadcode */
4989 #endif
4990       goto err;
4991     }
4992     continue;
4993   }
4994 
4995   if (row_extents)
4996   {
4997     DBUG_PRINT("info", ("Row read:  page_count: %u  extent_count: %u",
4998                         extent.page_count, extent.extent_count));
4999     *extent.tail_positions= 0;                  /* End marker */
5000     if (extent.page_count)
5001       goto err;
5002     if (extent.extent_count > 1)
5003     {
5004       if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
5005                             (extent.extent_count-1) * ROW_EXTENT_SIZE))
5006       {
5007         DBUG_PRINT("error", ("Data in extent is not zero"));
5008         DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
5009                   (extent.extent_count-1) * ROW_EXTENT_SIZE);
5010         goto err;
5011       }
5012     }
5013   }
5014   else
5015   {
5016     DBUG_PRINT("info", ("Row read"));
5017     /*
5018       data should normally point to end_of_date. The only exception is if
5019       the row is very short in which case we allocated 'min_block_length' data
5020       for allowing the row to expand.
5021     */
5022     if (data != end_of_data && (uint) (end_of_data - start_of_data) >
5023         share->base.min_block_length)
5024       goto err;
5025   }
5026 #ifdef EXTRA_DEBUG
5027   if (share->calc_checksum && !info->in_check_table)
5028   {
5029     /* Esnure that row checksum is correct */
5030     DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
5031                 cur_row->checksum);
5032   }
5033 #endif
5034   info->update|= HA_STATE_AKTIV;	/* We have an active record */
5035   DBUG_RETURN(0);
5036 
5037 err:
5038   DBUG_ASSERT(!maria_assert_if_crashed_table);
5039   /* Something was wrong with data on record */
5040   DBUG_PRINT("error", ("Found record with wrong data"));
5041   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
5042   DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
5043 }
5044 
5045 
5046 /** @brief Read positions to tail blocks and full blocks
5047 
5048   @fn    read_row_extent_info()
5049   @param info	Handler
5050 
5051   @notes
5052     This function is a simpler version of _ma_read_block_record2()
5053     The data about the used pages is stored in info->cur_row.
5054 
5055   @return Status
5056   @retval 0   ok
5057   @retval 1   Error. my_errno contains error number
5058 */
5059 
static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  myf myflag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
  DBUG_ENTER("read_row_extent_info");

  /* Locate the row inside the head page buffer via the page directory */
  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    /* Grow the cached extent buffer if it's too small for this row */
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size, myflag))
      DBUG_RETURN(1);
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths looks unused but get_key_length will
    advance data, which is required as data is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, pack (empty) bits and field offsets */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    pgcache_page_no_t page=  uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    /* Only extents with TAIL_BIT set point at tail pages; others skipped */
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                               /* End marker */
  DBUG_RETURN(0);
}
5146 
5147 
5148 /*
5149   Read a record based on record position
5150 
5151   @fn     _ma_read_block_record()
5152   @param info                Maria handler
5153   @param record              Store record here
5154   @param record_pos          Record position
5155 
5156   @return Status
5157   @retval 0  ok
5158   @retval #  Error number
5159 */
5160 
int _ma_read_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page= ma_recordpos_to_page(record_pos);
  uint dir_entry= ma_recordpos_to_dir_entry(record_pos);
  uchar *page_buff, *row_data, *row_end;
  DBUG_ENTER("_ma_read_block_record");
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) record_pos,
                       (ulong) ma_recordpos_to_page(record_pos),
                       ma_recordpos_to_dir_entry(record_pos)));

  /* Fetch the head page holding the row */
  page_buff= pagecache_read(share->pagecache,
                            &info->dfile, page, 0,
                            info->buff, share->page_type,
                            PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
  if (!page_buff)
    DBUG_RETURN(my_errno);

  /*
    Unallocated page access can happen if this is an access to a page where
    all rows where deleted as part of this statement.
  */
  DBUG_ASSERT((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE ||
              (page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              UNALLOCATED_PAGE);

  if ((page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == UNALLOCATED_PAGE ||
      !(row_data= get_record_position(share, page_buff, dir_entry,
                                      &row_end)))
  {
    DBUG_ASSERT(!maria_assert_if_crashed_table);
    DBUG_PRINT("warning", ("Wrong directory entry in data block"));
    my_errno= HA_ERR_RECORD_DELETED;           /* File crashed */
    DBUG_RETURN(HA_ERR_RECORD_DELETED);
  }
  /* Unpack the row (and any extents it references) into 'record' */
  DBUG_RETURN(_ma_read_block_record2(info, record, row_data, row_end));
}
5200 
5201 
5202 /* compare unique constraint between stored rows */
5203 
my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
                             const uchar *record, MARIA_RECORD_POS pos)
{
  uchar *org_rec_buff, *old_record;
  size_t org_rec_buff_size;
  int error;
  my_bool buff_alloced;
  DBUG_ENTER("_ma_cmp_block_unique");

  /* Buffer for the stored row; placed on the stack when small enough */
  alloc_on_stack(*info->stack_end_ptr, old_record, buff_alloced,
                 info->s->base.reclength);
  if (!old_record)
    DBUG_RETURN(1);

  /* Don't let the compare destroy blobs that may be in use */
  org_rec_buff=      info->rec_buff;
  org_rec_buff_size= info->rec_buff_size;
  if (info->s->base.blobs)
  {
    /* Force realloc of record buffer*/
    info->rec_buff= 0;
    info->rec_buff_size= 0;
  }
  error= _ma_read_block_record(info, old_record, pos);
  if (!error)
    error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
  if (info->s->base.blobs)
  {
    /* Free the temporary blob buffer and restore the saved original */
    my_free(info->rec_buff);
    info->rec_buff=      org_rec_buff;
    info->rec_buff_size= org_rec_buff_size;
  }
  DBUG_PRINT("exit", ("result: %d", error));
  stack_alloc_free(old_record, buff_alloced);
  /* TRUE on read error or when the rows differ */
  DBUG_RETURN(error != 0);
}
5240 
5241 
5242 /****************************************************************************
5243   Table scan
5244 ****************************************************************************/
5245 
5246 /*
5247   Allocate buffers for table scan
5248 
5249   SYNOPSIS
5250    _ma_scan_init_block_record(MARIA_HA *info)
5251 
5252   IMPLEMENTATION
5253     We allocate one buffer for the current bitmap and one buffer for the
5254     current page
5255 
5256   RETURN
5257     0  ok
5258     1  error (couldn't allocate memory or disk error)
5259 */
5260 
_ma_scan_init_block_record(MARIA_HA * info)5261 my_bool _ma_scan_init_block_record(MARIA_HA *info)
5262 {
5263   MARIA_SHARE *share= info->s;
5264   myf flag= MY_WME | (share->temporary ? MY_THREAD_SPECIFIC : 0);
5265   DBUG_ENTER("_ma_scan_init_block_record");
5266   DBUG_ASSERT(info->dfile.file == share->bitmap.file.file);
5267 
5268   /*
5269     bitmap_buff may already be allocated if this is the second call to
5270     rnd_init() without a rnd_end() in between, see sql/handler.h
5271   */
5272   if (!(info->scan.bitmap_buff ||
5273         ((info->scan.bitmap_buff=
5274           (uchar *) my_malloc(PSI_INSTRUMENT_ME, share->block_size * 2,
5275                               flag)))))
5276     DBUG_RETURN(1);
5277   info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
5278   info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size;
5279 
5280   /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
5281   info->scan.number_of_rows= 0;
5282   info->scan.bitmap_pos= info->scan.bitmap_end;
5283   info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
5284   info->scan.max_page= share->state.state.data_file_length / share->block_size;
5285   /*
5286     We need to flush what's in memory (bitmap.map) to page cache otherwise, as
5287     we are going to read bitmaps from page cache in table scan (see
5288     _ma_scan_block_record()), we may miss recently inserted rows (bitmap page
5289     in page cache would be too old).
5290   */
5291   DBUG_RETURN(_ma_bitmap_flush(info->s));
5292 }
5293 
5294 
5295 /* Free buffers allocated by _ma_scan_block_init() */
5296 
void _ma_scan_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_end_block_record");
  /* page_buff lives inside the same allocation as bitmap_buff */
  my_free(info->scan.bitmap_buff);
  info->scan.bitmap_buff= 0;
  /* my_free() accepts NULL, so no guard is needed */
  my_free(info->scan_save);
  info->scan_save= 0;
  DBUG_VOID_RETURN;
}
5309 
5310 
5311 /**
5312   @brief Save current scan position
5313 
5314   @note
5315   For the moment we can only remember one position, but this is
5316   good enough for MySQL usage
5317 
5318   @return
5319   @retval 0			  ok
5320   @retval HA_ERR_WRONG_IN_RECORD  Could not allocate memory to hold position
5321 */
5322 
int _ma_scan_remember_block_record(MARIA_HA *info,
                                   MARIA_RECORD_POS *lastpos)
{
  uchar *bitmap_buff;
  DBUG_ENTER("_ma_scan_remember_block_record");
  if (!(info->scan_save))
  {
    /* One allocation holds the save struct plus copies of both buffers */
    if (!(info->scan_save= my_malloc(PSI_INSTRUMENT_ME,
                                     ALIGN_SIZE(sizeof(*info->scan_save)) +
                                     info->s->block_size * 2,
                                     MYF(MY_WME))))
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
                                   ALIGN_SIZE(sizeof(*info->scan_save)));
  }
  /* For checking if pages have changed since we last read it */
  info->scan.row_changes= info->row_changes;

  /* Remember used bitmap and used head page */
  bitmap_buff= info->scan_save->bitmap_buff;
  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
  /* The struct copy clobbered the saved buffer pointer; put it back */
  info->scan_save->bitmap_buff= bitmap_buff;
  memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2);

  /* Point to the last read row */
  *lastpos= info->cur_row.nextpos - 1;
  /*
    scan.dir was already moved past the last read row's directory entry
    (see _ma_scan_block_record()); step back so restore re-reads it
  */
  info->scan_save->dir+= DIR_ENTRY_SIZE;
  DBUG_RETURN(0);
}
5352 
5353 
5354 /**
   @brief restore scan block to its original values
5356 
5357    @return
5358    0 ok
5359    # error
5360 
5361    @note
5362    In theory we could swap bitmap buffers instead of copy them.
5363    For the moment we don't do that because there are variables pointing
5364    inside the buffers and it's a bit of hassle to either make them relative
5365    or repoint them.
5366 
5367    If the data file has changed, we will re-read the new block record
5368    to ensure that when we continue scanning we can ignore any deleted rows.
5369 */
5370 
int _ma_scan_restore_block_record(MARIA_HA *info,
                                  MARIA_RECORD_POS lastpos)
{
  uchar *bitmap_buff;
  DBUG_ENTER("_ma_scan_restore_block_record");

  info->cur_row.nextpos= lastpos;
  /* Keep our own buffer; the struct copy below would overwrite the pointer */
  bitmap_buff= info->scan.bitmap_buff;
  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
  info->scan.bitmap_buff= bitmap_buff;
  memcpy(bitmap_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);

  if (info->scan.row_changes != info->row_changes)
  {
    /*
      Table has been changed. We have to re-read the current page block as
      data may have changed on it that we have to see.
    */
    if (!(pagecache_read(info->s->pagecache,
                         &info->dfile,
                         ma_recordpos_to_page(info->scan.row_base_page),
                         0, info->scan.page_buff,
                         info->s->page_type,
                         PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
      DBUG_RETURN(my_errno);
    /* Recompute row count and directory end from the fresh page image */
    info->scan.number_of_rows=
      (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET];
    info->scan.dir_end= (info->scan.page_buff + info->s->block_size -
                         PAGE_SUFFIX_SIZE -
                         info->scan.number_of_rows * DIR_ENTRY_SIZE);
  }
  DBUG_RETURN(0);
}
5404 
5405 
5406 /*
5407   Read next record while scanning table
5408 
5409   SYNOPSIS
5410     _ma_scan_block_record()
5411     info                Maria handler
5412     record              Store found here
5413     record_pos          Value stored in info->cur_row.next_pos after last call
5414                         This is offset inside the current pagebuff
5415     skip_deleted
5416 
5417   NOTES
5418     - One must have called mi_scan() before this
    - In this version, we don't actually need record_pos; we could as easily
      use a variable in info->scan
5421 
5422   IMPLEMENTATION
5423     Current code uses a lot of goto's to separate the different kind of
5424     states we may be in. This gives us a minimum of executed if's for
5425     the normal cases.  I tried several different ways to code this, but
5426     the current one was in the end the most readable and fastest.
5427 
5428   RETURN
5429     0   ok
5430     #   Error code  (Normally HA_ERR_END_OF_FILE)
5431 */
5432 
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record");

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /* Ensure that scan.dir and record_pos are in sync */
    DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff,
                                                share->block_size,
                                                (uint) record_pos));

    /* Search for a valid directory entry (not 0) */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      /* Deleted slot; directory grows downwards, so step down one entry */
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        goto err;
      }
#endif
    }
    /*
      This should always be true as the directory should always start with
      a valid entry.
    */
    DBUG_ASSERT(info->scan.dir >= info->scan.dir_end);

    /* found row */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to next row to process */
#ifdef SANITY_CHECKS
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE(share) ||
        length < share->base.min_block_length)
    {
      /* Separate asserts so a debug build shows which check failed */
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share)));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* Rows not visible to this transaction are skipped, not returned */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data=    info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos=  info->scan.bit_pos;

    do
    {
      while (likely(bits))
      {
        /* Each page is described by a 3-bit pattern in the bitmap */
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /* Each 6-byte bitmap word (48 bits) covers 16 pages */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info", ("Found end of file"));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning",
                       ("Found page of type %d when expecting head page",
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            continue;
          }
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error", ("Wrong page header"));
            _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
            DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
          }
          DBUG_PRINT("info", ("Page %lu has %u rows",
                              (ulong) page, info->scan.number_of_rows));
          /* Directory sits at the end of the page and grows downwards */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      /* Current word exhausted; advance to the next interesting word */
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != 07777777777777777LL)
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  if (unlikely(info->scan.bitmap_page >= info->scan.max_page))
  {
    DBUG_PRINT("info", ("Found end of file"));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info", ("Reading bitmap at %lu",
                      (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /* Skip scanning 'bits' in bitmap scan code */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("Wrong data on page"));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5603 
5604 
5605 /*
5606   Compare a row against a stored one
5607 
5608   NOTES
5609     Not implemented, as block record is not supposed to be used in a shared
5610     global environment
5611 */
5612 
my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
                                 const uchar *record __attribute__ ((unused)))
{
  return 0;                     /* Intentional no-op; rows always "match" */
}
5618 
5619 
5620 /*
5621   Store an integer with simple packing
5622 
5623   SYNOPSIS
    ma_store_length()
5625     to                  Store the packed integer here
5626     nr                  Integer to store
5627 
5628   NOTES
5629     This is mostly used to store field numbers and lengths of strings.
    We have to cast the result for the LL() because of a bug in Forte CC
5631     compiler.
5632 
5633     Packing used is:
5634     nr < 251 is stored as is (in 1 byte)
5635     Numbers that require 1-4 bytes are stored as char(250+byte_length), data
5636     Bigger numbers are stored as 255, data as ulonglong (not yet done).
5637 
5638   RETURN
5639     Position in 'to' after the packed length
5640 */
5641 
uchar *ma_store_length(uchar *to, ulong nr)
{
  /*
    Pack 'nr' at 'to' and return the position just after the packed data.
    Encoding: values < 251 are stored as one raw byte; larger values get a
    marker byte (250 + length of the following integer) followed by the
    value in 1-4 bytes.  ma_get_length() is the matching decoder.
  */
  if (nr < 251)
  {
    to[0]= (uchar) nr;
    return to + 1;
  }
  if (nr <= 255)
  {
    to[0]= (uchar) 251;                         /* Marker: 1 byte follows */
    to[1]= (uchar) nr;
    return to + 2;
  }
  if (nr < 65536)
  {
    to[0]= (uchar) 252;                         /* Marker: 2 bytes follow */
    int2store(to + 1, nr);
    return to + 3;
  }
  if (nr < 16777216)
  {
    to[0]= (uchar) 253;                         /* Marker: 3 bytes follow */
    int3store(to + 1, nr);
    return to + 4;
  }
  to[0]= (uchar) 254;                           /* Marker: 4 bytes follow */
  int4store(to + 1, nr);
  return to + 5;
}
5671 
5672 
5673 /* Calculate how many bytes needed to store a number */
5674 
ma_calc_length_for_store_length(ulong nr)5675 uint ma_calc_length_for_store_length(ulong nr)
5676 {
5677   if (nr < 251)
5678     return 1;
5679   if (nr < 65536)
5680   {
5681     if (nr <= 255)
5682       return 2;
5683     return 3;
5684   }
5685   if (nr < 16777216)
5686     return 4;
5687   return 5;
5688 }
5689 
5690 
/* Retrieve a stored number */
5692 
ma_get_length(const uchar ** packet)5693 static ulong ma_get_length(const uchar **packet)
5694 {
5695   reg1 const uchar *pos= *packet;
5696   if (*pos < 251)
5697   {
5698     (*packet)++;
5699     return (ulong) *pos;
5700   }
5701   if (*pos == 251)
5702   {
5703     (*packet)+= 2;
5704     return (ulong) pos[1];
5705   }
5706   if (*pos == 252)
5707   {
5708     (*packet)+= 3;
5709     return (ulong) uint2korr(pos+1);
5710   }
5711   if (*pos == 253)
5712   {
5713     (*packet)+= 4;
5714     return (ulong) uint3korr(pos+1);
5715   }
5716   DBUG_ASSERT(*pos == 254);
5717   (*packet)+= 5;
5718   return (ulong) uint4korr(pos+1);
5719 }
5720 
5721 
5722 /*
5723   Fill array with pointers to field parts to be stored in log for insert
5724 
5725   SYNOPSIS
5726     fill_insert_undo_parts()
5727     info                Maria handler
5728     record              Inserted row
5729     log_parts           Store pointers to changed memory areas here
5730     log_parts_count     See RETURN
5731 
5732   NOTES
5733     We have information in info->cur_row about the read row.
5734 
5735   RETURN
5736     length of data in log_parts.
5737     log_parts_count contains number of used log_parts
5738 */
5739 
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  uchar *field_lengths= info->cur_row.field_lengths;
  size_t row_length;
  MARIA_ROW *cur_row= &info->cur_row;
  LEX_CUSTRING *start_log_parts;
  DBUG_ENTER("fill_insert_undo_parts");

  start_log_parts= log_parts;

  /* Store null bits */
  log_parts->str=      record;
  log_parts->length=   share->base.null_bytes;
  row_length=          log_parts->length;
  log_parts++;

  /* Stored bitmap over packed (zero length or all-zero fields) */
  log_parts->str= info->cur_row.empty_bits;
  log_parts->length= share->base.pack_bytes;
  row_length+=       log_parts->length;
  log_parts++;

  if (share->base.max_field_lengths)
  {
    /*
      Store length of all not empty char, varchar and blob fields.
      The 2-byte total-length prefix is written into the bytes just before
      field_lengths; NOTE(review): assumes cur_row.field_lengths is
      allocated with 2 spare bytes in front — confirm at its allocation.
    */
    log_parts->str= field_lengths - 2;
    log_parts->length=   info->cur_row.field_lengths_length+2;
    int2store(log_parts->str, info->cur_row.field_lengths_length);
    row_length+= log_parts->length;
    log_parts++;
  }

  if (share->base.blobs)
  {
    /*
      Store total blob length to make buffer allocation easier during UNDO
     */
    log_parts->str=  info->length_buff;
    log_parts->length= (uint) (ma_store_length(info->length_buff,
                                                 info->cur_row.blob_length) -
                                 (uchar*) log_parts->str);
    row_length+=          log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    log_parts->str= record + column->offset;
    log_parts->length= column->length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  for (end_column= share->columndef + share->base.fields - share->base.blobs;
       column < end_column;
       column++)
  {
    const uchar *column_pos;
    size_t column_length;
    /* NULL or empty columns contribute no data part */
    if ((record[column->null_pos] & column->null_bit) ||
        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
      continue;

    column_pos=    record+ column->offset;
    column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Stored length is 1 byte if the max column length fits in one byte */
      if (column->length <= 255)
        column_length= *field_lengths++;
      else
      {
        column_length= uint2korr(field_lengths);
        field_lengths+= 2;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      /* fill_length (1 or 2) is the size of the in-record length prefix */
      if (column->fill_length == 1)
        column_length= *field_lengths;
      else
        column_length= uint2korr(field_lengths);
      field_lengths+= column->fill_length;
      column_pos+= column->fill_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
    log_parts->str= column_pos;
    log_parts->length= column_length;
    row_length+= log_parts->length;
    log_parts++;
  }

  /* Add blobs */
  for (end_column+= share->base.blobs; column < end_column; column++)
  {
    const uchar *field_pos= record + column->offset;
    uint size_length= column->length - portable_sizeof_char_ptr;
    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

    /*
      We don't have to check for null, as blob_length is guaranteed to be 0
      if the blob is null
    */
    if (blob_length)
    {
      uchar *blob_pos;
      /* The blob pointer may be unaligned in the record; copy it out */
      memcpy(&blob_pos, record + column->offset + size_length,
             sizeof(blob_pos));
      log_parts->str= blob_pos;
      log_parts->length= blob_length;
      row_length+= log_parts->length;
      log_parts++;
    }
  }
  *log_parts_count= (uint) (log_parts - start_log_parts);
  DBUG_RETURN(row_length);
}
5877 
5878 
5879 /*
5880    Fill array with pointers to field parts to be stored in log for update
5881 
5882   SYNOPSIS
5883     fill_update_undo_parts()
5884     info                Maria handler
5885     oldrec		Original row
5886     newrec              New row
5887     log_parts           Store pointers to changed memory areas here
5888     log_parts_count     See RETURN
5889 
5890   IMPLEMENTATION
5891     Format of undo record:
5892 
5893     Fields are stored in same order as the field array.
5894 
5895     Offset to changed field data (packed)
5896 
5897     For each changed field
5898       Fieldnumber (packed)
5899       Length, if variable length field (packed)
5900 
5901     For each changed field
5902      Data
5903 
   Packing is using ma_store_length()
5905 
5906    The reason we store field numbers & length separated from data (ie, not
5907    after each other) is to get better cpu caching when we loop over
5908    fields (as we probably don't have to access data for each field when we
   want to read an old row through the undo log record).
5910 
5911    As a special case, we use '255' for the field number of the null bitmap.
5912 
5913   RETURN
5914     length of data in log_parts.
5915     log_parts_count contains number of used log_parts
5916 */
5917 
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count)
{
  MARIA_SHARE *share= info->s;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
  uchar *field_data, *start_field_data, *length_str;
  uchar *old_field_lengths= old_row->field_lengths;
  uchar *new_field_lengths= new_row->field_lengths;
  size_t row_length= 0;
  uint field_lengths;
  LEX_CUSTRING *start_log_parts;
  my_bool new_column_is_empty;
  DBUG_ENTER("fill_update_undo_parts");

  start_log_parts= log_parts;

  /*
    First log part is for number of fields, field numbers and lengths.
    The +4 is to reserve place for the number of changed fields; the first
    log part itself is filled in at the end of this function.
  */
  start_field_data= field_data= info->update_field_data + 4;
  log_parts++;

  if (memcmp(oldrec, newrec, share->base.null_bytes))
  {
    /* Store changed null bits; field number 255 marks this special part */
    *field_data++=       (uchar) 255;           /* Special case */
    log_parts->str=      oldrec;
    log_parts->length=   share->base.null_bytes;
    row_length=          log_parts->length;
    log_parts++;
  }

  /* Handle constant length fields */
  for (column= share->columndef,
       end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    /* Log the old value only if the field actually changed */
    if (memcmp(oldrec + column->offset, newrec + column->offset,
               column->length))
    {
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      log_parts->str= oldrec + column->offset;
      log_parts->length= column->length;
      row_length+=       column->length;
      log_parts++;
    }
  }

  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++)
  {
    const uchar *new_column_pos, *old_column_pos;
    size_t new_column_length, old_column_length;

    /* First check if old column is null or empty */
    if (oldrec[column->null_pos] & column->null_bit)
    {
      /*
        It's safe to skip this one as either the new column is also null
        (no change) or the new_column is not null, in which case the null-bit
        maps differed and we have already stored the null bitmap.
      */
      continue;
    }
    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
        continue;                               /* Both are empty; skip */

      /* Store null length column (old value was empty, new one is not) */
      field_data= ma_store_length(field_data,
                                  (uint) (column - share->columndef));
      field_data= ma_store_length(field_data, 0);
      continue;
    }
    /*
      Remember if the 'new' value is empty (as in this case we must always
      log the original value)
    */
    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
                          (new_row->empty_bits[column->empty_pos] &
                           column->empty_bit));

    old_column_pos=      oldrec + column->offset;
    new_column_pos=      newrec + column->offset;
    old_column_length= new_column_length= column->length;

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      break;
    case FIELD_VARCHAR:
      new_column_length--;                      /* Skip length prefix */
      old_column_pos+= column->fill_length;
      new_column_pos+= column->fill_length;
      /* Fall through */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Width of the stored length (1 or 2 bytes) follows the max length */
      if (new_column_length <= 255)
      {
        old_column_length= *old_field_lengths++;
        if (!new_column_is_empty)
          new_column_length= *new_field_lengths++;
      }
      else
      {
        old_column_length= uint2korr(old_field_lengths);
        old_field_lengths+= 2;
        if (!new_column_is_empty)
        {
          new_column_length= uint2korr(new_field_lengths);
          new_field_lengths+= 2;
        }
      }
      break;
    }
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
      /* Blob pointers may be unaligned in the record; fetch via memcpy */
      memcpy((void*) &old_column_pos, oldrec + column->offset + size_length,
             sizeof(old_column_pos));
      if (!new_column_is_empty)
      {
        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
        /* sizeof(old_column_pos) == sizeof(new_column_pos); both pointers */
        memcpy((void*) &new_column_pos, newrec + column->offset + size_length,
               sizeof(old_column_pos));
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }

    /* Log the old value if the column content changed in any way */
    if (new_column_is_empty || new_column_length != old_column_length ||
        memcmp(old_column_pos, new_column_pos, new_column_length))
    {
      field_data= ma_store_length(field_data,
                                  (ulong) (column - share->columndef));
      field_data= ma_store_length(field_data, (ulong) old_column_length);

      log_parts->str=     old_column_pos;
      log_parts->length=  old_column_length;
      row_length+=        old_column_length;
      log_parts++;
    }
  }

  *log_parts_count= (uint) (log_parts - start_log_parts);

  /* Store length of field length data before the field/field_lengths */
  field_lengths= (uint) (field_data - start_field_data);
  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
  start_log_parts->str= length_str;
  ma_store_length(length_str, field_lengths);
  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
  row_length+= start_log_parts->length;
  DBUG_RETURN(row_length);
}
6088 
6089 /***************************************************************************
6090   In-write hooks called under log's lock when log record is written
6091 ***************************************************************************/
6092 
6093 /**
6094    @brief Sets transaction's rec_lsn if needed
6095 
6096    A transaction sometimes writes a REDO even before the page is in the
6097    pagecache (example: brand new head or tail pages; full pages). So, if
6098    Checkpoint happens just after the REDO write, it needs to know that the
6099    REDO phase must start before this REDO. Scanning the pagecache cannot
6100    tell that as the page is not in the cache. So, transaction sets its rec_lsn
6101    to the REDO's LSN or somewhere before, and Checkpoint reads the
6102    transaction's rec_lsn.
6103 
6104    @return Operation status, always 0 (success)
6105 */
6106 
write_hook_for_redo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6107 my_bool write_hook_for_redo(enum translog_record_type type
6108                             __attribute__ ((unused)),
6109                             TRN *trn, MARIA_HA *tbl_info
6110                             __attribute__ ((unused)),
6111                             LSN *lsn, void *hook_arg
6112                             __attribute__ ((unused)))
6113 {
6114   /*
6115     Users of dummy_transaction_object must keep this TRN clean as it
6116     is used by many threads (like those manipulating non-transactional
6117     tables). It might be dangerous if one user sets rec_lsn or some other
6118     member and it is picked up by another user (like putting this rec_lsn into
6119     a page of a non-transactional table); it's safer if all members stay 0. So
6120     non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
6121     call this hook; we trust them but verify ;)
6122   */
6123   DBUG_ASSERT(trn->trid != 0);
6124   /*
6125     If the hook stays so simple, it would be faster to pass
6126     !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
6127     to translog_write_record(), like Monty did in his original code, and not
6128     have a hook. For now we keep it like this.
6129   */
6130   if (trn->rec_lsn == 0)
6131     trn->rec_lsn= *lsn;
6132   return 0;
6133 }
6134 
6135 
6136 /**
6137    @brief Sets transaction's undo_lsn, first_undo_lsn if needed
6138 
6139    @return Operation status, always 0 (success)
6140 */
6141 
write_hook_for_undo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6142 my_bool write_hook_for_undo(enum translog_record_type type
6143                             __attribute__ ((unused)),
6144                             TRN *trn, MARIA_HA *tbl_info
6145                             __attribute__ ((unused)),
6146                             LSN *lsn, void *hook_arg
6147                             __attribute__ ((unused)))
6148 {
6149   DBUG_ASSERT(trn->trid != 0);
6150   trn->undo_lsn= *lsn;
6151   if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
6152     trn->first_undo_lsn=
6153       trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
6154   return 0;
6155   /*
6156     when we implement purging, we will specialize this hook: UNDO_PURGE
6157     records will additionally set trn->undo_purge_lsn
6158   */
6159 }
6160 
6161 
6162 /**
6163    @brief Sets the table's records count and checksum and others to 0, then
6164    calls the generic REDO hook.
6165 
6166    @return Operation status, always 0 (success)
6167 */
6168 
write_hook_for_redo_delete_all(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6169 my_bool write_hook_for_redo_delete_all(enum translog_record_type type
6170                                        __attribute__ ((unused)),
6171                                        TRN *trn, MARIA_HA *tbl_info
6172                                        __attribute__ ((unused)),
6173                                        LSN *lsn, void *hook_arg)
6174 {
6175   _ma_reset_status(tbl_info);
6176   return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
6177 }
6178 
6179 
6180 /**
6181    @brief Updates "records" and "checksum" and calls the generic UNDO hook
6182 
6183    @return Operation status, always 0 (success)
6184 */
6185 
write_hook_for_undo_row_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6186 my_bool write_hook_for_undo_row_insert(enum translog_record_type type
6187                                        __attribute__ ((unused)),
6188                                        TRN *trn, MARIA_HA *tbl_info,
6189                                        LSN *lsn, void *hook_arg)
6190 {
6191   MARIA_SHARE *share= tbl_info->s;
6192   share->state.state.records++;
6193   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6194   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6195 }
6196 
6197 
6198 /**
6199    @brief Updates "records" and calls the generic UNDO hook
6200 
6201    @return Operation status, always 0 (success)
6202 */
6203 
write_hook_for_undo_row_delete(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6204 my_bool write_hook_for_undo_row_delete(enum translog_record_type type
6205                                        __attribute__ ((unused)),
6206                                        TRN *trn, MARIA_HA *tbl_info,
6207                                        LSN *lsn, void *hook_arg)
6208 {
6209   MARIA_SHARE *share= tbl_info->s;
6210   share->state.state.records--;
6211   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6212   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6213 }
6214 
6215 
6216 /**
   @brief Updates "records" and "checksum" and calls the generic UNDO hook
6218 
6219    @return Operation status, always 0 (success)
6220 */
6221 
write_hook_for_undo_row_update(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6222 my_bool write_hook_for_undo_row_update(enum translog_record_type type
6223                                        __attribute__ ((unused)),
6224                                        TRN *trn, MARIA_HA *tbl_info,
6225                                        LSN *lsn, void *hook_arg)
6226 {
6227   MARIA_SHARE *share= tbl_info->s;
6228   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6229   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6230 }
6231 
6232 
write_hook_for_undo_bulk_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6233 my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
6234                                         __attribute__ ((unused)),
6235                                         TRN *trn, MARIA_HA *tbl_info,
6236                                         LSN *lsn, void *hook_arg)
6237 {
6238   /*
6239     We are going to call maria_delete_all_rows(), but without logging and
6240     syncing, as an optimization (if we crash before commit, the UNDO will
6241     empty; if we crash after commit, we have flushed and forced the files).
6242     Status still needs to be reset under log mutex, in case of a concurrent
6243     checkpoint.
6244   */
6245   _ma_reset_status(tbl_info);
6246   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6247 }
6248 
6249 
6250 /**
6251    @brief Updates table's lsn_of_file_id.
6252 
6253    @return Operation status, always 0 (success)
6254 */
6255 
write_hook_for_file_id(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6256 my_bool write_hook_for_file_id(enum translog_record_type type
6257                                __attribute__ ((unused)),
6258                                TRN *trn
6259                                __attribute__ ((unused)),
6260                                MARIA_HA *tbl_info,
6261                                LSN *lsn,
6262                                void *hook_arg
6263                                __attribute__ ((unused)))
6264 {
6265   DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
6266   tbl_info->s->lsn_of_file_id= *lsn;
6267   return 0;
6268 }
6269 
6270 
6271 /**
6272    Updates transaction's rec_lsn when committing.
6273 
6274    A transaction writes its commit record before being committed in trnman, so
6275    if Checkpoint happens just between the COMMIT record log write and the
6276    commit in trnman, it will record that transaction is not committed. Assume
6277    the transaction (trn1) did an INSERT; after the checkpoint, a second
6278    transaction (trn2) does a DELETE of what trn1 has inserted. Then crash,
6279    Checkpoint record says that trn1 was not committed, and REDO phase starts
6280    from Checkpoint record's LSN. So it will not find the COMMIT record of
6281    trn1, will want to roll back trn1, which will fail because the row/key
6282    which it wants to delete does not exist anymore.
6283    To avoid this, Checkpoint needs to know that the REDO phase must start
6284    before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
6285    record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
6286    will know.
6287 
6288    @note so after commit trn->rec_lsn is a "commit LSN", which could be of
6289    use later.
6290 
6291    @return Operation status, always 0 (success)
6292 */
6293 
write_hook_for_commit(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6294 my_bool write_hook_for_commit(enum translog_record_type type
6295                               __attribute__ ((unused)),
6296                               TRN *trn,
6297                               MARIA_HA *tbl_info
6298                               __attribute__ ((unused)),
6299                               LSN *lsn,
6300                               void *hook_arg
6301                               __attribute__ ((unused)))
6302 {
6303   trn->rec_lsn= *lsn;
6304   return 0;
6305 }
6306 
6307 
6308 /***************************************************************************
6309   Applying of REDO log records
6310 ***************************************************************************/
6311 
6312 /*
6313   Apply changes to head and tail pages
6314 
6315   SYNOPSIS
6316     _ma_apply_redo_insert_row_head_or_tail()
6317     info		Maria handler
6318     lsn			LSN to put on page
6319     page_type		HEAD_PAGE or TAIL_PAGE
6320     new_page		True if this is first entry on page
6321     header		Header (without FILEID)
6322     data		Data to be put on page
6323     data_length		Length of data
6324 
6325   NOTE
6326     Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
6327     LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
6328 
6329   RETURN
6330     0   ok
6331     #   Error number
6332 */
6333 
_ma_apply_redo_insert_row_head_or_tail(MARIA_HA * info,LSN lsn,uint page_type,my_bool new_page,const uchar * header,const uchar * data,size_t data_length)6334 uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
6335                                             uint page_type,
6336                                             my_bool new_page,
6337                                             const uchar *header,
6338                                             const uchar *data,
6339                                             size_t data_length)
6340 {
6341   MARIA_SHARE *share= info->s;
6342   pgcache_page_no_t page;
6343   uint      rownr, empty_space;
6344   uint      block_size= share->block_size;
6345   uint      rec_offset;
6346   uchar      *buff, *dir;
6347   uint      result;
6348   MARIA_PINNED_PAGE page_link;
6349   enum pagecache_page_lock lock_method;
6350   enum pagecache_page_pin pin_method;
6351   my_off_t end_of_page;
6352   uint error;
6353   DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
6354 
6355   page=  page_korr(header);
6356   rownr= dirpos_korr(header + PAGE_STORE_SIZE);
6357 
6358   DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u  data_length: %u",
6359                        (ulong) ma_recordpos(page, rownr),
6360                        (ulong) page, rownr, (uint) data_length));
6361 
6362   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6363                           STATE_NOT_MOVABLE);
6364 
6365   end_of_page= (page + 1) * share->block_size;
6366   if (end_of_page > share->state.state.data_file_length)
6367   {
6368     DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
6369                         (ulong) share->state.state.data_file_length,
6370                         (ulong) end_of_page));
6371     /*
6372       New page at end of file. Note that the test above is also positive if
6373       data_file_length is not a multiple of block_size (system crashed while
6374       writing the last page): in this case we just extend the last page and
6375       fill it entirely with zeroes, then the REDO will put correct data on
6376       it.
6377     */
6378     lock_method= PAGECACHE_LOCK_WRITE;
6379     pin_method=  PAGECACHE_PIN;
6380 
6381     DBUG_ASSERT(rownr == 0 && new_page);
6382     if (rownr != 0 || !new_page)
6383       goto crashed_file;
6384 
6385     buff= info->keyread_buff;
6386     info->keyread_buff_used= 1;
6387     make_empty_page(info, buff, page_type, 1);
6388     empty_space= (block_size - PAGE_OVERHEAD_SIZE(share));
6389     rec_offset= PAGE_HEADER_SIZE(share);
6390     dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
6391   }
6392   else
6393   {
6394     lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
6395     pin_method=  PAGECACHE_PIN_LEFT_PINNED;
6396 
6397     share->pagecache->readwrite_flags&= ~MY_WME;
6398     share->silence_encryption_errors= 1;
6399     buff= pagecache_read(share->pagecache, &info->dfile,
6400                          page, 0, 0,
6401                          PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
6402                          &page_link.link);
6403     share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
6404     share->silence_encryption_errors= 0;
6405     if (!buff)
6406     {
6407       /* Skip errors when reading outside of file and uninitialized pages */
6408       if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
6409                         my_errno != HA_ERR_WRONG_CRC &&
6410                         my_errno != HA_ERR_DECRYPTION_FAILED))
6411       {
6412         DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
6413         goto err;
6414       }
6415       /* Create new page */
6416       buff= pagecache_block_link_to_buffer(page_link.link);
6417       buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
6418     }
6419     else if (lsn_korr(buff) >= lsn)           /* Test if already applied */
6420     {
6421       check_skipped_lsn(info, lsn_korr(buff), 1, page);
6422       /* Fix bitmap, just in case */
6423       empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6424       if (!enough_free_entries_on_page(share, buff))
6425         empty_space= 0;                         /* Page is full */
6426 
6427       if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6428         goto err;
6429       pagecache_unlock_by_link(share->pagecache, page_link.link,
6430                                PAGECACHE_LOCK_WRITE_UNLOCK,
6431                                PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6432                                LSN_IMPOSSIBLE, 0, FALSE);
6433       DBUG_RETURN(0);
6434     }
6435 
6436     if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
6437     {
6438       /*
6439         This is a page that has been freed before and now should be
6440         changed to new type.
6441       */
6442       if (!new_page)
6443       {
6444         DBUG_PRINT("error",
6445                    ("Found page of wrong type: %u, should have been %u",
6446                     (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
6447                     page_type));
6448         goto crashed_file;
6449       }
6450       make_empty_page(info, buff, page_type, 0);
6451       empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
6452       (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space,
6453                               page_type == HEAD_PAGE);
6454       rec_offset= PAGE_HEADER_SIZE(share);
6455       dir= dir_entry_pos(buff, block_size, rownr);
6456       empty_space+= uint2korr(dir+2);
6457     }
6458     else
6459     {
6460       uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
6461       uint length;
6462 
6463       DBUG_ASSERT(!new_page);
6464       dir= dir_entry_pos(buff, block_size, rownr);
6465       empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6466 
6467       if (max_entry <= rownr)
6468       {
6469         /* Add directory entry first in directory and data last on page */
6470         if (extend_directory(info, buff, block_size, max_entry, rownr,
6471                              &empty_space, page_type == HEAD_PAGE))
6472           goto crashed_file;
6473       }
6474       if (extend_area_on_page(info, buff, dir, rownr,
6475                               (uint) data_length, &empty_space,
6476                               &rec_offset, &length, page_type == HEAD_PAGE))
6477         goto crashed_file;
6478     }
6479   }
6480   /* Copy data */
6481   int2store(dir+2, data_length);
6482   memcpy(buff + rec_offset, data, data_length);
6483   empty_space-= (uint) data_length;
6484   int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
6485 
6486   /* Fix bitmap */
6487   if (!enough_free_entries_on_page(share, buff))
6488     empty_space= 0;                         /* Page is full */
6489   if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6490     goto err;
6491 
6492   /*
6493     If page was not read before, write it but keep it pinned.
6494     We don't update its LSN When we have processed all REDOs for this page
6495     in the current REDO's group, we will stamp page with UNDO's LSN
6496     (if we stamped it now, a next REDO, in
6497     this group, for this page, would be skipped) and unpin then.
6498   */
6499   result= 0;
6500   if (lock_method == PAGECACHE_LOCK_WRITE &&
6501       pagecache_write(share->pagecache,
6502                       &info->dfile, page, 0,
6503                       buff, PAGECACHE_PLAIN_PAGE,
6504                       lock_method, pin_method,
6505                       PAGECACHE_WRITE_DELAY, &page_link.link,
6506                       LSN_IMPOSSIBLE))
6507     result= my_errno;
6508 
6509   page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6510   page_link.changed= 1;
6511   push_dynamic(&info->pinned_pages, (void*) &page_link);
6512 
6513   /*
6514     Data page and bitmap page are in place, we can update data_file_length in
6515     case we extended the file. We could not do it earlier: bitmap code tests
6516     data_file_length to know if it has to create a new page or not.
6517   */
6518   set_if_bigger(share->state.state.data_file_length, end_of_page);
6519   DBUG_RETURN(result);
6520 
6521 crashed_file:
6522   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
6523 err:
6524   error= my_errno;
6525   if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
6526     pagecache_unlock_by_link(share->pagecache, page_link.link,
6527                              PAGECACHE_LOCK_WRITE_UNLOCK,
6528                              PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6529                              LSN_IMPOSSIBLE, 0, FALSE);
6530   _ma_mark_file_crashed(share);
6531   DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */
6532   DBUG_RETURN((my_errno= error));
6533 }
6534 
6535 
6536 /*
6537   Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
6538 
6539   SYNOPSIS
6540     _ma_apply_redo_purge_row_head_or_tail()
6541     info		Maria handler
6542     lsn			LSN to put on page
6543     page_type		HEAD_PAGE or TAIL_PAGE
6544     header		Header (without FILEID)
6545 
6546   NOTES
6547     This function is very similar to delete_head_or_tail()
6548 
6549   RETURN
6550     0   ok
6551     #   Error number
6552 */
6553 
uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                           uint page_type,
                                           const uchar *header)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint      rownr, empty_space;
  uchar     *buff;
  int result;
  uint error;
  MARIA_PINNED_PAGE page_link;
  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");

  /* Log record header: page number followed by the directory position */
  page=  page_korr(header);
  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* Recovery modifies the table; flag the state as changed */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Read and write-lock the data page (stays pinned until pushed/unlocked) */
  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
    goto err;

  if (lsn_korr(buff) >= lsn)
  {
    /*
      Already applied
      Note that in case the page is not anymore a head or tail page
      a future redo will fix the bitmap.
    */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
    {
      /* Re-sync the bitmap with the page's recorded empty space */
      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0;                         /* Page is full */
      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
                         empty_space))
        goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    DBUG_RETURN(0);
  }

  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);

  /* Remove the row's directory entry; < 0 means the directory is corrupt */
  if (delete_dir_entry(share, buff, rownr, &empty_space) < 0)
  {
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /* Keep the modified page pinned; caller unpins when the redo group is done */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  result= 0;
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0;                         /* Page is full */
  /* This will work even if the page was marked as UNALLOCATED_PAGE */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    result= my_errno;

  DBUG_RETURN(result);

err:
  /* Save my_errno: the cleanup calls below may overwrite it */
  error= my_errno;
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN((my_errno= error));

}
6638 
6639 
6640 /**
6641    @brief Apply LOGREC_REDO_FREE_BLOCKS
6642 
6643    @param  info            Maria handler
6644    @param  header          Header (without FILEID)
6645 
6646    Mark the pages free in the bitmap.
6647 
6648    We have to check against _ma_redo_not_needed_for_page()
6649    to guard against the case where we first clear a block and after
6650    that insert new data into the blocks.  If we would unconditionally
6651    clear the bitmap here, future changes would be ignored for the page
6652    if it's not in the dirty list (ie, it would be flushed).
6653 
6654    @return Operation status
6655      @retval 0      OK
6656      @retval 1      Error
6657 */
6658 
_ma_apply_redo_free_blocks(MARIA_HA * info,LSN lsn,LSN redo_lsn,const uchar * header)6659 uint _ma_apply_redo_free_blocks(MARIA_HA *info,
6660                                 LSN lsn __attribute__((unused)),
6661                                 LSN redo_lsn,
6662                                 const uchar *header)
6663 {
6664   MARIA_SHARE *share= info->s;
6665   uint ranges;
6666   uint16 sid;
6667   DBUG_ENTER("_ma_apply_redo_free_blocks");
6668 
6669   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6670                           STATE_NOT_MOVABLE);
6671 
6672   sid= fileid_korr(header);
6673   header+= FILEID_STORE_SIZE;
6674   ranges= pagerange_korr(header);
6675   header+= PAGERANGE_STORE_SIZE;
6676   DBUG_ASSERT(ranges > 0);
6677 
6678   /** @todo leave bitmap lock to the bitmap code... */
6679   mysql_mutex_lock(&share->bitmap.bitmap_lock);
6680   while (ranges--)
6681   {
6682     my_bool res;
6683     uint page_range;
6684     pgcache_page_no_t page, start_page;
6685 
6686     start_page= page= page_korr(header);
6687     header+= PAGE_STORE_SIZE;
6688     /* Page range may have this bit set to indicate a tail page */
6689     page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
6690     DBUG_ASSERT(page_range > 0);
6691 
6692     header+= PAGERANGE_STORE_SIZE;
6693 
6694     DBUG_PRINT("info", ("page: %lu  pages: %u", (long) page, page_range));
6695 
6696     for ( ; page_range-- ; start_page++)
6697     {
6698       if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE))
6699         continue;
6700       res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
6701                                            1);
6702       if (res)
6703       {
6704         mysql_mutex_unlock(&share->bitmap.bitmap_lock);
6705         _ma_mark_file_crashed(share);
6706         DBUG_ASSERT(!maria_assert_if_crashed_table);
6707         DBUG_RETURN(res);
6708       }
6709     }
6710   }
6711   mysql_mutex_unlock(&share->bitmap.bitmap_lock);
6712   DBUG_RETURN(0);
6713 }
6714 
6715 
6716 /**
6717    @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
6718 
6719    @param  info            Maria handler
6720    @param  header          Header (without FILEID)
6721 
6722    @note It marks the page free in the bitmap, and sets the directory's count
6723    to 0.
6724 
6725    @return Operation status
6726      @retval 0      OK
6727      @retval 1      Error
6728 */
6729 
uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                      const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uchar *buff;
  pgcache_page_no_t page;
  MARIA_PINNED_PAGE page_link;
  my_bool res;
  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");

  /* Recovery modifies the table; flag the state as changed */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Log record header holds only the page number to free */
  page= page_korr(header);

  /* Read and write-lock the page to be freed */
  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE,
                             PAGECACHE_LOCK_WRITE, &page_link.link)))
  {
    /* Read failed; release the lock/pin taken by pagecache_read */
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    goto err;
  }
  if (lsn_korr(buff) >= lsn)
  {
    /* Already applied */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  }
  else
  {
    /* Mark the page as free; its type is enough to make it reusable */
    buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
    {
      /* Also wipe the whole directory so page bytes match pre-crash state */
      uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
      uchar *dir= dir_entry_pos(buff, share->block_size,
                                number_of_records-1);
      buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
    }
#endif

    /* Keep the modified page pinned until the redo group completes */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= 1;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  if (res)
    goto err;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6796 
6797 
6798 /**
6799    @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
6800 
6801    @param  info            Maria handler
   @param  lsn             LSN to put on pages
6803    @param  header          Header (with FILEID)
6804    @param  redo_lsn        REDO record's LSN
6805    @param[out] number_of_blobs Number of blobs found in log record
6806    @param[out] number_of_ranges Number of ranges found
6807    @param[out] first_page  First page touched
6808    @param[out] last_page   Last page touched
6809 
6810    @note Write full pages (full head & blob pages)
6811 
6812    @return Operation status
6813      @retval 0      OK
6814      @retval !=0    Error
6815 */
6816 
uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                     LSN lsn, const uchar *header,
                                     LSN redo_lsn,
                                     uint * const number_of_blobs,
                                     uint * const number_of_ranges,
                                     pgcache_page_no_t * const first_page,
                                     pgcache_page_no_t * const last_page)
{
  MARIA_SHARE *share= info->s;
  const uchar *data;
  uint      data_size= FULL_PAGE_SIZE(share);
  uint      blob_count, ranges;
  uint16    sid;
  /* Running min/max of touched pages, copied to out parameters at the end */
  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
  DBUG_ENTER("_ma_apply_redo_insert_row_blobs");

  /* Recovery modifies the table; flag the state as changed */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Header layout: file id, total range count, blob count */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  *number_of_ranges= ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  *number_of_blobs= blob_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges >= blob_count);

  /* The page payload follows all extent and per-blob sub-headers */
  data= (header + ranges * ROW_EXTENT_SIZE +
         blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));

  while (blob_count--)
  {
    uint sub_ranges, empty_space;

    /* Per blob: number of sub ranges and empty space on its last page */
    sub_ranges=  uint2korr(header);
    header+= SUB_RANGE_SIZE;
    empty_space= uint2korr(header);
    header+= BLOCK_FILLER_SIZE;
    DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
    ranges-= sub_ranges;

    while (sub_ranges--)
    {
      uint i;
      uint      res;
      uint      page_range;
      pgcache_page_no_t page;
      uchar     *buff;
      uint	data_on_page= data_size;

      /* Each sub range: starting page and number of consecutive pages */
      page= page_korr(header);
      header+= PAGE_STORE_SIZE;
      page_range= pagerange_korr(header);
      header+= PAGERANGE_STORE_SIZE;

      for (i= page_range; i-- > 0 ; page++, data+= data_on_page)
      {
        MARIA_PINNED_PAGE page_link;
        enum pagecache_page_lock unlock_method;
        enum pagecache_page_pin unpin_method;

        set_if_smaller(first_page2, page);
        set_if_bigger(last_page2, page);
        if (i == 0 && sub_ranges == 0)
          data_on_page= data_size - empty_space; /* data on last page */
        if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
          continue;

        if (((page + 1) * share->block_size) >
            share->state.state.data_file_length)
        {
          /* New page or half written page at end of file */
          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                              (ulong) share->state.state.data_file_length,
                              (ulong) ((page + 1 ) * share->block_size)));
          share->state.state.data_file_length= (page + 1) * share->block_size;
          /* Build the new page in a scratch buffer; no cache read needed */
          buff= info->keyread_buff;
          info->keyread_buff_used= 1;
          make_empty_page(info, buff, BLOB_PAGE, 0);
          unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
          unpin_method=  PAGECACHE_PIN_LEFT_UNPINNED;
        }
        else
        {
          /*
            Silence warnings/encryption errors: a short read or bad CRC is
            expected here and handled below
          */
          share->pagecache->readwrite_flags&= ~MY_WME;
          share->silence_encryption_errors= 1;
          buff= pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, 0,
                               PAGECACHE_PLAIN_PAGE,
                               PAGECACHE_LOCK_WRITE, &page_link.link);
          share->pagecache->readwrite_flags= share->pagecache->
            org_readwrite_flags;
          share->silence_encryption_errors= 0;
          if (!buff)
          {
            if (my_errno != HA_ERR_FILE_TOO_SHORT &&
                my_errno != HA_ERR_WRONG_CRC &&
                my_errno != HA_ERR_DECRYPTION_FAILED)
            {
              /* If not read outside of file */
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              goto err;
            }
            /*
              Physical file was too short, create new page. It can be that
              recovery started with a file with N pages, wrote page N+2 into
              pagecache (increased data_file_length but not physical file
              length), now reads page N+1: the read fails.
            */
            buff= pagecache_block_link_to_buffer(page_link.link);
            make_empty_page(info, buff, BLOB_PAGE, 0);
          }
          else
          {
#ifdef DBUG_ASSERT_EXISTS
            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
#endif
            if (lsn_korr(buff) >= lsn)
            {
              /* Already applied */
              check_skipped_lsn(info, lsn_korr(buff), 1, page);
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              /* Still re-sync the bitmap for this page */
              goto fix_bitmap;
            }
            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
                        (found_page_type == (uchar) UNALLOCATED_PAGE));
          }
          unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
          unpin_method=  PAGECACHE_UNPIN;
        }

        /*
          Blob pages are never updated twice in same redo-undo chain, so
          it's safe to update lsn for them here
        */
        lsn_store(buff, lsn);
        buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
        /* Clear rest of the full-page header after LSN and page type */
        bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
              FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));

        if (data_on_page != data_size)
        {
          /*
            Last page may be only partly filled. We zero the rest, like
            write_full_pages() does.
          */
          bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
                empty_space);
        }
        memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page);
        if (pagecache_write(share->pagecache,
                            &info->dfile, page, 0,
                            buff, PAGECACHE_PLAIN_PAGE,
                            unlock_method, unpin_method,
                            PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
          goto err;

    fix_bitmap:
      /** @todo leave bitmap lock to the bitmap code... */
        mysql_mutex_lock(&share->bitmap.bitmap_lock);
        res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page,
                                           1);
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        if (res)
          goto err;
      }
    }
  }
  *first_page= first_page2;
  *last_page=  last_page2;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
7001 
7002 
7003 /****************************************************************************
7004  Applying of UNDO entries
7005 ****************************************************************************/
7006 
7007 /** Execute undo of a row insert (delete the inserted row) */
7008 
my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header)
{
  pgcache_page_no_t page;
  uint rownr;
  uchar *buff;
  my_bool res;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum checksum;
  LSN lsn;
  DBUG_ENTER("_ma_apply_undo_row_insert");

  /* Log record header: page number and directory position of the row */
  page=  page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* Read and pin the head page; it is kept pinned until finalize below */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0,
                       0, share->page_type,
                       PAGECACHE_LOCK_WRITE,
                       &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  /* Only mark the page as changed if it was actually read */
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  /* Collect the row's tail and extent positions into info->cur_row */
  if (read_row_extent_info(info, buff, rownr))
    goto err;

  /* Keep the bitmap non-flushable while we modify several related pages */
  _ma_bitmap_flushable(info, 1);
  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  /* Free any full blob/extent pages the row occupied */
  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  /* The CLR records the negative of the inserted row's checksum delta */
  checksum= 0;
  if (share->calc_checksum)
    checksum= (ha_checksum) 0 - ha_checksum_korr(header);
  info->last_auto_increment= ~ (ulonglong) 0;
  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
                    share->calc_checksum != 0, checksum, &lsn, (void*) 0))
    goto err;

  res= 0;
end:
  /* The following is true only if _ma_bitmap_flushable() was called earlier */
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  res= 1;
  _ma_mark_file_crashed(share);
  /*
    Don't write a new LSN on the used pages. Not important as the file is
    marked as crashed and need to be repaired before it can be used.
  */
  lsn= LSN_IMPOSSIBLE;
  goto end;
}
7079 
7080 
7081 /** Execute undo of a row delete (insert the row back where it was) */
7082 
_ma_apply_undo_row_delete(MARIA_HA * info,LSN undo_lsn,const uchar * header,size_t header_length)7083 my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
7084                                   const uchar *header, size_t header_length
7085                                   __attribute__((unused)))
7086 {
7087   MARIA_SHARE *share= info->s;
7088   MARIA_ROW row;
7089   MARIA_COLUMNDEF *column, *end_column;
7090   MARIA_BITMAP_BLOCKS *blocks;
7091   struct st_row_pos_info row_pos;
7092   uchar *record;
7093   const uchar *null_bits, *field_length_data, *extent_info;
7094   pgcache_page_no_t page;
7095   ulong *blob_lengths;
7096   uint *null_field_lengths, extent_count, rownr, length_on_head_page;
7097   DBUG_ENTER("_ma_apply_undo_row_delete");
7098 
7099   /*
7100     Use cur row as a base;  We need to make a copy as we will change
7101     some buffers to point directly to 'header'
7102   */
7103   memcpy(&row, &info->cur_row, sizeof(row));
7104 
7105   page=  page_korr(header);
7106   header+= PAGE_STORE_SIZE;
7107   rownr= dirpos_korr(header);
7108   header+= DIRPOS_STORE_SIZE;
7109   length_on_head_page= uint2korr(header);
7110   header+= 2;
7111   extent_count= pagerange_korr(header);
7112   header+= PAGERANGE_STORE_SIZE;
7113   DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
7114                        (ulong) ma_recordpos(page, rownr),
7115                        (ulong) page, rownr));
7116 
7117   if (share->calc_checksum)
7118   {
7119     /*
7120       We extract the checksum delta here, saving a recomputation in
7121       allocate_and_write_block_record(). It's only an optimization.
7122     */
7123     row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
7124     header+= HA_CHECKSUM_STORE_SIZE;
7125   }
7126   extent_info= header;
7127   header+= extent_count * ROW_EXTENT_SIZE;
7128 
7129   null_field_lengths= row.null_field_lengths;
7130   blob_lengths= row.blob_lengths;
7131 
7132   /*
7133     Fill in info->cur_row with information about the row, like in
7134     calc_record_size(), to be used by write_block_record()
7135   */
7136 
7137   row.normal_length= row.char_length= row.varchar_length=
7138     row.blob_length= row.extents_count= row.field_lengths_length= 0;
7139 
7140   null_bits= header;
7141   header+= share->base.null_bytes;
7142   /* This will not be changed */
7143   row.empty_bits= (uchar*) header;
7144   header+= share->base.pack_bytes;
7145   if (share->base.max_field_lengths)
7146   {
7147     row.field_lengths_length= uint2korr(header);
7148     row.field_lengths= (uchar*) header + 2 ;
7149     header+= 2 + row.field_lengths_length;
7150   }
7151   if (share->base.blobs)
7152     row.blob_length= ma_get_length(&header);
7153 
7154   /* We need to build up a record (without blobs) in rec_buff */
7155   if (!(record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength,
7156                           MYF(MY_WME))))
7157     DBUG_RETURN(1);
7158 
7159   memcpy(record, null_bits, share->base.null_bytes);
7160 
7161   /* Copy field information from header to record */
7162 
7163   /* Handle constant length fields that are always present */
7164   for (column= share->columndef,
7165          end_column= column+ share->base.fixed_not_null_fields;
7166        column < end_column;
7167        column++)
7168   {
7169     memcpy(record + column->offset, header, column->length);
7170     header+= column->length;
7171   }
7172 
7173   /* Handle NULL fields and CHAR/VARCHAR fields */
7174   field_length_data= row.field_lengths;
7175   for (end_column= share->columndef + share->base.fields;
7176        column < end_column;
7177        column++, null_field_lengths++)
7178   {
7179     if ((record[column->null_pos] & column->null_bit) ||
7180         row.empty_bits[column->empty_pos] & column->empty_bit)
7181     {
7182       if (column->type != FIELD_BLOB)
7183         *null_field_lengths= 0;
7184       else
7185         *blob_lengths++= 0;
7186       if (share->calc_checksum)
7187         bfill(record + column->offset, column->fill_length,
7188               column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
7189       continue;
7190     }
7191     switch (column->type) {
7192     case FIELD_CHECK:
7193     case FIELD_NORMAL:                          /* Fixed length field */
7194     case FIELD_ZERO:
7195     case FIELD_SKIP_PRESPACE:                   /* Not packed */
7196     case FIELD_SKIP_ZERO:                       /* Fixed length field */
7197       row.normal_length+= column->length;
7198       *null_field_lengths= column->length;
7199       memcpy(record + column->offset, header, column->length);
7200       header+= column->length;
7201       break;
7202     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
7203     {
7204       uint length;
7205       if (column->length <= 255)
7206         length= (uint) *field_length_data++;
7207       else
7208       {
7209         length= uint2korr(field_length_data);
7210         field_length_data+= 2;
7211       }
7212       row.char_length+= length;
7213       *null_field_lengths= length;
7214       memcpy(record + column->offset, header, length);
7215       if (share->calc_checksum)
7216         bfill(record + column->offset + length, (column->length - length),
7217               ' ');
7218       header+= length;
7219       break;
7220     }
7221     case FIELD_VARCHAR:
7222     {
7223       uint length;
7224       uchar *field_pos= record + column->offset;
7225 
7226       /* 256 is correct as this includes the length uchar */
7227       if (column->fill_length == 1)
7228       {
7229         field_pos[0]= *field_length_data;
7230         length= (uint) *field_length_data;
7231       }
7232       else
7233       {
7234         field_pos[0]= field_length_data[0];
7235         field_pos[1]= field_length_data[1];
7236         length= uint2korr(field_length_data);
7237       }
7238       field_length_data+= column->fill_length;
7239       field_pos+= column->fill_length;
7240       row.varchar_length+= length;
7241       *null_field_lengths= length;
7242       memcpy(field_pos, header, length);
7243       header+= length;
7244       break;
7245     }
7246     case FIELD_BLOB:
7247     {
7248       /* Copy length of blob and pointer to blob data to record */
7249       uchar *field_pos= record + column->offset;
7250       uint size_length= column->length - portable_sizeof_char_ptr;
7251       ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
7252 
7253       memcpy(field_pos, field_length_data, size_length);
7254       field_length_data+= size_length;
7255       memcpy(field_pos + size_length, &header, sizeof(header));
7256       header+= blob_length;
7257       *blob_lengths++= blob_length;
7258       break;
7259     }
7260     default:
7261       DBUG_ASSERT(0);
7262     }
7263   }
7264   row.head_length= (info->row_base_length +
7265                     share->base.fixed_not_null_fields_length +
7266                     row.field_lengths_length +
7267                     size_to_store_key_length(row.field_lengths_length) +
7268                     row.normal_length +
7269                     row.char_length + row.varchar_length);
7270   row.total_length= (row.head_length + row.blob_length);
7271   if (row.total_length < share->base.min_block_length)
7272     row.total_length= share->base.min_block_length;
7273 
7274   /*
7275     Row is now generated. Now we need to insert record on the original
7276     pages with original size on each page.
7277   */
7278 
7279   _ma_bitmap_flushable(info, 1);
7280   /* Change extent information to be usable by write_block_record() */
7281   blocks= &row.insert_blocks;
7282   if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
7283     goto err;
7284   blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
7285                                                             &share->bitmap,
7286                                                             page);
7287   blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;
7288 
7289   /* Read head page and allocate data for rowid */
7290   if (get_rowpos_in_head_or_tail_page(info, blocks->block,
7291                                       info->buff,
7292                                       length_on_head_page,
7293                                       HEAD_PAGE, PAGECACHE_LOCK_WRITE,
7294                                       rownr, &row_pos))
7295     goto err;
7296 
7297   if (share->calc_checksum)
7298   {
7299     DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
7300   }
7301   /* Store same amount of data on head page as on original page */
7302   row_pos.length= (length_on_head_page -
7303                    (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
7304   set_if_bigger(row_pos.length, share->base.min_block_length);
7305   if (write_block_record(info, (uchar*) 0, record, &row,
7306                          blocks, blocks->block->org_bitmap_value != 0,
7307                          &row_pos, undo_lsn, 0))
7308     goto err;
7309 
7310   my_free(record);
7311   DBUG_RETURN(0);
7312 
7313 err:
7314   DBUG_ASSERT(!maria_assert_if_crashed_table);
7315   _ma_mark_file_crashed(share);
7316   if (info->non_flushable_state)
7317     _ma_bitmap_flushable(info, -1);
7318   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
7319   my_free(record);
7320   DBUG_RETURN(1);
7321 }
7322 
7323 
7324 /**
7325   Execute undo of a row update
7326 
7327   @fn _ma_apply_undo_row_update()
7328 
7329   @return Operation status
7330     @retval 0      OK
7331     @retval 1      Error
7332 */
7333 
my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header,
                                  size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_RECORD_POS record_pos;
  const uchar *field_length_data, *field_length_data_end, *extent_info;
  uchar *current_record, *orig_record;
  pgcache_page_no_t page;
  ha_checksum UNINIT_VAR(checksum_delta);
  uint rownr, field_length_header, extent_count, length_on_head_page;
  int error;
  DBUG_ENTER("_ma_apply_undo_row_update");

  /*
    Parse the fixed part of the undo entry: head page number and the
    directory position of the row that was updated.
  */
  page=  page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;

  record_pos= ma_recordpos(page, rownr);
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) record_pos, (ulong) page, rownr));

  if (share->calc_checksum)
  {
    /* Delta taking the current row's checksum to the original row's */
    checksum_delta= ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  /* Space the original row used on the head page; never below the minimum */
  length_on_head_page= uint2korr(header);
  set_if_bigger(length_on_head_page, share->base.min_block_length);
  header+= 2;
  /* Extents (tail/blob pages) that belonged to the original row */
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  /*
    Set header to point to old field values, generated by
    fill_update_undo_parts()
  */
  field_length_header= ma_get_length(&header);
  field_length_data= (uchar*) header;
  header+= field_length_header;
  field_length_data_end= header;

  /* Allocate buffer for current row & original row */
  if (!(current_record= my_malloc(PSI_INSTRUMENT_ME, share->base.reclength * 2,
                                  MYF(MY_WME))))
    DBUG_RETURN(1);
  orig_record= current_record+ share->base.reclength;

  /* Read current record */
  if (_ma_read_block_record(info, current_record, record_pos))
    goto err;

  if (*field_length_data == 255)
  {
    /* Bitmap changed; 255 flags that the old null bytes follow in the entry */
    field_length_data++;
    memcpy(orig_record, header, share->base.null_bytes);
    header+= share->base.null_bytes;
  }
  else
    memcpy(orig_record, current_record, share->base.null_bytes);
  bitmap_clear_all(&info->changed_fields);

  /*
    Walk the stored (field_nr [, length]) pairs and restore each changed
    field in orig_record from the old values saved in the undo entry.
  */
  while (field_length_data < field_length_data_end)
  {
    uint field_nr= ma_get_length(&field_length_data), field_length;
    MARIA_COLUMNDEF *column= share->columndef + field_nr;
    uchar *orig_field_pos= orig_record + column->offset;

    bitmap_set_bit(&info->changed_fields, field_nr);
    if (field_nr >= share->base.fixed_not_null_fields)
    {
      if (!(field_length= ma_get_length(&field_length_data)))
      {
        /* Null field or empty field */
        bfill(orig_field_pos, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
        continue;
      }
    }
    else
      field_length= column->length;     /* Fixed fields store no length */

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      memcpy(orig_field_pos, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Number */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint diff;
      /* Copy stored prefix, then pad the tail with ' ' (CHAR) or 0 */
      memcpy(orig_field_pos, header, field_length);
      if ((diff= (column->length - field_length)))
        bfill(orig_field_pos + column->length - diff, diff,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      header+= field_length;
    }
    break;
    case FIELD_VARCHAR:
      if (column->length <= 256)
      {
        /* 256 is correct as column->length includes the length uchar */
        *orig_field_pos++= (uchar) field_length;
      }
      else
      {
        int2store(orig_field_pos, field_length);
        orig_field_pos+= 2;
      }
      memcpy(orig_field_pos, header, field_length);
      header+= field_length;
      break;
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      /*
        Store blob length plus a pointer to the blob data, which still
        lives inside the undo entry (the pointer value of 'header').
      */
      _ma_store_blob_length(orig_field_pos, size_length, field_length);
      memcpy(orig_field_pos + size_length, &header, sizeof(header));
      header+= field_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  /* Fields not marked in changed_fields keep their current value */
  copy_not_changed_fields(info, &info->changed_fields,
                          orig_record, current_record);

  if (share->calc_checksum)
  {
    info->new_row.checksum= checksum_delta +
      (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
    /* verify that record's content is sane */
    DBUG_ASSERT(info->new_row.checksum ==
                (*share->calc_checksum)(info, current_record));
  }

  info->last_auto_increment= ~ (ulonglong) 0;
  /* Now records are up to date, execute the update to original values */
  if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
                                   extent_count, extent_info,
                                   current_record, orig_record, undo_lsn))
    goto err;

  error= 0;
end:
  my_free(current_record);
  DBUG_RETURN(error);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  error= 1;
  _ma_mark_file_crashed(share);
  goto end;
}
7495 
7496 
7497 /**
7498   Execute undo of a bulk insert which used repair
7499 
7500   @return Operation status
7501     @retval 0      OK
7502     @retval 1      Error
7503 */
7504 
_ma_apply_undo_bulk_insert(MARIA_HA * info,LSN undo_lsn)7505 my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
7506 {
7507   my_bool error;
7508   LSN lsn;
7509   DBUG_ENTER("_ma_apply_undo_bulk_insert");
7510   /*
7511     We delete all rows, re-enable indices as bulk insert had disabled
7512     non-unique ones.
7513   */
7514   error= (maria_delete_all_rows(info) ||
7515           maria_enable_indexes(info) ||
7516           /* we enabled indices so need '2' below */
7517           _ma_state_info_write(info->s,
7518                                MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
7519                                MA_STATE_INFO_WRITE_FULL_INFO |
7520                                MA_STATE_INFO_WRITE_LOCK) ||
7521           _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
7522                         FALSE, 0, &lsn, NULL));
7523   DBUG_RETURN(error);
7524 }
7525 
7526 
7527 /**
7528   @brief Get the TRANSLOG_ADDRESS to flush up to
7529 
7530   @param page            Page's content
7531   @param page_no         Page's number (<offset>/<page length>)
7532   @param data_ptr        Callback data pointer (pointer to MARIA_SHARE)
7533 
7534   @note
7535   Usable for data (non-bitmap) and index pages
7536 
7537   @retval LSN to flush up to
7538 */
7539 
7540 TRANSLOG_ADDRESS
maria_page_get_lsn(uchar * page,pgcache_page_no_t page_no,uchar * data_ptr)7541 maria_page_get_lsn(uchar *page,
7542                    pgcache_page_no_t page_no __attribute__((unused)),
7543                    uchar* data_ptr __attribute__((unused)))
7544 {
7545 #ifndef DBUG_OFF
7546   const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
7547   DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
7548               share->now_transactional);
7549 #endif
7550   return lsn_korr(page);
7551 }
7552 
7553 
7554 /**
7555   @brief Enable reading of all rows, ignoring versioning
7556 
7557   @note
7558     This is mainly useful in single user applications, like maria_pack,
7559     where we want to be able to read all rows without having to read the
7560     transaction id from the control file
7561 */
7562 
void maria_ignore_trids(MARIA_HA *info)
{
  if (!info->s->base.born_transactional)
    return;                                     /* Nothing to do */

  if (!info->trn)
    _ma_set_tmp_trn_for_table(info, &dummy_transaction_object);
  /* Ignore transaction id when row is read */
  info->trn->min_read_from= ~(TrID) 0;
}
7573 
7574 
7575 #ifndef DBUG_OFF
7576 
7577 /* The following functions are useful to call from debugger */
7578 
void _ma_print_block_info(MARIA_SHARE *share, uchar *buff)
{
  LSN lsn= lsn_korr(buff);
  uint dir_entries= (uint) buff[DIR_COUNT_OFFSET];

  /* Dump the page header fields, then the row directory */
  printf("LSN: " LSN_FMT "  type: %u  dir_entries: %u  dir_free: %u  empty_space: %u\n",
         LSN_IN_PARTS(lsn),
         (uint) buff[PAGE_TYPE_OFFSET],
         dir_entries,
         (uint) buff[DIR_FREE_OFFSET],
         (uint) uint2korr(buff + EMPTY_SPACE_OFFSET));
  printf("Start of directory: %lu\n",
         maria_block_size - PAGE_SUFFIX_SIZE - dir_entries * DIR_ENTRY_SIZE);
  _ma_print_directory(share, stdout, buff, maria_block_size);
}
7594 #endif
7595