1 /* Copyright (C) 2007-2008 Michael Widenius
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /*
17   Storage of records in block
18 
19   Some clarifications about the abbrev used:
20 
  NULL fields      -> Fields that may contain a NULL value.
22   Not null fields  -> Fields that may not contain a NULL value.
23   Critical fields  -> Fields that can't be null and can't be dropped without
24 		      causing a table reorganization.
25 
26 
27   Maria will have a LSN at start of each page (excluding the bitmap pages)
28 
29   The different page types that are in a data file are:
30 
31   Bitmap pages     Map of free pages in the next extent (8192 page size
32                    gives us 256M of mapped pages / bitmap)
33   Head page        Start of rows are stored on this page.
34                    A rowid always points to a head page
35   Blob page        This page is totally filled with data from one blob or by
36                    a set of long VARCHAR/CHAR fields
37   Tail page        This contains the last part from different rows, blobs
38                    or varchar fields.
39 
40   The data file starts with a bitmap page, followed by as many data
41   pages as the bitmap can cover. After this there is a new bitmap page
42   and more data pages etc.
43 
44   For information about the bitmap page, see ma_bitmap.c
45 
46   Structure of data and tail page:
47 
48   The page has a row directory at end of page to allow us to do deletes
49   without having to reorganize the page.  It also allows us to later store
50   some more bytes after each row to allow them to grow without having to move
51   around other rows.
52 
53   Page header:
54 
55   LSN            7 bytes   Log position for last page change
56   PAGE_TYPE      1 uchar   0 unalloced / 1 for head / 2 for tail / 3 for blob
57   DIR_COUNT      1 uchar   Number of row/tail entries on page
58   FREE_DIR_LINK  1 uchar   Pointer to first free director entry or 255 if no
59   empty space    2 bytes   Bytes of empty space on page
60 
61   The most significant bit in PAGE_TYPE is set to 1 if the data on the page
62   can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
63 
64   Row data
65 
66   Row directory of NO entries, that consist of the following for each row
67   (in reverse order; i.e., first record is stored last):
68 
69   Position     2 bytes Position of row on page
70   Length       2 bytes Length of entry
71 
72   For Position and Length, the 1 most significant bit of the position and
73   the 1 most significant bit of the length could be used for some states of
74   the row (in other words, we should try to keep these reserved)
75 
76   Position is 0 if the entry is not used.  In this case length[0] points
77   to a previous free entry (255 if no previous entry) and length[1]
78   to the next free entry (or 255 if last free entry). This works because
79   the directory entry 255 can never be marked free (if the first directory
  entry is freed, the directory is shrunk).
81 
82   checksum     4 bytes  Reserved for full page read testing and live backup.
83 
84   ----------------
85 
86   Structure of blob pages:
87 
88   LSN          7 bytes  Log position for last page change
89   PAGE_TYPE    1 uchar   3
90 
91   data
92 
93   -----------------
94 
95   Row data structure:
96 
97   Flag                          1 uchar   Marker of which header field exists
98   TRANSID                       6 bytes  TRANSID of changing transaction
99                                          (optional, added on insert and first
100                                          update/delete)
101   VER_PTR                       7 bytes  Pointer to older version in log
102                                          (undo record)
103                                          (optional, added after first
104                                          update/delete)
105   DELETE_TRANSID                6 bytes  (optional). TRANSID of original row.
106                                          Added on delete.
107   Nulls_extended                1 uchar   To allow us to add new DEFAULT NULL
108                                          fields (optional, added after first
109                                          change of row after alter table)
110   Number of ROW_EXTENT's        1-3 uchar Length encoded, optional
111                                          This is the number of extents the
112                                          row is split into
113   First row_extent              7 uchar  Pointer to first row extent (optional)
114 
115   Total length of length array  1-3 uchar Only used if we have
116                                          char/varchar/blob fields.
117   Row checksum		        1 uchar   Only if table created with checksums
118   Null_bits             ..      One bit for each NULL field (a field that may
119 				have the value NULL)
120   Empty_bits            ..      One bit for each field that may be 'empty'.
121 				(Both for null and not null fields).
122                                 This bit is 1 if the value for the field is
123                                 0 or empty string.
124 
125   field_offsets                 2 byte/offset
126                                   For each 32'th field, there is one offset
127                                   that points to where the field information
128                                   starts in the block. This is to provide
129                                   fast access to later field in the row
130                                   when we only need to return a small
131                                   set of fields.
132                                   TODO: Implement this.
133 
134   Things marked above as 'optional' will only be present if the
135   corresponding bit is set in 'Flag' field.  Flag gives us a way to
136   get more space on a page when doing page compaction as we don't need
137   to store TRANSID that have committed before the smallest running
138   transaction we have in memory.
139 
140   Data in the following order:
141   (Field order is precalculated when table is created)
142 
143   Critical fixed length, not null, fields. (Note, these can't be dropped)
144   Fixed length, null fields
145 
146   Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
147   Number of bytes used in length array per entry is depending on max length
148   for field.
149 
150   ROW_EXTENT's
151   CHAR data (space stripped)
152   VARCHAR data
153   BLOB data
154 
155   Fields marked in null_bits or empty_bits are not stored in data part or
156   length array.
157 
158   If row doesn't fit into the given block, then the first EXTENT will be
159   stored last on the row. This is done so that we don't break any field
160   data in the middle.
161 
162   We first try to store the full row into one block. If that's not possible
163   we move out each big blob into their own extents. If this is not enough we
164   move out a concatenation of all varchars to their own extent.
165 
166   Each blob and the concatenated char/varchar fields are stored the following
167   way:
168   - Store the parts in as many full-contiguous pages as possible.
169   - The last part, that doesn't fill a full page, is stored in tail page.
170 
171   When doing an insert of a new row, we don't have to have
172   VER_PTR in the row. This will make rows that are not changed stored
173   efficiently. On update and delete we would add TRANSID (if it was an old
174   committed row) and VER_PTR to
175   the row. On row page compaction we can easily detect rows where
176   TRANSID was committed before the longest running transaction
177   started and we can then delete TRANSID and VER_PTR from the row to
178   gain more space.
179 
180   If a row is deleted in Maria, we change TRANSID to the deleting
181   transaction's id, change VER_PTR to point to the undo record for the delete,
182   and add DELETE_TRANSID (the id of the transaction which last
183   inserted/updated the row before its deletion). DELETE_TRANSID allows an old
184   transaction to avoid reading the log to know if it can see the last version
185   before delete (in other words it reduces the probability of having to follow
186   VER_PTR). TODO: depending on a compilation option, evaluate the performance
187   impact of not storing DELETE_TRANSID (which would make the row smaller).
188 
189   Description of the different parts:
190 
191   Flag is coded as:
192 
193   Description           bit
194   TRANS_ID_exists       0
195   VER_PTR_exists        1
196   Row is deleted        2       (Means that DELETE_TRANSID exists)
197   Nulls_extended_exists 3
198   Row is split          7       This means that 'Number_of_row_extents' exists
199 
200   Nulls_extended is the number of new DEFAULT NULL fields in the row
201   compared to the number of DEFAULT NULL fields when the first version
202   of the table was created.  If Nulls_extended doesn't exist in the row,
203   we know it's 0 as this must be one of the original rows from when the
  table was created first time.  This coding allows us to add 255*8 =
  2040 new fields without requiring a full alter table.
206 
207   Empty_bits is used to allow us to store 0, 0.0, empty string, empty
208   varstring and empty blob efficiently. (This is very good for data
209   warehousing where NULL's are often regarded as evil). Having this
210   bitmap also allows us to drop information of a field during a future
211   delete if field was deleted with ALTER TABLE DROP COLUMN.  To be able
212   to handle DROP COLUMN, we must store in the index header the fields
213   that has been dropped. When unpacking a row we will ignore dropped
214   fields. When storing a row, we will mark a dropped field either with a
215   null in the null bit map or in the empty_bits and not store any data
216   for it.
217   TODO: Add code for handling dropped fields.
218 
219 
220   A ROW EXTENT is range of pages. One ROW_EXTENT is coded as:
221 
222   START_PAGE            5 bytes
  PAGE_COUNT            2 bytes.  Bit 16 is set if this is a tail page.
                                  Bit 15 is set if this is the start of a new
                                  blob extent.
226 
227   With 8K pages, we can cover 256M in one extent. This coding gives us a
228   maximum file size of 2^40*8192 = 8192 tera
229 
230   As an example of ROW_EXTENT handling, assume a row with one integer
231   field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
232   big BLOB fields that we have updated.
233 
234   The record format for storing this into an empty file would be:
235 
236   Page 1:
237 
238   00 00 00 00 00 00 00          LSN
239   01                            Only one row in page
240   FF                            No free dir entry
241   xx xx                         Empty space on page
242 
243   10                            Flag: row split, VER_PTR exists
244   01 00 00 00 00 00             TRANSID 1
245   00 00 00 00 00 01 00          VER_PTR to first block in LOG file 1
246   5                             Number of row extents
247   02 00 00 00 00 03 00          VARCHAR's are stored in full pages 2,3,4
248   0                             No null fields
249   0                             No empty fields
250   05 00 00 00 00 00 80          Tail page for VARCHAR, rowid 0
251   06 00 00 00 00 80 00          First blob, stored at page 6-133
252   05 00 00 00 00 01 80          Tail of first blob (896 bytes) at page 5
253   86 00 00 00 00 80 00          Second blob, stored at page 134-262
254   05 00 00 00 00 02 80          Tail of second blob (896 bytes) at page 5
255   05 00                         5 integer
256   FA                            Length of first varchar field (size 250)
257   00 60                         Length of second varchar field (size 8192*3)
258   00 60 10                      First medium BLOB, 1M
259   01 00 10 00                   Second BLOB, 1M
260   xx xx xx xx xx xx             Varchars are stored here until end of page
261 
262   ..... until end of page
263 
264   09 00 F4 1F                   Start position 9, length 8180
265   xx xx xx xx			Checksum
266 
267   A data page is allowed to have a wrong CRC and header as long as it is
268   marked empty in the bitmap and its directory's count is 0.
269 */
270 
271 #include "maria_def.h"
272 #include "ma_blockrec.h"
273 #include "trnman.h"
274 #include "ma_trnman.h"
275 #include "ma_key_recover.h"
276 #include "ma_recovery_util.h"
277 #include <lf.h>
278 
279 /*
280   Struct for having a cursor over a set of extent.
281   This is used to loop over all extents for a row when reading
282   the row data. It's also used to store the tail positions for
283   a read row to be used by a later update/delete command.
284 */
285 
typedef struct st_maria_extent_cursor
{
  /*
    Pointer to packed uchar array of extents for the row.
    Format is described above in the header (ROW_EXTENT:
    START_PAGE 5 bytes + PAGE_COUNT 2 bytes per entry)
  */
  uchar *extent;
  /* Where data starts on page; Only for debugging */
  uchar *data_start;
  /* Position to all tails in the row. Updated when reading a row */
  MARIA_RECORD_POS *tail_positions;
  /* Current page (page number in the data file) */
  pgcache_page_no_t page;
  /* How many pages in the current page region (extent) */
  uint page_count;
  /* What kind of lock to use for tail pages */
  enum pagecache_page_lock lock_for_tail_pages;
  /* Total number of extents (i.e., entries in the 'extent' slot) */
  uint extent_count;
  /* <> 0 if current extent is a tail page; Set while using cursor */
  uint tail;
  /* Position (directory entry number) for tail on tail page */
  uint tail_row_nr;
  /*
    == 1 if we are working on the first extent (i.e., the one that is stored in
    the row header, not an extent that is stored as part of the row data).
  */
  my_bool first_extent;
} MARIA_EXTENT_CURSOR;
315 
316 
317 /**
318    @brief Structure for passing down info to write_hook_for_clr_end().
319    This hooks needs to know the variation of the live checksum caused by the
320    current operation to update state.checksum under log's mutex,
321    needs to know the transaction's previous undo_lsn to set
322    trn->undo_lsn under log mutex, and needs to know the type of UNDO being
323    undone now to modify state.records under log mutex.
324 */
325 
/**
  Store the live-checksum delta for the current operation in the record.

  S:share, D:checksum_delta (always assigned; 0 when checksums are off),
  E:expression computing the delta, P:pointer_into_record where the
  checksum is stored, L:length (increased by HA_CHECKSUM_STORE_SIZE
  only when a checksum was actually stored).

  The checksum is only computed and stored when the table was created
  with checksums (S->calc_checksum != NULL).
*/
#define store_checksum_in_rec(S,D,E,P,L)        do      \
  {                                                     \
    D= 0;                                               \
    if ((S)->calc_checksum != NULL)                     \
    {                                                   \
      D= (E);                                           \
      ha_checksum_store(P, D);                          \
      L+= HA_CHECKSUM_STORE_SIZE;                       \
    }                                                   \
  } while (0)
337 
338 
/* Forward declarations of helper functions local to this file */
static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update);
#ifndef DBUG_OFF
static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size);
#endif
static uchar *store_page_range(MARIA_SHARE *share,
                               uchar *to, MARIA_BITMAP_BLOCK *block,
                               ulong length,
                               uint *tot_ranges);
static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
                                     const uchar *newrec,
                                     LEX_CUSTRING *log_parts,
                                     uint *log_parts_count);
358 
359 /****************************************************************************
360   Initialization
361 ****************************************************************************/
362 
363 /*
364   Initialize data needed for block structures
365 */
366 
367 
/*
  Size in bytes of each optional row-header element, indexed by the bit
  position in the row 'Flag' byte (see "Flag is coded as" in the file
  header comment above)
*/

static uchar header_sizes[]=
{
  TRANSID_SIZE,                                 /* Flag bit 0: TRANSID */
  VERPTR_SIZE,                                  /* Flag bit 1: VER_PTR */
  TRANSID_SIZE,                                 /* Delete transid */
  1                                             /* Null extends */
};
377 
378 /*
379   Calculate array of all used headers
380 
381   Used to speed up:
382 
383   size= 1;
384   if (flag & 1)
385     size+= TRANSID_SIZE;
386   if (flag & 2)
387     size+= VERPTR_SIZE;
388   if (flag & 4)
389     size+= TRANSID_SIZE
390   if (flag & 8)
391     size+= 1;
392 
393    NOTES
394      This is called only once at startup of Maria
395 */
396 
/*
  total_header_size[flag & PRECALC_HEADER_BITMASK] gives the total row
  header size (including the flag byte itself) for that combination of
  flag bits.  Filled in once by _ma_init_block_record_data().
*/
static uchar total_header_size[1 << array_elements(header_sizes)];
#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
399 
_ma_init_block_record_data(void)400 void _ma_init_block_record_data(void)
401 {
402   uint i;
403   bzero(total_header_size, sizeof(total_header_size));
404   total_header_size[0]= FLAG_SIZE;              /* Flag uchar */
405   for (i= 1; i < array_elements(total_header_size); i++)
406   {
407     uint size= FLAG_SIZE, j, bit;
408     for (j= 0; (bit= (1 << j)) <= i; j++)
409     {
410       if (i & bit)
411         size+= header_sizes[j];
412     }
413     total_header_size[i]= size;
414   }
415 }
416 
417 
/*
  Initialize the data-file part of a share: set up the bitmap and compute
  the maximum data file length that can be addressed with a row pointer
  of size rec_reflength.

  RETURN
    0  ok
    1  error (from _ma_bitmap_init)
*/

my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
{
  my_bool res;
  pgcache_page_no_t last_page;

  /*
    First calculate the max file length we can have with a pointer of size
    rec_reflength.

    The 'rec_reflength - 1' is because one byte is used for row
    position within the page.
    The /2 comes from _ma_transaction_recpos_to_keypos() where we use
    the lowest bit to mark if there is a transid following the rownr.
  */
  last_page= ((ulonglong) 1 << ((share->base.rec_reflength-1)*8))/2;
  if (!last_page)                                  /* Overflow; set max size */
    last_page= ~(pgcache_page_no_t) 0;

  /* _ma_bitmap_init() may lower last_page to what the bitmap can cover */
  res= _ma_bitmap_init(share, data_file, &last_page);
  share->base.max_data_file_length= _ma_safe_mul(last_page + 1,
                                                 share->block_size);
#if SIZEOF_OFF_T == 4
  /* With a 32-bit off_t the file can not be larger than 2G-1 */
  set_if_smaller(share->base.max_data_file_length, INT_MAX32);
#endif
  return res;
}
444 
445 
/*
  Free the bitmap resources of a share and flush, sync and close the
  bitmap/data file.  Counterpart of _ma_once_init_block_record().

  RETURN
    0  ok
    1  some step failed (all cleanup steps are still attempted)
*/

my_bool _ma_once_end_block_record(MARIA_SHARE *share)
{
  int res= _ma_bitmap_end(share);
  if (share->bitmap.file.file >= 0)
  {
    /* If the table is being deleted, changed pages can be thrown away */
    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
                       share->deleting ? FLUSH_IGNORE_CHANGED : FLUSH_RELEASE))
      res= 1;
    /*
      File must be synced as it is going out of the maria_open_list and so
      becoming unknown to Checkpoint.
    */
    if (share->now_transactional &&
        mysql_file_sync(share->bitmap.file.file, MYF(MY_WME)))
      res= 1;
    if (mysql_file_close(share->bitmap.file.file, MYF(MY_WME)))
      res= 1;
    /*
      Trivial assignment to guard against multiple invocations
      (May happen if file are closed but we want to keep the maria object
      around a bit longer)
    */
    share->bitmap.file.file= -1;
  }
  if (share->id != 0)
  {
    /*
      We de-assign the id even though index has not been flushed, this is ok
      as close_lock serializes us with a Checkpoint looking at our share.
    */
    translog_deassign_id_from_share(share);
  }
  return res;
}
480 
481 
482 /* Init info->cur_row structure */
483 
/*
  Init info->cur_row and info->new_row structures for a handler instance.

  Allocates (in one my_multi_malloc chunk) the per-row work buffers for
  both the current and the new row, plus the log-part and update-field
  buffers, and reserves an initial extents buffer and bitmap-block
  dynamic array to avoid mallocs during normal execution.

  RETURN
    0  ok
    1  out of memory (partially allocated resources are freed via
       _ma_end_block_record())
*/

my_bool _ma_init_block_record(MARIA_HA *info)
{
  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
  MARIA_SHARE *share= info->s;
  uint default_extents;
  DBUG_ENTER("_ma_init_block_record");

  /*
    All buffers below are allocated as one chunk; it is freed as one unit
    through row->empty_bits (the first pointer) in _ma_end_block_record()
  */
  if (!my_multi_malloc(MY_WME,
                       &row->empty_bits, share->base.pack_bytes,
                       &row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
                       &row->null_field_lengths, (sizeof(uint) *
                                                  (share->base.fields -
                                                   share->base.blobs +
                                                   EXTRA_LENGTH_FIELDS)),
                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
                                              (share->base.blobs + 2)),
                       &new_row->empty_bits, share->base.pack_bytes,
                       &new_row->field_lengths,
                       share->base.max_field_lengths + 2,
                       &new_row->blob_lengths,
                       sizeof(ulong) * share->base.blobs,
                       &new_row->null_field_lengths, (sizeof(uint) *
                                                      (share->base.fields -
                                                       share->base.blobs +
                                                       EXTRA_LENGTH_FIELDS)),
                       &info->log_row_parts,
                       sizeof(*info->log_row_parts) *
                       (TRANSLOG_INTERNAL_PARTS + 3 +
                        share->base.fields + 3),
                       &info->update_field_data,
                       (share->base.fields * 4 +
                        share->base.max_field_lengths + 1 + 4),
                       NullS, 0))
    DBUG_RETURN(1);
  /* Skip over bytes used to store length of field length for logging */
  row->field_lengths+= 2;
  new_row->field_lengths+= 2;

  /* Reserve some initial space to avoid mallocs during execution */
  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
                    (AVERAGE_BLOB_SIZE /
                     FULL_PAGE_SIZE(share) /
                     BLOB_SEGMENT_MIN_SIZE));

  if (my_init_dynamic_array(&info->bitmap_blocks,
                            sizeof(MARIA_BITMAP_BLOCK), default_extents,
                            64, MYF(0)))
    goto err;
  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
                                         MYF(MY_WME))))
    goto err;

  info->row_base_length= share->base_length;
  info->row_flag= share->base.default_row_flag;

  /*
    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
  */
  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;

  DBUG_RETURN(0);

err:
  _ma_end_block_record(info);
  DBUG_RETURN(1);
}
555 
556 
/*
  Free the per-handler buffers allocated by _ma_init_block_record().
  Safe to call on a partially initialized handler (my_free(NULL) is a
  no-op).
*/

void _ma_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_end_block_record");
  /* empty_bits is the first pointer of the my_multi_malloc() chunk */
  my_free(info->cur_row.empty_bits);
  delete_dynamic(&info->bitmap_blocks);
  my_free(info->cur_row.extents);
  my_free(info->blob_buff);
  /*
    The data file is closed, when needed, in ma_once_end_block_record().
    The following protects us from doing an extra, not allowed, close
    in maria_close()
  */
  info->dfile.file= -1;
  DBUG_VOID_RETURN;
}
572 
573 
574 /****************************************************************************
575   Helper functions
576 ****************************************************************************/
577 
578 /*
  Return the next unused position on the page after a directory entry.

  SYNOPSIS
    start_of_next_entry()
    dir		Directory entry to be used. This can not be the
                last entry on the page!
585 
586   RETURN
587     #   Position in page where next entry starts.
588         Everything between the '*dir' and this are free to be used.
589 */
590 
start_of_next_entry(uchar * dir)591 static inline uint start_of_next_entry(uchar *dir)
592 {
593   uchar *prev;
594   /*
595      Find previous used entry. (There is always a previous entry as
596      the directory never starts with a deleted entry)
597   */
598   for (prev= dir - DIR_ENTRY_SIZE ;
599        prev[0] == 0 && prev[1] == 0 ;
600        prev-= DIR_ENTRY_SIZE)
601   {}
602   return (uint) uint2korr(prev);
603 }
604 
605 
606 /*
607   Return the offset where the previous entry ends (before on page)
608 
609   SYNOPSIS
610     end_of_previous_entry()
611     dir		Address for current directory entry
612     end         Address to last directory entry
613 
614   RETURN
615     #   Position where previous entry ends (smallest address on page)
616         Everything between # and current entry are free to be used.
617 */
618 
619 
end_of_previous_entry(MARIA_SHARE * share,uchar * dir,uchar * end)620 static inline uint end_of_previous_entry(MARIA_SHARE *share,
621                                          uchar *dir, uchar *end)
622 {
623   uchar *pos;
624   for (pos= dir + DIR_ENTRY_SIZE ; pos < end ; pos+= DIR_ENTRY_SIZE)
625   {
626     uint offset;
627     if ((offset= uint2korr(pos)))
628       return offset + uint2korr(pos+2);
629   }
630   return PAGE_HEADER_SIZE(share);
631 }
632 
633 
634 #ifndef DBUG_OFF
635 
/*
  Dump the page directory (position:length per entry) to 'file' for
  debugging, asserting that row offsets are in increasing order.
  Only compiled in debug builds.
*/

static void _ma_print_directory(MARIA_SHARE *share,
                                FILE *file, uchar *buff, uint block_size)
{
  uint max_entry= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET], row= 0;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uchar *dir, *end;

  /* Directory grows downwards from page end; entry 0 has the highest address */
  dir= dir_entry_pos(buff, block_size, max_entry-1);
  end= dir_entry_pos(buff, block_size, 0);

  DBUG_LOCK_FILE;                               /* If using DBUG_FILE */
  fprintf(file,"Directory dump (pos:length):\n");

  for (row= 1; dir <= end ; end-= DIR_ENTRY_SIZE, row++)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    fprintf(file, "   %4u:%4u", offset, offset ? length : 0);
    if (!(row % (80/12)))                       /* Line break to fit 80 cols */
      fputc('\n', file);
    if (offset)
    {
      /* Used entries must be stored in increasing offset order */
      DBUG_ASSERT(offset >= end_of_prev_row);
      end_of_prev_row= offset + length;
    }
  }
  fputc('\n', file);
  fflush(file);
  DBUG_UNLOCK_FILE;
}
666 
667 
/*
  Debug check of page-directory consistency:
  - used rows are in increasing offset order, do not overlap and are at
    least min_row_length long
  - the computed empty space matches the page header (or real_empty_size
    when given; pass (uint) -1 to use the value stored on the page)
  - the free-entry linked list is well formed and covers exactly the
    deleted directory entries
  Compiled to a no-op macro in non-debug builds.
*/

static void check_directory(MARIA_SHARE *share,
                            uchar *buff, uint block_size, uint min_row_length,
                            uint real_empty_size)
{
  uchar *dir, *end;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  uint start_of_dir, deleted;
  uint end_of_prev_row= PAGE_HEADER_SIZE(share);
  uint empty_size_on_page;
  uint empty_size;
  uchar free_entry, prev_free_entry;

  dir= dir_entry_pos(buff, block_size, max_entry-1);
  start_of_dir= (uint) (dir - buff);
  end= dir_entry_pos(buff, block_size, 0);
  deleted= empty_size= 0;

  empty_size_on_page= (real_empty_size != (uint) -1 ? real_empty_size :
                       uint2korr(buff + EMPTY_SPACE_OFFSET));

  /* Ensure that all rows are in increasing order and no overlaps */
  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
  {
    uint offset= uint2korr(end);
    uint length= uint2korr(end+2);
    if (offset)
    {
      DBUG_ASSERT(offset >= end_of_prev_row);
      DBUG_ASSERT(!length || length >= min_row_length);
      /* Gap between the previous row and this one counts as empty space */
      empty_size+= offset - end_of_prev_row;
      end_of_prev_row= offset + length;
    }
    else
      deleted++;
  }
  /* Space between last row and start of directory is also empty */
  empty_size+= start_of_dir - end_of_prev_row;
  DBUG_ASSERT(end_of_prev_row <= start_of_dir);
  DBUG_ASSERT(empty_size == empty_size_on_page);

  /* check free links */
  free_entry= buff[DIR_FREE_OFFSET];
  prev_free_entry= END_OF_DIR_FREE_LIST;
  while (free_entry != END_OF_DIR_FREE_LIST)
  {
    uchar *dir= dir_entry_pos(buff, block_size, free_entry);
    /* Free entries have position 0; length bytes hold prev/next links */
    DBUG_ASSERT(dir[0] == 0 && dir[1] == 0);
    DBUG_ASSERT(dir[2] == prev_free_entry);
    prev_free_entry= free_entry;
    free_entry= dir[3];
    deleted--;
  }
  /* Every deleted entry must be reachable through the free list */
  DBUG_ASSERT(deleted == 0);
}
721 #else
722 #define check_directory(A,B,C,D,E)
723 #endif /* DBUG_OFF */
724 
725 
726 /**
727    @brief Calculate if there is enough entries on the page
728 */
729 
enough_free_entries(uchar * buff,uint block_size,uint wanted_entries)730 static my_bool enough_free_entries(uchar *buff, uint block_size,
731                                    uint wanted_entries)
732 {
733   uint entries= (uint) buff[DIR_COUNT_OFFSET];
734   uint needed_free_entries, free_entry;
735 
736   if (entries + wanted_entries <= MAX_ROWS_PER_PAGE)
737     return 1;
738 
739   /* Check if enough free entries in free list */
740   needed_free_entries= entries + wanted_entries - MAX_ROWS_PER_PAGE;
741 
742   free_entry= (uint) buff[DIR_FREE_OFFSET];
743   while (free_entry != END_OF_DIR_FREE_LIST)
744   {
745     uchar *dir;
746     if (!--needed_free_entries)
747       return 1;
748     dir= dir_entry_pos(buff, block_size, free_entry);
749     free_entry= dir[3];
750   }
751   return 0;                                     /* Not enough entries */
752 }
753 
754 
755 /**
756    @brief Check if there is room for more rows on page
757 
758    @fn enough_free_entries_on_page
759 
760    @return 0    Directory is full
761    @return 1	There is room for more entries on the page
762 */
763 
enough_free_entries_on_page(MARIA_SHARE * share,uchar * page_buff)764 my_bool enough_free_entries_on_page(MARIA_SHARE *share,
765                                     uchar *page_buff)
766 {
767   enum en_page_type page_type;
768   page_type= (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
769                                   ~(uchar) PAGE_CAN_BE_COMPACTED);
770 
771   if (page_type == HEAD_PAGE)
772   {
773     uint row_count= (uint) page_buff[DIR_COUNT_OFFSET];
774     return !(row_count == MAX_ROWS_PER_PAGE &&
775              page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
776   }
777   return enough_free_entries(page_buff, share->block_size,
778                              1 + share->base.blobs);
779 }
780 
781 
/**
   @brief Extend a record area to fit a given size block

   @fn extend_area_on_page()
   @param info                  Handler
   @param buff			Page buffer
   @param dir			Pointer to dir entry in buffer
   @param rownr			Row number we are working on
   @param request_length	How much data we want to put at [dir]
   @param empty_space		Total empty space in buffer
			        This is updated with length after dir
                                is allocated and current block freed
   @param ret_offset		Offset of the found area is stored here
   @param ret_length		Length of the found area is stored here
   @param head_page		1 if head page, 0 for tail page

  @implementation
    The logic is as follows (same as in _ma_update_block_record())
    - If new data fits in old block, use old block.
    - Extend block with empty space before block. If enough, use it.
    - Extend block with empty space after block. If enough, use it.
    - Use _ma_compact_block_page() to get all empty space at dir.

  @note
    The given directory entry is updated to the found offset and length.
    empty_space doesn't include the new directory entry

  @return
  @retval 0   ok; *ret_offset / *ret_length describe the found area and
              the same offset/length are also stored in [dir]
  @retval 1   error (wrong info in block)
*/

static my_bool extend_area_on_page(MARIA_HA *info,
                                   uchar *buff, uchar *dir,
                                   uint rownr,
                                   uint request_length,
                                   uint *empty_space, uint *ret_offset,
                                   uint *ret_length,
                                   my_bool head_page)
{
  uint rec_offset, length, org_rec_length;
  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  DBUG_ENTER("extend_area_on_page");

  /*
    We can't check for min length here as we may have called
    extend_directory() to create a new (empty) entry just before
  */
  check_directory(share, buff, block_size, 0, *empty_space);

  rec_offset= uint2korr(dir);
  if (rec_offset)
  {
    /* Extending old row;  Mark current space as 'free' */
    length= org_rec_length= uint2korr(dir + 2);
    DBUG_PRINT("info", ("rec_offset: %u  length: %u  request_length: %u  "
                        "empty_space: %u",
                        rec_offset, org_rec_length, request_length,
                        *empty_space));

    *empty_space+= org_rec_length;
  }
  else
  {
    /* Reusing free directory entry; Free it from the directory list */
    if (dir[2] == END_OF_DIR_FREE_LIST)
      buff[DIR_FREE_OFFSET]= dir[3];            /* Was first in free list */
    else
    {
      /* Unlink from previous free entry (dir[2] is the backlink) */
      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
      prev_dir[3]= dir[3];
    }
    if (dir[3] != END_OF_DIR_FREE_LIST)
    {
      /* Fix backlink of the next free entry */
      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
      next_dir[2]= dir[2];
    }
    rec_offset= start_of_next_entry(dir);
    length= 0;
  }
  if (length < request_length)
  {
    uint old_rec_offset;
    /*
      New data did not fit in old position.
      Find first possible position where to put new data.
    */
    old_rec_offset= rec_offset;
    rec_offset= end_of_previous_entry(share,
                                      dir, buff + block_size -
                                      PAGE_SUFFIX_SIZE);
    length+= (uint) (old_rec_offset - rec_offset);
    DBUG_ASSERT(old_rec_offset);
    /*
      'length' is 0 if we are doing an insert into a not allocated block.
      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
    */
    if (length < request_length)
    {
      /*
        Did not fit in current block + empty space. Extend with
        empty space after block.
      */
      if (rownr == max_entry - 1)
      {
        /* Last entry; Everything is free between this and directory */
        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
                 rec_offset);
      }
      else
        length= start_of_next_entry(dir) - rec_offset;
      DBUG_ASSERT((int) length >= 0);
      if (length < request_length)
      {
        /* Not enough continuous space, compact page to get more */
        int2store(dir, rec_offset);
        /* Reset length, as this may be a deleted block */
        int2store(dir+2, 0);
        _ma_compact_block_page(share,
                               buff, rownr, 1,
                               head_page ? info->trn->min_read_from: 0,
                               head_page ? share->base.min_block_length : 0);
        rec_offset= uint2korr(dir);
        length=     uint2korr(dir+2);
        if (length < request_length)
        {
          /* Even after compaction the page can't hold the data */
          DBUG_PRINT("error", ("Not enough space: "
                               "length: %u  request_length: %u",
                               length, request_length));
          _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
          DBUG_RETURN(1);                       /* Error in block */
        }
        *empty_space= length;                   /* All space is here */
      }
    }
  }
  int2store(dir, rec_offset);
  int2store(dir + 2, length);
  *ret_offset= rec_offset;
  *ret_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space - length);
  DBUG_RETURN(0);
}
937 
938 
939 /**
940    @brief Copy not changed fields from 'from' to 'to'
941 
942    @notes
943    Assumption is that most fields are not changed!
944    (Which is why we don't test if all bits are set for some bytes in bitmap)
945 */
946 
copy_not_changed_fields(MARIA_HA * info,MY_BITMAP * changed_fields,uchar * to,uchar * from)947 void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
948                              uchar *to, uchar *from)
949 {
950   MARIA_COLUMNDEF *column, *end_column;
951   uchar *bitmap= (uchar*) changed_fields->bitmap;
952   MARIA_SHARE *share= info->s;
953   uint bit= 1;
954 
955   for (column= share->columndef, end_column= column+ share->base.fields;
956        column < end_column; column++)
957   {
958     if (!(*bitmap & bit))
959     {
960       uint field_length= column->length;
961       if (column->type == FIELD_VARCHAR)
962       {
963         if (column->fill_length == 1)
964           field_length= (uint) from[column->offset] + 1;
965         else
966           field_length= uint2korr(from + column->offset) + 2;
967       }
968       memcpy(to + column->offset, from + column->offset, field_length);
969     }
970     if ((bit= (bit << 1)) == 256)
971     {
972       bitmap++;
973       bit= 1;
974     }
975   }
976 }
977 
978 #ifdef NOT_YET_NEEDED
979 /* Calculate empty space on a page */
980 
empty_space_on_page(uchar * buff,uint block_size)981 static uint empty_space_on_page(uchar *buff, uint block_size)
982 {
983   enum en_page_type;
984   page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
985                                   ~(uchar) PAGE_CAN_BE_COMPACTED);
986   if (page_type == UNALLOCATED_PAGE)
987     return block_size;
988   if ((uint) page_type <= TAIL_PAGE)
989     return uint2korr(buff+EMPTY_SPACE_OFFSET);
990   return 0;                                     /* Blob page */
991 }
992 #endif
993 
994 
/*
  @brief Ensure we have space for new directory entries

  @fn make_space_for_directory()
  @param info		Handler
  @param buff		Page buffer
  @param max_entry	Number of current entries in directory
  @param count		Number of new entries to be added to directory
  @param first_dir	First directory entry on page
  @param empty_space    Total empty space in buffer. It's updated
			to reflect the new empty space
  @param first_pos      The position just after the last used data byte
                        on the page is stored here
  @param head_page	1 if head page, 0 for tail page.

  @note
  This function is inline as the argument passing is the biggest
  part of the function

  @return
  @retval 0  ok
  @retval 1  error (No data on page, fatal error)
*/

static inline my_bool
make_space_for_directory(MARIA_HA *info,
                         uchar *buff, uint max_entry,
                         uint count, uchar *first_dir, uint *empty_space,
                         uint *first_pos,
                         my_bool head_page)
{
  uint length_needed= DIR_ENTRY_SIZE * count;
  MARIA_SHARE *share= info->s;

  /*
    max_entry is 0 only in the case where an UNDO is used to reinsert
    a row on a previously not used page
  */
  if (likely(max_entry))
  {
    /* Check if there is place for the directory entry on the page */
    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);

    if ((uint) (first_dir - buff) < *first_pos + length_needed)
    {
      /* Create place for directory */
      _ma_compact_block_page(share,
                             buff, max_entry - 1, 0,
                             head_page ? info->trn->min_read_from : 0,
                             head_page ? share->base.min_block_length : 0);
      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
      if (*empty_space < length_needed)
      {
        /*
          We should always have space, as we only come here for
          UNDO of DELETE (in which case we know the row was on the
          page before) or if the bitmap told us there was space on page
        */
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        return(1);
      }
    }
  }
  else
    *first_pos= PAGE_HEADER_SIZE(share);

  /* Reduce directory entry size from free space size */
  (*empty_space)-= length_needed;
  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
  return(0);
}
1066 
1067 
/*
  Find free position in directory

  SYNOPSIS
  find_free_position()
    info                Handler
    buff                Page
    block_size          Size of page
    res_rownr           Store index to free position here
    res_length		Store length of found segment here
    empty_space		Store length of empty space on disk here. This is
		        all empty space, including the found block.
  @param head_page	1 if head page, 0 for tail page.

  NOTES
    If there is a free directory entry (entry with position == 0),
    then use it and change it to be the size of the empty block
    after the previous entry. This guarantees that all row entries
    are stored on disk in inverse directory order, which makes life easier for
    '_ma_compact_block_page()' and to know if there is free space after any
    block.

    If there is no free entry (entry with position == 0), then we create
    a new one. If there is not space for the directory entry (because
    the last block overlaps with the directory), we compact the page.

    We will update the offset and the length of the found dir entry to
    match the position and empty space found.

    buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller

    See start of file for description of how free directory entries are linked

  RETURN
    0      Error (directory full or last block goes over directory)
    #      Pointer to directory entry on page
*/

static uchar *find_free_position(MARIA_HA *info,
                                 uchar *buff, uint block_size, uint *res_rownr,
                                 uint *res_length, uint *empty_space,
                                 my_bool head_page)
{
  uint max_entry, free_entry;
  uint length, first_pos;
  uchar *dir, *first_dir;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("find_free_position");

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  free_entry= (uint) buff[DIR_FREE_OFFSET];
  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);

  DBUG_PRINT("info", ("max_entry: %u  free_entry: %u", max_entry, free_entry));

  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);

  /* Search after first free position */
  if (free_entry != END_OF_DIR_FREE_LIST)
  {
    if (free_entry >= max_entry)
      DBUG_RETURN(0);                           /* Consistency error */
    dir= dir_entry_pos(buff, block_size, free_entry);
    DBUG_ASSERT(uint2korr(dir) == 0 && dir[2] == END_OF_DIR_FREE_LIST);
    /* Relink free list */
    if ((buff[DIR_FREE_OFFSET]= dir[3]) != END_OF_DIR_FREE_LIST)
    {
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT((uint) next_entry[2] == free_entry &&
                  uint2korr(next_entry) == 0);
      next_entry[2]= END_OF_DIR_FREE_LIST;      /* Backlink */
    }

    /* Claim all space between the neighbouring entries for this one */
    first_pos= end_of_previous_entry(share,
                                     dir, buff + block_size -
                                     PAGE_SUFFIX_SIZE);
    length= start_of_next_entry(dir) - first_pos;
    int2store(dir, first_pos);                /* Update dir entry */
    int2store(dir + 2, 0);
    *res_rownr= free_entry;
    *res_length= length;

    check_directory(share, buff, block_size,
                    head_page ? share->base.min_block_length : 0, (uint) -1);
    DBUG_RETURN(dir);
  }
  /* No free places in dir; create a new one */

  /* Check if there is place for the directory entry */
  if (max_entry == MAX_ROWS_PER_PAGE)
    DBUG_RETURN(0);

  if (make_space_for_directory(info, buff, max_entry, 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(0);

  dir= first_dir - DIR_ENTRY_SIZE;
  length= (uint) (dir - buff - first_pos);
  DBUG_ASSERT(length <= *empty_space);
  int2store(dir, first_pos);
  int2store(dir + 2, 0);                      /* Max length of region */
  *res_rownr= max_entry;
  *res_length= length;

  check_directory(share,
                  buff, block_size,
                  head_page ? share->base.min_block_length : 0,
                  *empty_space);
  DBUG_RETURN(dir);
}
1178 
1179 
/**
   @brief Enlarge page directory to hold more entries

   @fn extend_directory()
   @param info          Handler
   @param buff		Page buffer
   @param block_size	Block size
   @param max_entry	Number of directory entries on page
   @param new_entry	Position for new entry
   @param empty_space	Total empty space in buffer. It's updated
			to reflect the new empty space
   @param head_page	1 if head page, 0 for tail page.

   @note
   This is only called on UNDO when we want to expand the directory
   to be able to re-insert row in a given position

   The new directory entry will be set to cover the maximum possible space

   @return
   @retval 0  ok
   @retval 1  error (No data on page, fatal error)
*/

static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
                                uint max_entry, uint new_entry,
                                uint *empty_space, my_bool head_page)
{
  uint length, first_pos;
  uchar *dir, *first_dir;
  DBUG_ENTER("extend_directory");

  /*
    Note that if max_entry is 0, then first_dir will point to
    an illegal directory entry. This is ok, as in this case we will
    not access anything through first_dir.
  */
  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;

  if (make_space_for_directory(info, buff, max_entry,
                               new_entry - max_entry + 1,
                               first_dir, empty_space, &first_pos, head_page))
    DBUG_RETURN(1);

  /* Set the new directory entry to cover the max possible length */
  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
  length= (uint) (dir - buff - first_pos);
  int2store(dir, first_pos);
  int2store(dir+2, length);
  *empty_space-= length;

  if (new_entry-- > max_entry)
  {
    /* Link all row entries between new_entry and max_entry into free list */
    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
    uint prev_entry= END_OF_DIR_FREE_LIST;
    buff[DIR_FREE_OFFSET]= new_entry;
    do
    {
      /* dir[2] is the backlink, dir[3] the forward link in the free list */
      dir+= DIR_ENTRY_SIZE;
      dir[0]= dir[1]= 0;
      dir[2]= (uchar) prev_entry;
      dir[3]= (uchar) new_entry-1;
      prev_entry= new_entry;
    } while (new_entry-- > max_entry);
    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= max_entry;
    }
  }

  check_directory(info->s,
                  buff, block_size,
                  head_page ? MY_MIN(info->s->base.min_block_length, length) :
                  0, *empty_space);
  DBUG_RETURN(0);
}
1261 
1262 
1263 /****************************************************************************
1264   Updating records
1265 ****************************************************************************/
1266 
/*
  Calculate length of all the different field parts

  SYNOPSIS
    calc_record_size()
    info	Maria handler
    record      Row to store
    row		Store statistics about row here

  NOTES
    The statistics is used to find out how much space a row will need
    and also where we can split a row when we need to split it into several
    extents.
*/

static void calc_record_size(MARIA_HA *info, const uchar *record,
                             MARIA_ROW *row)
{
  MARIA_SHARE *share= info->s;
  uchar *field_length_data;
  MARIA_COLUMNDEF *column, *end_column;
  uint *null_field_lengths= row->null_field_lengths;
  ulong *blob_lengths= row->blob_lengths;
  DBUG_ENTER("calc_record_size");

  row->normal_length= row->char_length= row->varchar_length=
    row->blob_length= row->extents_count= 0;

  /* Create empty bitmap and calculate length of each varlength/char field */
  bzero(row->empty_bits, share->base.pack_bytes);
  field_length_data= row->field_lengths;
  for (column= share->columndef + share->base.fixed_not_null_fields,
       end_column= share->columndef + share->base.fields;
       column < end_column; column++, null_field_lengths++)
  {
    if ((record[column->null_pos] & column->null_bit))
    {
      /* NULL field; takes no data space */
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
      DBUG_ASSERT(column->empty_bit == 0);
      /* fall through */
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      row->normal_length+= column->length;
      *null_field_lengths= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      if (memcmp(record+ column->offset, maria_zero_string,
                 column->length) == 0)
      {
        /* All-zero value is stored as just an empty bit */
        row->empty_bits[column->empty_pos] |= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        row->normal_length+= column->length;
        *null_field_lengths= column->length;
      }
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      /* Strip trailing spaces before computing stored length */
      const uchar *pos, *end;
      for (pos= record + column->offset, end= pos + column->length;
           end > pos && end[-1] == ' '; end--)
        ;
      if (pos == end)                           /* If empty string */
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        *null_field_lengths= 0;
      }
      else
      {
        uint length= (uint) (end - pos);
        /* Length is stored in 1 byte for short fields, 2 for long */
        if (column->length <= 255)
          *field_length_data++= (uchar) length;
        else
        {
          int2store(field_length_data, length);
          field_length_data+= 2;
        }
        row->char_length+= length;
        *null_field_lengths= length;
      }
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length, field_length_data_length;
      const uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      field_length_data[0]= field_pos[0];
      if (column->length <= 256)
      {
        length= (uint) (uchar) *field_pos;
        field_length_data_length= 1;
      }
      else
      {
        length= uint2korr(field_pos);
        field_length_data[1]= field_pos[1];
        field_length_data_length= 2;
      }
      *null_field_lengths= length;
      if (!length)
      {
        row->empty_bits[column->empty_pos]|= column->empty_bit;
        break;
      }
      row->varchar_length+= length;
      /* NOTE(review): redundant; *null_field_lengths was already set above */
      *null_field_lengths= length;
      field_length_data+= field_length_data_length;
      break;
    }
    case FIELD_BLOB:
    {
      const uchar *field_pos= record + column->offset;
      /* Blob column stores a length prefix followed by a pointer */
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_pos);

      *blob_lengths++= blob_length;
      if (!blob_length)
        row->empty_bits[column->empty_pos]|= column->empty_bit;
      else
      {
        row->blob_length+= blob_length;
        memcpy(field_length_data, field_pos, size_length);
        field_length_data+= size_length;
      }
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  row->field_lengths_length= (uint) (field_length_data - row->field_lengths);
  /*
    - info->row_base_length is base information we must have on a page in first
      extent:
      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
        table_checksum (0 | 1)
    - row->min_length is minimum amount of data we must store on
      a page. bitmap code will ensure we get at least this much +
      total number of extents and one extent information
    - fixed_not_null_fields_length is length of fixed length fields that can't
      be compacted
    - head_length is the amount of data for the head page
     (ie, all fields except blobs)
  */
  row->min_length=   (info->row_base_length +
                      (share->base.max_field_lengths ?
                       size_to_store_key_length(row->field_lengths_length) :
                       0));
  row->head_length= (row->min_length +
                     share->base.fixed_not_null_fields_length +
                     row->field_lengths_length +
                     row->normal_length +
                     row->char_length + row->varchar_length);
  row->total_length= (row->head_length + row->blob_length);
  if (row->total_length < share->base.min_block_length)
    row->total_length= share->base.min_block_length;
  DBUG_PRINT("exit", ("head_length: %lu  total_length: %lu",
                      (ulong) row->head_length, (ulong) row->total_length));
  DBUG_VOID_RETURN;
}
1439 
1440 
1441 /**
1442   Compact page by removing all space between rows
1443 
1444   Moves up all rows to start of page. Moves blocks that are directly after
1445   each other with one memmove.
1446 
1447   @note if rownr is the last row in the page, and extend_block is false,
1448   caller has to make sure to update bitmap page afterwards to reflect freed
1449   space.
1450 
1451   @param  buff          Page to compact
1452   @param  block_size    Size of page
1453   @param  rownr         Put empty data after this row
1454   @param  extend_block	If 1, extend the block at 'rownr' to cover the
1455                         whole block.
1456   @param  min_read_from If <> 0, remove all trid's that are less than this
1457 */
1458 
_ma_compact_block_page(MARIA_SHARE * share,uchar * buff,uint rownr,my_bool extend_block,TrID min_read_from,uint min_row_length)1459 void _ma_compact_block_page(MARIA_SHARE *share,
1460                             uchar *buff, uint rownr,
1461                             my_bool extend_block, TrID min_read_from,
1462                             uint min_row_length)
1463 {
1464   uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
1465   uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
1466   uint freed_size= 0;
1467   uint block_size= share->block_size;
1468   uchar *dir, *end;
1469   DBUG_ENTER("_ma_compact_block_page");
1470   DBUG_PRINT("enter", ("rownr: %u  min_read_from: %lu", rownr,
1471                        (ulong) min_read_from));
1472   DBUG_ASSERT(max_entry > 0 &&
1473               max_entry < (block_size - PAGE_HEADER_SIZE(share) -
1474                            PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
1475 
1476   /* Move all entries before and including rownr up to start of page */
1477   dir= dir_entry_pos(buff, block_size, rownr);
1478   end= dir_entry_pos(buff, block_size, 0);
1479   page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE(share);
1480   diff= 0;
1481   for (; dir <= end ; end-= DIR_ENTRY_SIZE)
1482   {
1483     uint offset= uint2korr(end);
1484 
1485     if (offset)
1486     {
1487       uint row_length= uint2korr(end + 2);
1488       DBUG_ASSERT(offset >= page_pos);
1489       DBUG_ASSERT(buff + offset + row_length <= dir);
1490       DBUG_ASSERT(row_length >= min_row_length || row_length == 0);
1491 
1492       /* Row length can be zero if row is to be deleted */
1493       if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
1494       {
1495         TrID transid= transid_korr(buff+offset+1);
1496         if (transid < min_read_from)
1497         {
1498           /* Remove transid from row by moving the start point of the row up */
1499           buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1500           offset+= TRANSID_SIZE;
1501           freed_size+= TRANSID_SIZE;
1502           row_length-= TRANSID_SIZE;
1503           int2store(end+2, row_length);
1504         }
1505       }
1506 
1507       if (offset != next_free_pos)
1508       {
1509         uint length= (next_free_pos - start_of_found_block);
1510         /*
1511           There was empty space before this and prev block
1512           Check if we have to move previous block up to page start
1513         */
1514         if (page_pos != start_of_found_block)
1515         {
1516           /* move up previous block */
1517           memmove(buff + page_pos, buff + start_of_found_block, length);
1518         }
1519         page_pos+= length;
1520         /* next continuous block starts here */
1521         start_of_found_block= offset;
1522         diff= offset - page_pos;
1523       }
1524       int2store(end, offset - diff);            /* correct current pos */
1525       next_free_pos= offset + row_length;
1526 
1527       if (unlikely(row_length < min_row_length) && row_length)
1528       {
1529         /*
1530           This can only happen in the case we compacted transid and
1531           the row become 'too short'
1532 
1533           Move the current row down to it's right place and extend it
1534           with 0.
1535         */
1536         uint row_diff= min_row_length - row_length;
1537         uint length= (next_free_pos - start_of_found_block);
1538 
1539         DBUG_ASSERT(page_pos != start_of_found_block);
1540         bmove(buff + page_pos, buff + start_of_found_block, length);
1541         bzero(buff+ page_pos + length, row_diff);
1542         page_pos+= min_row_length;
1543         int2store(end+2, min_row_length);
1544         freed_size-= row_diff;
1545         next_free_pos= start_of_found_block= page_pos;
1546         diff= 0;
1547       }
1548     }
1549   }
1550   if (page_pos != start_of_found_block)
1551   {
1552     uint length= (next_free_pos - start_of_found_block);
1553     memmove(buff + page_pos, buff + start_of_found_block, length);
1554   }
1555   start_of_found_block= uint2korr(dir);
1556 
1557   if (rownr != max_entry - 1)
1558   {
1559     /* Move all entries after rownr to end of page */
1560     uint rownr_length;
1561 
1562     DBUG_ASSERT(extend_block);                  /* Should always be true */
1563     next_free_pos= end_of_found_block= page_pos=
1564       block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
1565     diff= 0;
1566     /* End points to entry before 'rownr' */
1567     for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
1568     {
1569       uint offset= uint2korr(dir);
1570       uint row_length;
1571       uint row_end;
1572       if (!offset)
1573         continue;
1574       row_length= uint2korr(dir + 2);
1575       row_end= offset + row_length;
1576       DBUG_ASSERT(offset >= start_of_found_block &&
1577                   row_end <= next_free_pos && row_length >= min_row_length);
1578 
1579       if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
1580       {
1581         TrID transid= transid_korr(buff + offset+1);
1582         if (transid < min_read_from)
1583         {
1584           /* Remove transid from row */
1585           buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
1586           offset+= TRANSID_SIZE;
1587           row_length-= TRANSID_SIZE;
1588           int2store(dir+2, row_length);
1589         }
1590         if (unlikely(row_length < min_row_length))
1591         {
1592           /*
1593             This can only happen in the case we compacted transid and
1594             the row become 'too short'
1595           */
1596           uint row_diff= min_row_length - row_length;
1597           if (next_free_pos < row_end + row_diff)
1598           {
1599             /*
1600               Not enough space for extending next block with enough
1601               end 0's. Move current data down to get place for them
1602             */
1603             uint move_down= row_diff - (next_free_pos - row_end);
1604             bmove(buff + offset - move_down, buff + offset, row_length);
1605             offset-= move_down;
1606           }
1607           /*
1608             Extend the next block with 0, which will be part of current
1609             row when the blocks are joined together later
1610           */
1611           bzero(buff + next_free_pos - row_diff, row_diff);
1612           next_free_pos-= row_diff;
1613           int2store(dir+2, min_row_length);
1614         }
1615         row_end= offset + row_length;
1616       }
1617 
1618       if (row_end != next_free_pos)
1619       {
1620         uint length= (end_of_found_block - next_free_pos);
1621         if (page_pos != end_of_found_block)
1622         {
1623           /* move next block down */
1624           memmove(buff + page_pos - length, buff + next_free_pos, length);
1625         }
1626         page_pos-= length;
1627         /* next continuous block starts here */
1628         end_of_found_block= row_end;
1629         diff= page_pos - row_end;
1630       }
1631       int2store(dir, offset + diff);            /* correct current pos */
1632       next_free_pos= offset;
1633     }
1634     if (page_pos != end_of_found_block)
1635     {
1636       uint length= (end_of_found_block - next_free_pos);
1637       memmove(buff + page_pos - length, buff + next_free_pos, length);
1638       next_free_pos= page_pos- length;
1639     }
1640 
1641     /* Extend rownr block to cover hole */
1642     rownr_length= next_free_pos - start_of_found_block;
1643     int2store(dir+2, rownr_length);
1644     DBUG_ASSERT(rownr_length >= min_row_length);
1645   }
1646   else
1647   {
1648     if (extend_block)
1649     {
1650       /* Extend last block to cover whole page */
1651       uint length= ((uint) (dir - buff) - start_of_found_block);
1652       int2store(dir+2, length);
1653       DBUG_ASSERT(length >= min_row_length);
1654     }
1655     else
1656     {
1657       /* Add length gained from freed transaction id's to this page */
1658       uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
1659       int2store(buff + EMPTY_SPACE_OFFSET, length);
1660     }
1661     buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
1662   }
1663   check_directory(share, buff, block_size, min_row_length,
1664                   extend_block ? 0 : (uint) -1);
1665   DBUG_EXECUTE("directory", _ma_print_directory(share,
1666                                                 DBUG_FILE, buff, block_size););
1667   DBUG_VOID_RETURN;
1668 }
1669 
1670 
1671 /*
1672   Create an empty tail or head page
1673 
1674   SYNOPSIS
1675     make_empty_page()
1676     buff		Page buffer
1677     block_size		Block size
1678     page_type		HEAD_PAGE or TAIL_PAGE
    create_dir_entry	TRUE if we should create a directory entry
1680 
1681   NOTES
1682     EMPTY_SPACE is not updated
1683 */
1684 
static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
                            my_bool create_dir_entry)
{
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint header_size= PAGE_HEADER_SIZE(share);
  DBUG_ENTER("make_empty_page");

  /* Clear the page header (LSN and following header fields) */
  bzero(buff, header_size);

#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
  /*
    We zero the rest of the block to avoid getting old memory information
    to disk and to allow the file to be compressed better if archived.
    The code does not assume the block is zeroed.
  */
  if (page_type != BLOB_PAGE)
    bzero(buff + header_size, block_size - header_size);
#endif
  buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
  buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
  buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
  if (create_dir_entry)
  {
    /* Create directory entry to point to start of page with size 0 */
    uchar *dir= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
    int2store(dir, header_size);
    int2store(dir + 2, 0);
  }
  DBUG_VOID_RETURN;
}
1715 
1716 
1717 /*
1718   Read or initialize new head or tail page
1719 
1720   SYNOPSIS
1721     get_head_or_tail_page()
1722     info                        Maria handler
1723     block                       Block to read
1724     buff                        Suggest this buffer to key cache
1725     length                      Minimum space needed
1726     page_type			HEAD_PAGE || TAIL_PAGE
1727     res                         Store result position here
1728 
1729   NOTES
    We don't decrement buff[EMPTY_SPACE_OFFSET] by the allocated data
    as we don't know how much data the caller will actually use.
1732 
1733     res->empty_space is set to length of empty space
1734 
1735   RETURN
1736     0  ok     All slots in 'res' are updated
1737     1  error  my_errno is set
1738 */
1739 
/*
  Describes a position on a head or tail page where a row (or row part)
  can be written.  Filled in by get_head_or_tail_page() and
  get_rowpos_in_head_or_tail_page().
*/
struct st_row_pos_info
{
  uchar *buff;                                  /* page buffer */
  uchar *data;                                  /* Place for data */
  uchar *dir;                                   /* Directory */
  uint length;                                  /* Length for data */
  uint rownr;                                   /* Offset in directory */
  uint empty_space;                             /* Space left on page */
};
1749 
1750 
static my_bool get_head_or_tail_page(MARIA_HA *info,
                                     const MARIA_BITMAP_BLOCK *block,
                                     uchar *buff, uint length, uint page_type,
                                     enum pagecache_page_lock lock,
                                     struct st_row_pos_info *res)
{
  uint block_size;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("get_head_or_tail_page");
  DBUG_PRINT("enter", ("page_type: %u  length: %u", page_type, length));

  block_size= share->block_size;
  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; the caller's buffer is used directly as the page image */
    make_empty_page(info, buff, page_type, 1);
    res->buff= buff;
    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE(share));
    /* Data area starts directly after the page header */
    res->data= (buff + PAGE_HEADER_SIZE(share));
    res->dir= res->data + res->length;
    res->rownr= 0;
    DBUG_ASSERT(length <= res->length);
  }
  else
  {
    uchar *dir;
    /* Read old page */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    res->buff= pagecache_read(share->pagecache, &info->dfile,
                              block->page, 0, 0, share->page_type,
                              lock, &page_link.link);
    /*
      Register the pin even on read failure ('changed' is then 0) so that
      the pinned_pages bookkeeping stays consistent for the caller.
    */
    page_link.changed= res->buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)
      goto crashed;

    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                page_type);
    if (!(dir= find_free_position(info, res->buff, block_size, &res->rownr,
                                  &res->length, &res->empty_space,
                                  page_type == HEAD_PAGE)))
      goto crashed;

    if (res->length < length)
    {
      /*
        The found slot is too small, but the page as a whole may still
        hold the row: compact the page to gather all free space together.
      */
      if (res->empty_space + res->length >= length)
      {
        _ma_compact_block_page(share,
                               res->buff, res->rownr, 1,
                               (page_type == HEAD_PAGE ?
                                info->trn->min_read_from : 0),
                               (page_type == HEAD_PAGE ?
                                share->base.min_block_length :
                                0));
        /* All empty space are now after current position */
        dir= dir_entry_pos(res->buff, block_size, res->rownr);
        res->length= res->empty_space= uint2korr(dir+2);
      }
      if (res->length < length)
      {
        DBUG_PRINT("error", ("length: %u  res->length: %u  empty_space: %u",
                             length, res->length, res->empty_space));
        goto crashed;                         /* Wrong bitmap information */
      }
    }
    res->dir= dir;
    res->data= res->buff + uint2korr(dir);
  }
  DBUG_RETURN(0);

crashed:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1827 
1828 
1829 /*
1830   @brief Create room for a head or tail row on a given page at given position
1831 
1832   @fn get_rowpos_in_head_or_tail_page()
1833   @param info                        Maria handler
1834   @param block                       Block to read
1835   @param buff                        Suggest this buffer to key cache
1836   @param length                      Minimum space needed
1837   @param page_type	             HEAD_PAGE || TAIL_PAGE
1838   @param rownr			     Rownr to use
1839   @param res                         Store result position here
1840 
1841   @note
    This is essentially the same as get_head_or_tail_page, with the difference
    that the caller specifies at what position the row should be put.
    This is used when restoring a row to its original position as
1845     part of UNDO DELETE or UNDO UPDATE
1846 
1847   @return
1848   @retval 0  ok     All slots in 'res' are updated
1849   @retval 1  error  my_errno is set
1850 */
1851 
static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
                                               const MARIA_BITMAP_BLOCK *block,
                                               uchar *buff, uint length,
                                               uint page_type,
                                               enum pagecache_page_lock lock,
                                               uint rownr,
                                               struct st_row_pos_info *res)
{
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  uchar *dir;
  uint block_size= share->block_size;
  uint max_entry, max_length, rec_offset;
  DBUG_ENTER("get_rowpos_in_head_or_tail_page");

  if (block->org_bitmap_value == 0)             /* Empty block */
  {
    /* New page; no directory entry created here (rownr is added below) */
    make_empty_page(info, buff, page_type, 0);
    res->empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
  }
  else
  {
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    buff= pagecache_read(share->pagecache, &info->dfile,
                         block->page, 0, 0, share->page_type,
                         lock, &page_link.link);
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
    if (!page_link.changed)                     /* Read error */
      goto err;
    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
                (uchar) page_type);
    /* Same condition as the assert above, but also checked in release */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
      goto err;
    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  }

  max_entry= (uint) buff[DIR_COUNT_OFFSET];
  if (max_entry <= rownr)
  {
    /* Directory is too small to address 'rownr'; grow it */
    if (extend_directory(info, buff, block_size,
                         max_entry, rownr, &res->empty_space,
                         page_type == HEAD_PAGE))
      goto err;
  }

  /*
    The following dir entry is unused in case of insert / update but
    not in case of undo_update / undo_delete
  */
  dir= dir_entry_pos(buff, block_size, rownr);

  /* Make room for 'length' bytes at the row's original position */
  if (extend_area_on_page(info, buff, dir, rownr, length,
                          &res->empty_space, &rec_offset, &max_length,
                          page_type == HEAD_PAGE))
    goto err;

  res->buff= buff;
  res->rownr= rownr;
  res->dir= dir;
  res->data= buff + rec_offset;
  res->length= length;
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);  /* File crashed */
  DBUG_RETURN(1);
}
1922 
1923 
1924 /*
1925   Write tail for head data or blob
1926 
1927   SYNOPSIS
1928     write_tail()
1929     info                Maria handler
1930     block               Block to tail page
1931     row_part            Data to write to page
1932     length              Length of data
1933 
1934   NOTES
1935     block->page_count is updated to the directory offset for the tail
1936     so that we can store the position in the row extent information
1937 
1938   RETURN
1939     0  ok
1940        block->page_count is set to point (dir entry + TAIL_BIT)
1941 
1942     1  error; In this case my_errno is set to the error
1943 */
1944 
static my_bool write_tail(MARIA_HA *info,
                          MARIA_BITMAP_BLOCK *block,
                          uchar *row_part, uint org_length)
{
  MARIA_SHARE *share= info->s;
  MARIA_PINNED_PAGE page_link;
  uint block_size= share->block_size, empty_space, length= org_length;
  struct st_row_pos_info row_pos;
  my_off_t position;
  my_bool res, block_is_read;
  DBUG_ENTER("write_tail");
  DBUG_PRINT("enter", ("page: %lu  length: %u",
                       (ulong) block->page, length));

  /* info->keyread_buff is reused below as the page buffer */
  info->keyread_buff_used= 1;
  /*
    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
    some place to grow in the future)
  */
  if (length < MIN_TAIL_SIZE)
    length= MIN_TAIL_SIZE;

  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
  {
    /*
      Create new tail
      page will be pinned & locked by get_head_or_tail_page
    */
    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
                              &row_pos))
      DBUG_RETURN(1);
  }
  else
  {
    /* Write tail on predefined row position */
    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
                                        length, TAIL_PAGE,
                                        PAGECACHE_LOCK_WRITE,
                                        block->page_count & ~TAIL_BIT,
                                        &row_pos))
      DBUG_RETURN(1);
  }
  DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
                      (ulong) ma_recordpos(block->page, row_pos.rownr),
                      (ulong) block->page, row_pos.rownr));

  /* org_bitmap_value == 0 means the page was created fresh above */
  block_is_read= block->org_bitmap_value != 0;

  memcpy(row_pos.data, row_part, org_length);

  if (share->now_transactional)
  {
    /* Log changes in tail block */
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
    LSN lsn;

    /*
      Log REDO changes of tail page
      Note that we have to log length, not org_length, to be sure that
      REDO, which doesn't use write_tail, also creates a block of at least
      MIN_TAIL_SIZE
     */
    page_store(log_data + FILEID_STORE_SIZE, block->page);
    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                 row_pos.rownr);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos.data;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
    if (translog_write_record(&lsn,
                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
                               LOGREC_REDO_NEW_ROW_TAIL),
                              info->trn, info,
                              (translog_size_t) (sizeof(log_data) + length),
                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
                              log_data, NULL))
      DBUG_RETURN(1);
  }

  /* Store the allocated (possibly padded) length in the directory entry */
  int2store(row_pos.dir + 2, length);
  empty_space= row_pos.empty_space - length;
  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
  /* Return the tail's directory position to the caller via page_count */
  block->page_count= row_pos.rownr + TAIL_BIT;
  /*
    If there is less directory entries free than number of possible tails
    we can write for a row, we mark the page full to ensure that we don't
    during _ma_bitmap_find_place() allocate more entries on the tail page
    than it can hold
  */
  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
                                           1 + share->base.blobs) ?
                       empty_space : 0);
  /* Keep BLOCKUSED_USE_ORG_BITMAP */
  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;

  if (block_is_read)
  {
    /* Current page link is last element in pinned_pages */
    /* NOTE: this pointer intentionally shadows the outer page_link struct */
    MARIA_PINNED_PAGE *page_link;
    page_link= dynamic_element(&info->pinned_pages,
                               info->pinned_pages.elements-1,
                               MARIA_PINNED_PAGE*);
    /* Downgrade the write lock taken during read to a read lock */
    pagecache_unlock_by_link(share->pagecache, page_link->link,
                             PAGECACHE_LOCK_WRITE_TO_READ,
                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 1, FALSE);
    DBUG_ASSERT(page_link->changed);
    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
    res= 0;
  }
  else
  {
    /* New page: write it out through the page cache and pin it */
    if (!(res= pagecache_write(share->pagecache,
                               &info->dfile, block->page, 0,
                               row_pos.buff,share->page_type,
                               PAGECACHE_LOCK_READ,
                               PAGECACHE_PIN,
                               PAGECACHE_WRITE_DELAY, &page_link.link,
                               LSN_IMPOSSIBLE)))
    {
      DBUG_ASSERT(page_link.link);
      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
      page_link.changed= 1;
      push_dynamic(&info->pinned_pages, (void*) &page_link);
    }

    /* Increase data file size, if extended */
    position= (my_off_t) block->page * block_size;
    if (share->state.state.data_file_length <= position)
    {
      /*
        We are modifying a state member before writing the UNDO; this is a WAL
        violation. But for data_file_length this is ok, as long as we change
        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
        collect_tables()).
      */
      _ma_set_share_data_file_length(share, position + block_size);
    }
  }
  DBUG_RETURN(res);
}
2088 
2089 
2090 /*
2091   Write full pages
2092 
2093   SYNOPSIS
2094     write_full_pages()
2095     info                Maria handler
2096     lsn			LSN for the undo record
2097     block               Where to write data
2098     data                Data to write
2099     length              Length of data
2100 
2101   NOTES
2102     Logging of the changes to the full pages are done in the caller
2103     write_block_record().
2104 
2105   RETURN
2106     0  ok
2107     1  error on write
2108 */
2109 
static my_bool write_full_pages(MARIA_HA *info,
                                LSN lsn,
                                MARIA_BITMAP_BLOCK *block,
                                uchar *data, ulong length)
{
  pgcache_page_no_t page;
  MARIA_SHARE *share= info->s;
  uint block_size= share->block_size;
  uint data_size= FULL_PAGE_SIZE(share);    /* Payload bytes per full page */
  uchar *buff= info->keyread_buff;
  uint page_count, sub_blocks;
  my_off_t position, max_position;
  DBUG_ENTER("write_full_pages");
  DBUG_PRINT("enter", ("length: %lu  page: %lu  page_count: %lu",
                       (ulong) length, (ulong) block->page,
                       (ulong) block->page_count));
  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);

  /* info->keyread_buff is reused below as the page buffer */
  info->keyread_buff_used= 1;
  page=       block->page;
  page_count= block->page_count;
  sub_blocks= block->sub_blocks;

  /* End position of the first extent; used to extend the file below */
  max_position= (my_off_t) (page + page_count) * block_size;

  /* Increase data file size, if extended */

  for (; length; data+= data_size)
  {
    uint copy_length;
    if (!page_count--)
    {
      /* Current extent exhausted; advance to the next extent block */
      if (!--sub_blocks)
      {
        /* More data left than pages allocated: broken extent info */
        _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
        DBUG_RETURN(1);
      }

      block++;
      page= block->page;
      page_count= block->page_count - 1;
      DBUG_PRINT("info", ("page: %lu  page_count: %lu",
                          (ulong) block->page, (ulong) block->page_count));

      position= (page + page_count + 1) * block_size;
      set_if_bigger(max_position, position);
    }
    /* Build the page image: LSN + page type + zeroed rest of header */
    lsn_store(buff, lsn);
    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
    bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
          FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));
    copy_length= MY_MIN(data_size, length);
    memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, copy_length);
    length-= copy_length;

    /*
      Zero out old information from the block. This removes possible
      sensitive information from the block and also makes the file
      easier to compress and easier to compare after recovery.
    */
    if (copy_length != data_size)
      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
            (data_size - copy_length) + PAGE_SUFFIX_SIZE);

    if (pagecache_write(share->pagecache,
                        &info->dfile, page, 0,
                        buff, share->page_type,
                        PAGECACHE_LOCK_LEFT_UNLOCKED,
                        PAGECACHE_PIN_LEFT_UNPINNED,
                        PAGECACHE_WRITE_DELAY,
                        0, info->trn->rec_lsn))
      DBUG_RETURN(1);
    page++;
    DBUG_ASSERT(block->used & BLOCKUSED_USED);
  }
  /* Increase data file size if we wrote past the current end of file */
  if (share->state.state.data_file_length < max_position)
    _ma_set_share_data_file_length(share, max_position);
  DBUG_RETURN(0);
}
2189 
2190 
2191 /*
2192   Store ranges of full pages in compact format for logging
2193 
2194   SYNOPSIS
2195     store_page_range()
2196     to		Store data here
2197     block       Where pages are to be written
2198     length	Length of data to be written
2199 		Normally this is full pages, except for the last
2200                 tail block that may only partly fit the last page.
2201     tot_ranges  Add here the number of ranges used
2202 
2203   NOTES
2204     The format of one entry is:
2205 
2206      Ranges				 SUB_RANGE_SIZE
2207      Empty bytes at end of last byte     BLOCK_FILLER_SIZE
2208      For each range
2209        Page number                       PAGE_STORE_SIZE
2210        Number of pages			 PAGERANGE_STORE_SIZE
2211 
2212   RETURN
2213     #  end position for 'to'
2214 */
2215 
store_page_range(MARIA_SHARE * share,uchar * to,MARIA_BITMAP_BLOCK * block,ulong length,uint * tot_ranges)2216 static uchar *store_page_range(MARIA_SHARE *share,
2217                                uchar *to, MARIA_BITMAP_BLOCK *block,
2218                                ulong length,
2219                                uint *tot_ranges)
2220 {
2221   uint data_size= FULL_PAGE_SIZE(share);
2222   ulong pages_left= (length + data_size -1) / data_size;
2223   uint page_count, ranges, empty_space;
2224   uchar *to_start;
2225   DBUG_ENTER("store_page_range");
2226 
2227   to_start= to;
2228   to+= SUB_RANGE_SIZE;
2229 
2230   /* Store number of unused bytes at last page */
2231   empty_space= (uint) (pages_left * data_size - length);
2232   int2store(to, empty_space);
2233   to+= BLOCK_FILLER_SIZE;
2234 
2235   ranges= 0;
2236   do
2237   {
2238     pgcache_page_no_t page;
2239     page=       block->page;
2240     page_count= block->page_count;
2241     block++;
2242     if (page_count > pages_left)
2243       page_count= pages_left;
2244 
2245     page_store(to, page);
2246     to+= PAGE_STORE_SIZE;
2247     pagerange_store(to, page_count);
2248     to+= PAGERANGE_STORE_SIZE;
2249     ranges++;
2250   } while ((pages_left-= page_count));
2251   /* Store number of ranges for this block */
2252   int2store(to_start, ranges);
2253   (*tot_ranges)+= ranges;
2254 
2255   DBUG_RETURN(to);
2256 }
2257 
2258 
2259 /*
2260   Store packed extent data
2261 
2262   SYNOPSIS
2263    store_extent_info()
2264    to				Store first packed data here
2265    row_extents_second_part	Store rest here
2266    first_block		        First block to store
2267    count			Number of blocks
2268 
2269   NOTES
2270     We don't have to store the position for the head block
2271 
2272     We have to set the START_EXTENT_BIT for every extent where the
    blob will be stored on a page of its own. We need this in the
2274     UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
2275     undo-update.
2276 */
2277 
static void store_extent_info(uchar *to,
                              uchar *row_extents_second_part,
                              MARIA_BITMAP_BLOCK *first_block,
                              uint count)
{
  MARIA_BITMAP_BLOCK *block;
  uint remaining;
  my_bool stored_first= 0;
  DBUG_ENTER("store_extent_info");
  DBUG_PRINT("enter", ("count: %u", count));

  for (block= first_block, remaining= count ;
       remaining != 0 ;
       block++, remaining--)
  {
    uint page_count;
    /* Marker (unused) blocks don't have BLOCKUSED_USED set; skip them */
    if (!(block->used & BLOCKUSED_USED))
      continue;

    page_count= block->page_count;
    DBUG_ASSERT(page_count != 0);
    page_store(to, block->page);
    if (block->sub_blocks)
    {
      /*
        Set a bit so that we later know that this was the first block
        for a blob
      */
      page_count|= START_EXTENT_BIT;
    }
    pagerange_store(to + PAGE_STORE_SIZE, page_count);
    DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
    to+= ROW_EXTENT_SIZE;
    if (!stored_first)
    {
      /* The first extent goes in the first part; the rest follow in
         the second part */
      stored_first= 1;
      to= row_extents_second_part;
    }
  }
  /*
    In some unlikely cases we have allocated too many blocks. Clear this
    data.
  */
  bzero(to, (size_t) (row_extents_second_part +
                      (count - 1) * ROW_EXTENT_SIZE - to));
  DBUG_VOID_RETURN;
}
2324 
2325 
2326 /**
2327    @brief
2328    Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
2329    for write_block_record
2330 
2331    @note
2332    In case of blobs, this function marks all the blob pages in the bitmap
2333    as full pages. The bitmap bits for other pages will be marked
2334    when write_block_record() calls _ma_bitmap_release_unused().
2335 
   This function will be removed in Maria 2.0 when we, instead of deleting
   rows, mark them as deleted and only remove them after commit.
2338 
2339    @return
2340    @retval 0  ok
2341    @retval 1  Error (out of memory or disk error changing bitmap) or
2342               wrong information in extent information
2343 */
2344 
static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
                                       MARIA_BITMAP_BLOCKS *blocks,
                                       pgcache_page_no_t head_page,
                                       uint extent_count,
                                       const uchar *extent_info)
{
  MARIA_BITMAP_BLOCK *block, *start_block;
  MARIA_SHARE *share= info->s;
  uint i, tail_page;
  DBUG_ENTER("extent_to_bitmap_blocks");

  /* One element for the head page, plus one spare, beyond the extents */
  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
    DBUG_RETURN(1);
  block= blocks->block=  dynamic_element(&info->bitmap_blocks, 0,
                                        MARIA_BITMAP_BLOCK*);
  blocks->count= extent_count + 1;
  blocks->tail_page_skipped= blocks->page_skipped= 0;
  /* First block describes the head page itself */
  block->page= head_page;
  block->page_count= 1;
  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
  /* Impossible value, will force storage of real value */
  block->org_bitmap_value= 255;

  start_block= block++;
  for (i=0 ;
       i++ < extent_count ;
       block++, extent_info+= ROW_EXTENT_SIZE)
  {
    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
    if (page_count & START_EXTENT_BIT)
    {
      /* This extent starts a blob; close the previous group of blocks */
      page_count&= ~START_EXTENT_BIT;
      start_block->sub_blocks= (uint) (block - start_block);
      start_block= block;
    }
    block->page= page_korr(extent_info);
    block->page_count= page_count;
    block->sub_blocks= 0;
    if (block->page_count == 0)
    {
      /* Extend allocated but not used by write_block_record() */
      DBUG_ASSERT(block->page == 0);
      /* This is the last block */
      blocks->count= i;
      break;
    }
    /* A tail extent covers exactly one page regardless of stored count */
    if ((tail_page= page_count & TAIL_BIT))
      page_count= 1;

    /* Check if wrong data */
    if (block->page == 0 || page_count == 0 ||
        (block->page + page_count) * share->block_size >
         share->state.state.data_file_length)
    {
      DBUG_PRINT("error", ("page: %lu  page_count: %u  tail: %u  length: %ld  data_length: %ld",
                           (ulong) block->page,
                           (block->page_count & ~TAIL_BIT),
                           (uint) MY_TEST(block->page_count & TAIL_BIT),
                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
                                    share->block_size),
                           (ulong) share->state.state.data_file_length));
      DBUG_RETURN(1);
    }
    if (tail_page)
    {
      /* Keep the tail page's current bitmap bits for later restore */
      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
                                                        block->page);
      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
                    BLOCKUSED_USE_ORG_BITMAP);
    }
    else
    {
      /* Full pages: mark them as fully used in the bitmap right away */
      my_bool res;
      mysql_mutex_lock(&share->bitmap.bitmap_lock);
      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
                                         block->page, page_count);
      mysql_mutex_unlock(&share->bitmap.bitmap_lock);
      if (res)
        DBUG_RETURN(1);
      block->used= BLOCKUSED_USED;
    }
  }
  /* Close the last group of blocks */
  start_block->sub_blocks= (uint) (block - start_block);
  DBUG_RETURN(0);
}
2430 
2431 
2432 /*
2433   Free regions of pages with logging
2434 
2435   NOTES
2436     We are removing filler events and tail page events from
2437     row->extents to get smaller log.
2438 
2439   RETURN
2440     0   ok
2441     1   error
2442 */
2443 
static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
{
  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
  LSN lsn;
  size_t extents_length;
  uchar *extents= row->extents;
  DBUG_ENTER("free_full_pages");

  if (info->s->now_transactional)
  {
    /*
      Compact the extent list before logging: tail-page extents and filler
      (page_count == 0) extents are dropped so the REDO_FREE_BLOCKS record
      only describes full pages and stays as small as possible.
    */
    uchar *new_block= 0;              /* Start of current run of kept extents */
    uchar *end, *to, *compact_extent_info;
    my_bool res;
    uint extents_count;

    /* Worst case: every extent is kept */
    if (!(compact_extent_info= my_alloca(row->extents_count *
                                         ROW_EXTENT_SIZE)))
      DBUG_RETURN(1);

    to= compact_extent_info;
    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
         extents < end ;
         extents+= ROW_EXTENT_SIZE)
    {
      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
      page_count&= ~START_EXTENT_BIT;
      if (! (page_count & TAIL_BIT) && page_count != 0)
      {
        /* Found correct extent; extend the current run of kept extents */
        if (!new_block)
          new_block= extents;                   /* First extent in range */
        continue;
      }
      /* Found extent to remove (tail or filler); flush the pending run */
      if (new_block)
      {
        size_t length= (size_t) (extents - new_block);
        memcpy(to, new_block, length);
        to+= length;
        new_block= 0;
      }
    }
    /* Flush the final run, if the list ended with kept extents */
    if (new_block)
    {
      size_t length= (size_t) (extents - new_block);
      memcpy(to, new_block, length);
      to+= length;
    }

    if (!unlikely(extents_length= (uint) (to - compact_extent_info)))
    {
      /*
        No ranges. This happens in the rare case when we have allocated a
        place for a blob on a tail page but it did fit into the main page.
      */
      my_afree(compact_extent_info);
      DBUG_RETURN(0);
    }
    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
    /* Log record: [file id][extent count][compacted extent list] */
    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    compact_extent_info;
    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
                               info,
                               (translog_size_t) (sizeof(log_data) +
                                                  extents_length),
                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
                               log_data, NULL);
    my_afree(compact_extent_info);
    if (res)
      DBUG_RETURN(1);
  }

  /*
    Free the pages in the bitmap; note this uses the original (uncompacted)
    extent list from the row.
  */
  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
                                         row->extents_count));
}
2524 
2525 
2526 /*
2527   Free one page range
2528 
2529   NOTES
2530     This is very similar to free_full_pages()
2531 
2532   RETURN
2533     0   ok
2534     1   error
2535 */
2536 
free_full_page_range(MARIA_HA * info,pgcache_page_no_t page,uint count)2537 static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
2538                                     uint count)
2539 {
2540   my_bool res= 0;
2541   uint delete_count;
2542   MARIA_SHARE *share= info->s;
2543   DBUG_ENTER("free_full_page_range");
2544 
2545   delete_count= count;
2546   if (share->state.state.data_file_length ==
2547       (page + count) * share->block_size)
2548   {
2549     /*
2550       Don't delete last page from pagecache as this will make the file
2551       shorter than expected if the last operation extended the file
2552     */
2553     delete_count--;
2554   }
2555   if (delete_count &&
2556       pagecache_delete_pages(share->pagecache, &info->dfile,
2557                              page, delete_count, PAGECACHE_LOCK_WRITE, 1))
2558     res= 1;
2559 
2560   if (share->now_transactional)
2561   {
2562     LSN lsn;
2563     /** @todo unify log_data's shape with delete_head_or_tail() */
2564     uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2565                    ROW_EXTENT_SIZE];
2566     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
2567     DBUG_ASSERT(info->trn->rec_lsn);
2568     pagerange_store(log_data + FILEID_STORE_SIZE, 1);
2569     page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
2570               page);
2571     int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
2572               PAGE_STORE_SIZE, count);
2573     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
2574     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
2575 
2576     if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
2577                               info->trn, info,
2578                               (translog_size_t) sizeof(log_data),
2579                               TRANSLOG_INTERNAL_PARTS + 1, log_array,
2580                               log_data, NULL))
2581       res= 1;
2582   }
2583   mysql_mutex_lock(&share->bitmap.bitmap_lock);
2584   if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
2585     res= 1;
2586   mysql_mutex_unlock(&share->bitmap.bitmap_lock);
2587   DBUG_RETURN(res);
2588 }
2589 
2590 
2591 /**
2592    @brief Write a record to a (set of) pages
2593 
2594    @fn     write_block_record()
2595    @param  info            Maria handler
2596    @param  old_record      Original record in case of update; NULL in case of
2597                            insert
2598    @param  record          Record we should write
2599    @param  row             Statistics about record (calculated by
2600                            calc_record_size())
   @param  bitmap_blocks   On which pages the record should be stored
2602    @param  row_pos         Position on head page where to put head part of
2603                            record
2604    @param  undo_lsn	   <> LSN_ERROR if we are executing an UNDO
2605    @param  old_record_checksum Checksum of old_record: ignored if table does
2606                                not have live checksum; otherwise if
2607                                old_record==NULL it must be 0.
2608 
2609    @note
2610      On return all pinned pages are released.
2611 
2612      [page_buff + EMPTY_SPACE_OFFSET] is set to
2613      row_pos->empty_space - head_length
2614 
2615    @return Operation status
2616    @retval 0      OK
2617    @retval 1      Error
2618 */
2619 
write_block_record(MARIA_HA * info,const uchar * old_record,const uchar * record,MARIA_ROW * row,MARIA_BITMAP_BLOCKS * bitmap_blocks,my_bool head_block_is_read,struct st_row_pos_info * row_pos,LSN undo_lsn,ha_checksum old_record_checksum)2620 static my_bool write_block_record(MARIA_HA *info,
2621                                   const uchar *old_record,
2622                                   const uchar *record,
2623                                   MARIA_ROW *row,
2624                                   MARIA_BITMAP_BLOCKS *bitmap_blocks,
2625                                   my_bool head_block_is_read,
2626                                   struct st_row_pos_info *row_pos,
2627                                   LSN undo_lsn,
2628                                   ha_checksum old_record_checksum)
2629 {
2630   uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
2631   uchar *UNINIT_VAR(row_extents_first_part), *UNINIT_VAR(row_extents_second_part);
2632   uchar *field_length_data;
2633   uchar *page_buff;
2634   MARIA_BITMAP_BLOCK *block, *head_block;
2635   MARIA_SHARE *share= info->s;
2636   MARIA_COLUMNDEF *column, *end_column;
2637   MARIA_PINNED_PAGE page_link;
2638   uint block_size, flag, head_length;
2639   ulong *blob_lengths;
2640   my_bool row_extents_in_use, blob_full_pages_exists;
2641   LSN lsn;
2642   my_off_t position;
2643   uint save_my_errno;
2644   DBUG_ENTER("write_block_record");
2645 
2646   head_block= bitmap_blocks->block;
2647   block_size= share->block_size;
2648 
2649   page_buff= row_pos->buff;
2650   /* Position on head page where we should store the head part */
2651   data= row_pos->data;
2652   end_of_data= data + row_pos->length;
2653 
2654   /* Write header */
2655   flag= info->row_flag;
2656   row_extents_in_use= 0;
2657   if (unlikely(row->total_length > row_pos->length))
2658   {
2659     /* Need extent */
2660     DBUG_ASSERT(bitmap_blocks->count > 1);
2661     if (bitmap_blocks->count <= 1)
2662       goto crashed;                             /* Wrong in bitmap */
2663     flag|= ROW_FLAG_EXTENTS;
2664     row_extents_in_use= 1;
2665   }
2666   /* For now we have only a minimum header */
2667   *data++= (uchar) flag;
2668   if (flag & ROW_FLAG_TRANSID)
2669   {
2670     transid_store(data, info->trn->trid);
2671     data+= TRANSID_SIZE;
2672   }
2673 
2674   if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
2675     *data++= (uchar) (share->base.null_bytes -
2676                       share->base.original_null_bytes);
2677   if (row_extents_in_use)
2678   {
2679     /* Store first extent in header */
2680     store_key_length_inc(data, bitmap_blocks->count - 1);
2681     row_extents_first_part= data;
2682     data+= ROW_EXTENT_SIZE;
2683   }
2684   if (share->base.max_field_lengths)
2685     store_key_length_inc(data, row->field_lengths_length);
2686   if (share->calc_checksum)
2687   {
2688     *(data++)= (uchar) (row->checksum); /* store least significant byte */
2689     DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
2690   }
2691   memcpy(data, record, share->base.null_bytes);
2692   data+= share->base.null_bytes;
2693   memcpy(data, row->empty_bits, share->base.pack_bytes);
2694   data+= share->base.pack_bytes;
2695 
2696   DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2697               (uint) (data - row_pos->data) == row->min_length);
2698 
2699   /*
2700     Allocate a buffer of rest of data (except blobs)
2701 
2702     To avoid double copying of data, we copy as many columns that fits into
2703     the page. The rest goes into info->packed_row.
2704 
2705     Using an extra buffer, instead of doing continuous writes to different
2706     pages, uses less code and we don't need to have to do a complex call
2707     for every data segment we want to store.
2708   */
2709   if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
2710                        row->head_length))
2711     DBUG_RETURN(1);
2712 
2713   tmp_data_used= 0;                 /* Either 0 or last used uchar in 'data' */
2714   tmp_data= data;
2715 
2716   if (row_extents_in_use)
2717   {
2718     uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
2719     if (!tmp_data_used && tmp_data + copy_length > end_of_data)
2720     {
2721       tmp_data_used= tmp_data;
2722       tmp_data= info->rec_buff;
2723     }
2724     row_extents_second_part= tmp_data;
2725     /*
2726        We will copy the extents here when we have figured out the tail
2727        positions.
2728     */
2729     tmp_data+= copy_length;
2730   }
2731 
2732   /* Copy fields that has fixed lengths (primary key etc) */
2733   for (column= share->columndef,
2734          end_column= column + share->base.fixed_not_null_fields;
2735        column < end_column; column++)
2736   {
2737     if (!tmp_data_used && tmp_data + column->length > end_of_data)
2738     {
2739       tmp_data_used= tmp_data;
2740       tmp_data= info->rec_buff;
2741     }
2742     memcpy(tmp_data, record + column->offset, column->length);
2743     tmp_data+= column->length;
2744   }
2745 
2746   /* Copy length of data for variable length fields */
2747   if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
2748   {
2749     tmp_data_used= tmp_data;
2750     tmp_data= info->rec_buff;
2751   }
2752   field_length_data= row->field_lengths;
2753   memcpy(tmp_data, field_length_data, row->field_lengths_length);
2754   tmp_data+= row->field_lengths_length;
2755 
2756   DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
2757               (uint) (tmp_data - row_pos->data) == row->min_length +
2758               share->base.fixed_not_null_fields_length +
2759               row->field_lengths_length);
2760 
2761   /* Copy variable length fields and fields with null/zero */
2762   for (end_column= share->columndef + share->base.fields - share->base.blobs;
2763        column < end_column ;
2764        column++)
2765   {
2766     const uchar *field_pos;
2767     ulong length;
2768     if ((record[column->null_pos] & column->null_bit) ||
2769         (row->empty_bits[column->empty_pos] & column->empty_bit))
2770       continue;
2771 
2772     field_pos= record + column->offset;
2773     switch (column->type) {
2774     case FIELD_NORMAL:                          /* Fixed length field */
2775     case FIELD_SKIP_PRESPACE:
2776     case FIELD_SKIP_ZERO:                       /* Fixed length field */
2777       length= column->length;
2778       break;
2779     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
2780       /* Char that is space filled */
2781       if (column->length <= 255)
2782         length= (uint) (uchar) *field_length_data++;
2783       else
2784       {
2785         length= uint2korr(field_length_data);
2786         field_length_data+= 2;
2787       }
2788       break;
2789     case FIELD_VARCHAR:
2790       if (column->length <= 256)
2791       {
2792         length= (uint) (uchar) *field_length_data++;
2793         field_pos++;                            /* Skip length uchar */
2794       }
2795       else
2796       {
2797         length= uint2korr(field_length_data);
2798         field_length_data+= 2;
2799         field_pos+= 2;
2800       }
2801       DBUG_ASSERT(length <= column->length);
2802       break;
2803     default:                                    /* Wrong data */
2804       DBUG_ASSERT(!maria_assert_if_crashed_table);
2805       length=0;
2806       break;
2807     }
2808     if (!tmp_data_used && tmp_data + length > end_of_data)
2809     {
2810       /* Data didn't fit in page; Change to use tmp buffer */
2811       tmp_data_used= tmp_data;
2812       tmp_data= info->rec_buff;
2813     }
2814     memcpy((char*) tmp_data, field_pos, length);
2815     tmp_data+= length;
2816   }
2817 
2818   block= head_block + head_block->sub_blocks;   /* Point to first blob data */
2819 
2820   end_column= column + share->base.blobs;
2821   blob_lengths= row->blob_lengths;
2822   if (!tmp_data_used)
2823   {
2824     /* Still room on page; Copy as many blobs we can into this page */
2825     data= tmp_data;
2826     for (; column < end_column &&
2827            *blob_lengths <= (ulong)(end_of_data - data);
2828          column++, blob_lengths++)
2829     {
2830       uchar *tmp_pos;
2831       uint length;
2832       if (!*blob_lengths)                       /* Null or "" */
2833         continue;
2834       length= column->length - portable_sizeof_char_ptr;
2835       memcpy(&tmp_pos, record + column->offset + length, sizeof(char*));
2836       memcpy(data, tmp_pos, *blob_lengths);
2837       data+= *blob_lengths;
2838       /*
2839         The following is not true when we want to insert data into original
2840         place. In this case we don't have any extra blocks allocated
2841       */
2842       if (likely(undo_lsn == LSN_ERROR))
2843       {
2844         /* Skip over tail page that was prepared for storing blob */
2845         block++;
2846         bitmap_blocks->tail_page_skipped= 1;
2847       }
2848     }
2849     if (head_block->sub_blocks > 1)
2850     {
2851       /* We have allocated pages that where not used */
2852       bitmap_blocks->page_skipped= 1;
2853     }
2854   }
2855   else
2856     data= tmp_data_used;                        /* Get last used on page */
2857 
2858   /* Update page directory */
2859   head_length= (uint) (data - row_pos->data);
2860   DBUG_PRINT("info", ("Used head length on page: %u  header_length: %u",
2861                       head_length,
2862                       (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
2863   if (head_length < share->base.min_block_length)
2864   {
2865     /* Extend row to be of size min_block_length */
2866     uint diff_length= share->base.min_block_length - head_length;
2867     bzero(data, diff_length);
2868     data+= diff_length;
2869     head_length= share->base.min_block_length;
2870   }
2871   DBUG_ASSERT(data <= end_of_data);
2872   /*
2873     If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
2874     written exactly head_length bytes (same as original record).
2875   */
2876   DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
2877   int2store(row_pos->dir + 2, head_length);
2878   /* update empty space at start of block */
2879   row_pos->empty_space-= head_length;
2880   int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
2881   /* Mark in bitmaps how the current page was actually used */
2882   head_block->empty_space= row_pos->empty_space;
2883   if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
2884       page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
2885     head_block->empty_space= 0;               /* Page is full */
2886   head_block->used|= BLOCKUSED_USED;
2887 
2888   check_directory(share,
2889                   page_buff, share->block_size, share->base.min_block_length,
2890                   (uint) -1);
2891 
2892   /*
2893      Now we have to write tail pages, as we need to store the position
2894      to them in the row extent header.
2895 
2896      We first write out all blob tails, to be able to store them in
2897      the current page or 'tmp_data'.
2898 
2899      Then we write the tail of the non-blob fields (The position to the
2900      tail page is stored either in row header, the extents in the head
2901      page or in the first full page of the non-blob data. It's never in
2902      the tail page of the non-blob data)
2903   */
2904 
2905   blob_full_pages_exists= 0;
2906   if (row_extents_in_use)
2907   {
2908     if (column != end_column)                   /* If blob fields */
2909     {
2910       MARIA_COLUMNDEF    *save_column=       column;
2911       MARIA_BITMAP_BLOCK *save_block=        block;
2912       MARIA_BITMAP_BLOCK *end_block;
2913       ulong              *save_blob_lengths= blob_lengths;
2914 
2915       for (; column < end_column; column++, blob_lengths++)
2916       {
2917         uchar *blob_pos;
2918         if (!*blob_lengths)                     /* Null or "" */
2919           continue;
2920         if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
2921         {
2922           uint length;
2923           length= column->length - portable_sizeof_char_ptr;
2924           memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
2925           length= *blob_lengths % FULL_PAGE_SIZE(share);   /* tail size */
2926           if (length != *blob_lengths)
2927             blob_full_pages_exists= 1;
2928           if (write_tail(info, block + block->sub_blocks-1,
2929                          blob_pos + *blob_lengths - length,
2930                          length))
2931             goto disk_err;
2932         }
2933         else
2934           blob_full_pages_exists= 1;
2935 
2936         for (end_block= block + block->sub_blocks; block < end_block; block++)
2937         {
2938           /*
2939             Set only a bit, to not cause bitmap code to believe a block is full
2940             when there is still a lot of entries in it.
2941           */
2942           block->used|= BLOCKUSED_USED;
2943         }
2944       }
2945       DBUG_ASSERT((undo_lsn == LSN_ERROR ||
2946                    block == bitmap_blocks->block + bitmap_blocks->count));
2947       column= save_column;
2948       block= save_block;
2949       blob_lengths= save_blob_lengths;
2950     }
2951 
2952     if (tmp_data_used)                          /* non blob data overflows */
2953     {
2954       MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
2955       MARIA_BITMAP_BLOCK *head_tail_block= 0;
2956       ulong length;
2957       ulong data_length= (ulong) (tmp_data - info->rec_buff);
2958 
2959 #ifdef SANITY_CHECKS
2960       DBUG_ASSERT(head_block->sub_blocks != 1);
2961       if (head_block->sub_blocks == 1)
2962         goto crashed;                           /* no reserved full or tails */
2963 #endif
2964       /*
2965         Find out where to write tail for non-blob fields.
2966 
2967         Problem here is that the bitmap code may have allocated more
2968         space than we need. We have to handle the following cases:
2969 
2970         - Bitmap code allocated a tail page we don't need.
2971         - The last full page allocated needs to be changed to a tail page
2972         (Because we where able to put more data on the head page than
2973         the bitmap allocation assumed)
2974 
2975         The reserved pages in bitmap_blocks for the main page has one of
2976         the following allocations:
2977         - Full pages, with following blocks:
2978           # * full pages
2979           empty page  ; To be used if we change last full to tail page. This
2980           has 'count' = 0.
2981           tail page  (optional, if last full page was part full)
2982         - One tail page
2983       */
2984 
2985       cur_block= head_block + 1;
2986       end_block= head_block + head_block->sub_blocks;
2987       /*
2988         Loop until we have find a block bigger than we need or
2989         we find the empty page block.
2990       */
2991       while (data_length >= (length= (cur_block->page_count *
2992                                       FULL_PAGE_SIZE(share))) &&
2993              cur_block->page_count)
2994       {
2995 #ifdef SANITY_CHECKS
2996         DBUG_ASSERT(!((cur_block == end_block) ||
2997                       (cur_block->used & BLOCKUSED_USED)));
2998         if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
2999           goto crashed;
3000 #endif
3001         data_length-= length;
3002         (cur_block++)->used|= BLOCKUSED_USED;
3003       }
3004       last_head_block= cur_block;
3005       if (data_length)
3006       {
3007         if (cur_block->page_count == 0)
3008         {
3009           /* Skip empty filler block */
3010           cur_block++;
3011         }
3012 #ifdef SANITY_CHECKS
3013         DBUG_ASSERT(!(cur_block >= end_block));
3014         if ((cur_block >= end_block))
3015           goto crashed;
3016 #endif
3017         if (cur_block->used & BLOCKUSED_TAIL)
3018         {
3019           DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
3020           /* tail written to tail page */
3021           cur_block->used|= BLOCKUSED_USED;
3022           head_tail_block= cur_block;
3023         }
3024         else if (data_length > length - MAX_TAIL_SIZE(block_size))
3025         {
3026           /* tail written to full page */
3027           cur_block->used|= BLOCKUSED_USED;
3028           if ((cur_block != end_block - 1) &&
3029               (end_block[-1].used & BLOCKUSED_TAIL))
3030             bitmap_blocks->tail_page_skipped= 1;
3031         }
3032         else
3033         {
3034           /*
3035             cur_block is a full block, followed by an empty and optional
3036             tail block. Change cur_block to a tail block or split it
3037             into full blocks and tail blocks.
3038 
3039             TODO:
3040              If there is enough space on the following tail block, use
3041              this instead of creating a new tail block.
3042           */
3043           DBUG_ASSERT(cur_block[1].page_count == 0);
3044           if (cur_block->page_count == 1)
3045           {
3046             /* convert full block to tail block */
3047             cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
3048             head_tail_block= cur_block;
3049           }
3050           else
3051           {
3052             DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(share));
3053             DBUG_PRINT("info", ("Splitting blocks into full and tail"));
3054             cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
3055             cur_block[1].page_count= 1;         /* Avoid DBUG_ASSERT */
3056             cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
3057             cur_block->page_count--;
3058             cur_block->used|= BLOCKUSED_USED;
3059             last_head_block= head_tail_block= cur_block+1;
3060           }
3061           if (end_block[-1].used & BLOCKUSED_TAIL)
3062             bitmap_blocks->tail_page_skipped= 1;
3063         }
3064       }
3065       else
3066       {
3067         /* Must be an empty or tail page */
3068         DBUG_ASSERT(cur_block->page_count == 0 ||
3069                     cur_block->used & BLOCKUSED_TAIL);
3070         if (end_block[-1].used & BLOCKUSED_TAIL)
3071           bitmap_blocks->tail_page_skipped= 1;
3072       }
3073 
3074       /*
3075         Write all extents into page or tmp_data
3076 
3077         Note that we still don't have a correct position for the tail
3078         of the non-blob fields.
3079       */
3080       store_extent_info(row_extents_first_part,
3081                         row_extents_second_part,
3082                         head_block+1, bitmap_blocks->count - 1);
3083       if (head_tail_block)
3084       {
3085         ulong block_length= (ulong) (tmp_data - info->rec_buff);
3086         uchar *extent_data;
3087 
3088         length= (uint) (block_length % FULL_PAGE_SIZE(share));
3089         if (write_tail(info, head_tail_block,
3090                        info->rec_buff + block_length - length,
3091                        length))
3092           goto disk_err;
3093         tmp_data-= length;                      /* Remove the tail */
3094         if (tmp_data == info->rec_buff)
3095         {
3096           /* We have no full blocks to write for the head part */
3097           tmp_data_used= 0;
3098         }
3099 
3100         /* Store the tail position for the non-blob fields */
3101         if (head_tail_block == head_block + 1)
3102         {
3103           /*
3104             We had a head block + tail block, which means that the
3105             tail block is the first extent
3106           */
3107           extent_data= row_extents_first_part;
3108         }
3109         else
3110         {
3111           /*
3112             We have a head block + some full blocks + tail block
3113             last_head_block is pointing after the last used extent
3114             for the head block.
3115           */
3116           extent_data= row_extents_second_part +
3117             ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
3118         }
3119         /* Write information for tail block in the reserved space */
3120         page_store(extent_data, head_tail_block->page);
3121         pagerange_store(extent_data + PAGE_STORE_SIZE,
3122                         head_tail_block->page_count);
3123       }
3124     }
3125     else
3126       store_extent_info(row_extents_first_part,
3127                         row_extents_second_part,
3128                         head_block+1, bitmap_blocks->count - 1);
3129   }
3130 
3131   if (share->now_transactional)
3132   {
3133     uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
3134     LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
3135 
3136     /* Log REDO changes of head page */
3137     page_store(log_data + FILEID_STORE_SIZE, head_block->page);
3138     dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
3139                  row_pos->rownr);
3140     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3141     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
3142     log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos->data;
3143     log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
3144     if (translog_write_record(&lsn,
3145                               head_block_is_read ?
3146                               LOGREC_REDO_INSERT_ROW_HEAD :
3147                               LOGREC_REDO_NEW_ROW_HEAD,
3148                               info->trn,
3149                               info,
3150                               (translog_size_t) (sizeof(log_data) +
3151                                                  head_length),
3152                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
3153                               log_data, NULL))
3154       goto disk_err;
3155   }
3156 
3157 #ifdef RECOVERY_EXTRA_DEBUG
3158   if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
3159   {
3160     /* Stop right after the REDO; testing incomplete log record groups */
3161     DBUG_EXECUTE_IF("maria_flush_whole_log",
3162                     {
3163                       DBUG_PRINT("maria_flush_whole_log", ("now"));
3164                       translog_flush(translog_get_horizon());
3165                     });
3166     DBUG_EXECUTE_IF("maria_crash",
3167                     { DBUG_PRINT("maria_crash", ("now")); DBUG_SUICIDE(); });
3168   }
3169 #endif
3170 
3171   if (head_block_is_read)
3172   {
3173     MARIA_PINNED_PAGE *page_link;
3174     /* Head page is always the first pinned page */
3175     page_link= dynamic_element(&info->pinned_pages, 0,
3176                                MARIA_PINNED_PAGE*);
3177     pagecache_unlock_by_link(share->pagecache, page_link->link,
3178                              PAGECACHE_LOCK_WRITE_TO_READ,
3179                              PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
3180                              LSN_IMPOSSIBLE, 1, FALSE);
3181     page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
3182     page_link->changed= 1;
3183   }
3184   else
3185   {
3186     if (pagecache_write(share->pagecache,
3187                         &info->dfile, head_block->page, 0,
3188                         page_buff, share->page_type,
3189                         head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
3190                         PAGECACHE_LOCK_READ,
3191                         head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
3192                         PAGECACHE_PIN,
3193                         PAGECACHE_WRITE_DELAY, &page_link.link,
3194                         LSN_IMPOSSIBLE))
3195       goto disk_err;
3196     DBUG_ASSERT(page_link.link);
3197     page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
3198     page_link.changed= 1;
3199     push_dynamic(&info->pinned_pages, (void*) &page_link);
3200 
3201     /* Increase data file size, if extended */
3202     position= (my_off_t) head_block->page * block_size;
3203     if (share->state.state.data_file_length <= position)
3204       _ma_set_share_data_file_length(share, position + block_size);
3205   }
3206 
3207   if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
3208   {
3209     /*
3210       Log REDO writes for all full pages (head part and all blobs)
3211       We write all here to be able to generate the UNDO record early
3212       so that we can write the LSN for the UNDO record to all full pages.
3213     */
3214     uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3215                        (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
3216                        ROW_EXTENTS_ON_STACK];
3217     uchar *log_data, *log_pos;
3218     LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
3219                                ROW_EXTENTS_ON_STACK];
3220     LEX_CUSTRING *log_array_pos, *log_array;
3221     int error;
3222     translog_size_t log_entry_length= 0;
3223     uint ext_length, extents= 0, sub_extents= 0;
3224 
3225     /* If few extents, then allocate things on stack to avoid a malloc call */
3226     if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
3227     {
3228       log_array= tmp_log_array;
3229       log_data= tmp_log_data;
3230     }
3231     else
3232     {
3233       if (!my_multi_malloc(MY_WME, &log_array,
3234                           (uint) ((bitmap_blocks->count +
3235                                    TRANSLOG_INTERNAL_PARTS + 2) *
3236                                   sizeof(*log_array)),
3237                           &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
3238                           bitmap_blocks->count * (ROW_EXTENT_SIZE +
3239                                                   BLOCK_FILLER_SIZE +
3240                                                   SUB_RANGE_SIZE),
3241                           NullS))
3242         goto disk_err;
3243     }
3244     log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
3245     log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
3246 
3247     if (tmp_data_used)
3248     {
3249       /* Full head page */
3250       translog_size_t block_length= (translog_size_t) (tmp_data -
3251                                                        info->rec_buff);
3252       log_pos= store_page_range(share,
3253                                 log_pos, head_block+1,
3254                                 (ulong) block_length, &extents);
3255       log_array_pos->str= info->rec_buff;
3256       log_array_pos->length= block_length;
3257       log_entry_length+= block_length;
3258       log_array_pos++;
3259       sub_extents++;
3260     }
3261     if (blob_full_pages_exists)
3262     {
3263       MARIA_COLUMNDEF *tmp_column= column;
3264       ulong *tmp_blob_lengths= blob_lengths;
3265       MARIA_BITMAP_BLOCK *tmp_block= block;
3266 
3267       /* Full blob pages */
3268       for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
3269       {
3270         ulong blob_length;
3271         uint length;
3272 
3273         if (!*tmp_blob_lengths)                 /* Null or "" */
3274           continue;
3275         blob_length= *tmp_blob_lengths;
3276         length= tmp_column->length - portable_sizeof_char_ptr;
3277         /*
          If last part of blob was on tail page, change blob_length to
3279           reflect this
3280         */
3281         if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3282           blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3283         if (blob_length)
3284         {
3285           memcpy((void*) &log_array_pos->str,
3286                  record + tmp_column->offset + length,
3287                  sizeof(uchar*));
3288           log_array_pos->length= blob_length;
3289           log_entry_length+= blob_length;
3290           log_array_pos++;
3291           sub_extents++;
3292 
3293           log_pos= store_page_range(share,
3294                                     log_pos, tmp_block,
3295                                     blob_length, &extents);
3296         }
3297         tmp_block+= tmp_block->sub_blocks;
3298       }
3299     }
3300 
3301     log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3302     ext_length=  (uint) (log_pos - log_data);
3303     log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
3304     pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
3305     pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
3306                     sub_extents);
3307 
3308     log_entry_length+= ext_length;
3309     /* trn->rec_lsn is already set earlier in this function */
3310     error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
3311                                  info->trn, info, log_entry_length,
3312                                  (uint) (log_array_pos - log_array),
3313                                  log_array, log_data, NULL);
3314     if (log_array != tmp_log_array)
3315       my_free(log_array);
3316     if (error)
3317       goto disk_err;
3318   }
3319 
3320   /* Write UNDO or CLR record */
3321   lsn= LSN_IMPOSSIBLE;
3322   if (share->now_transactional)
3323   {
3324     LEX_CUSTRING *log_array= info->log_row_parts;
3325 
3326     if (undo_lsn != LSN_ERROR)
3327     {
3328       /*
3329         Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
3330         in the first case, Recovery, when it sees the CLR_END in the
3331         REDO phase, may decrement the records' count.
3332       */
3333       if (_ma_write_clr(info, undo_lsn,
3334                         old_record ? LOGREC_UNDO_ROW_UPDATE :
3335                         LOGREC_UNDO_ROW_DELETE,
3336                         share->calc_checksum != 0,
3337                         row->checksum - old_record_checksum,
3338                         &lsn, (void*) 0))
3339         goto disk_err;
3340     }
3341     else
3342     {
3343       uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
3344                      PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
3345                      HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
3346                      ROW_EXTENT_SIZE];
3347       uchar *log_pos;
3348       ha_checksum checksum_delta;
3349 
3350       /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
3351       lsn_store(log_data, info->trn->undo_lsn);
3352       page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
3353                  head_block->page);
3354       dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3355                    PAGE_STORE_SIZE,
3356                    row_pos->rownr);
3357       log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
3358                 PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
3359       store_checksum_in_rec(share, checksum_delta,
3360                             row->checksum - old_record_checksum,
3361                             log_pos, log_pos);
3362       compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
3363 
3364       log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
3365       log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
3366                                                              log_data);
3367 
3368       if (!old_record)
3369       {
3370         /* Store undo_lsn in case we are aborting the insert */
3371         row->orig_undo_lsn= info->trn->undo_lsn;
3372         /* Write UNDO log record for the INSERT */
3373         if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
3374                                   info->trn, info,
3375                                   (translog_size_t)
3376                                   log_array[TRANSLOG_INTERNAL_PARTS +
3377                                             0].length,
3378                                   TRANSLOG_INTERNAL_PARTS + 1,
3379                                   log_array,
3380                                   log_data + LSN_STORE_SIZE, &checksum_delta))
3381           goto disk_err;
3382       }
3383       else
3384       {
3385         /* Write UNDO log record for the UPDATE */
3386         size_t row_length, extents_length;
3387         uint row_parts_count, cur_head_length;
3388 
3389         /*
3390           Write head length and extents of the original row so that we
3391           during UNDO can put it back in the original position.
3392           We don't store size for TRANSID, as we don't write this during
3393           UNDO.
3394         */
3395         cur_head_length= (info->cur_row.head_length -
3396                           info->cur_row.header_length);
3397         int2store(log_pos, cur_head_length);
3398         pagerange_store(log_pos + 2, info->cur_row.extents_count);
3399         log_pos+= 2 + PAGERANGE_STORE_SIZE;
3400         log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
3401                                                          PAGERANGE_STORE_SIZE);
3402         info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
3403           info->cur_row.extents;
3404         info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
3405           extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
3406 
3407         row_length= fill_update_undo_parts(info, old_record, record,
3408                                            log_array +
3409                                            TRANSLOG_INTERNAL_PARTS + 2,
3410                                            &row_parts_count);
3411         if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
3412                                   info,
3413                                   (translog_size_t)
3414                                   (log_array[TRANSLOG_INTERNAL_PARTS +
3415                                              0].length + extents_length +
3416                                    row_length),
3417                                   TRANSLOG_INTERNAL_PARTS + 2 +
3418                                   row_parts_count,
3419                                   log_array,
3420                                   log_data + LSN_STORE_SIZE,
3421                                   &checksum_delta))
3422           goto disk_err;
3423       }
3424     }
3425   }
3426   /* Release not used space in used pages */
3427   if (_ma_bitmap_release_unused(info, bitmap_blocks))
3428     goto disk_err;
3429   _ma_unpin_all_pages(info, lsn);
3430 
3431   if (tmp_data_used)
3432   {
3433     /*
3434       Write data stored in info->rec_buff to pages
3435       This is the char/varchar data that didn't fit into the head page.
3436     */
3437     DBUG_ASSERT(bitmap_blocks->count != 0);
3438     if (write_full_pages(info, lsn, head_block + 1,
3439                          info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
3440       goto disk_err;
3441   }
3442 
3443   /* Write rest of blobs (data, but no tails as they are already written) */
3444   for (; column < end_column; column++, blob_lengths++)
3445   {
3446     uchar *blob_pos;
3447     uint length;
3448     ulong blob_length;
3449     if (!*blob_lengths)                         /* Null or "" */
3450       continue;
3451     length= column->length - portable_sizeof_char_ptr;
3452     memcpy(&blob_pos, record + column->offset + length, sizeof(char*));
3453     /* remove tail part */
3454     blob_length= *blob_lengths;
3455     if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
3456       blob_length-= (blob_length % FULL_PAGE_SIZE(share));
3457 
3458     if (blob_length && write_full_pages(info, lsn, block,
3459                                          blob_pos, blob_length))
3460       goto disk_err;
3461     block+= block->sub_blocks;
3462   }
3463 
3464   _ma_finalize_row(info);
3465   DBUG_RETURN(0);
3466 
3467 crashed:
3468   DBUG_ASSERT(!maria_assert_if_crashed_table);
3469   /* Something was wrong with data on page */
3470   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3471 
3472 disk_err:
3473   /**
3474      @todo RECOVERY we are going to let dirty pages go to disk while we have
3475      logged UNDO, this violates WAL. We must mark the table corrupted!
3476 
3477      @todo RECOVERY we have written some REDOs without a closing UNDO,
3478      it's possible that a next operation by this transaction succeeds and then
3479      Recovery would glue the "orphan REDOs" to the succeeded operation and
3480      execute the failed REDOs. We need some mark "abort this group" in the
3481      log, or mark the table corrupted (then user will repair it and thus REDOs
3482      will be skipped).
3483 
     @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
     should take a MARIA_HA* in argument, and if it
3486      fails when flushing a page to disk it should call
3487      (*the_maria_ha->write_error_func)(the_maria_ha)
3488      and this hook will mark the table corrupted.
3489      Maybe hook should be stored in the pagecache's block structure, or in a
3490      hash "file->maria_ha*".
3491 
3492      @todo RECOVERY we should distinguish below between log write error and
3493      table write error. The former should stop Maria immediately, the latter
3494      should mark the table corrupted.
3495   */
3496   /*
3497     Unpin all pinned pages to not cause problems for disk cache. This is
3498     safe to call even if we already called _ma_unpin_all_pages() above.
3499   */
3500   save_my_errno= my_errno;
3501   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3502   my_errno= save_my_errno;
3503   DBUG_RETURN(1);
3504 }
3505 
3506 
3507 /*
3508   @brief Write a record
3509 
3510   @fn    allocate_and_write_block_record()
3511   @param info                Maria handler
3512   @param record              Record to write
3513   @param row		     Information about fields in 'record'
3514   @param undo_lsn	     <> LSN_ERROR if we are executing an UNDO
3515 
3516   @return
3517   @retval 0	ok
3518   @retval 1	Error
3519 */
3520 
allocate_and_write_block_record(MARIA_HA * info,const uchar * record,MARIA_ROW * row,LSN undo_lsn)3521 static my_bool allocate_and_write_block_record(MARIA_HA *info,
3522                                                const uchar *record,
3523                                                MARIA_ROW *row,
3524                                                LSN undo_lsn)
3525 {
3526   struct st_row_pos_info row_pos;
3527   MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
3528   int save_my_errno;
3529   DBUG_ENTER("allocate_and_write_block_record");
3530 
3531   _ma_bitmap_flushable(info, 1);
3532   if (_ma_bitmap_find_place(info, row, blocks))
3533     goto err;                         /* Error reading bitmap */
3534 
3535   /*
3536     Sleep; a checkpoint will happen and should not send this over-allocated
3537     bitmap to disk but rather wait.
3538   */
3539   DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););
3540 
3541   /* page will be pinned & locked by get_head_or_tail_page */
3542   if (get_head_or_tail_page(info, blocks->block, info->buff,
3543                             MY_MAX(row->space_on_head_page,
3544                                 info->s->base.min_block_length),
3545                             HEAD_PAGE,
3546                             PAGECACHE_LOCK_WRITE, &row_pos))
3547     goto err;
3548   row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
3549   if (info->s->calc_checksum)
3550   {
3551     if (undo_lsn == LSN_ERROR)
3552       row->checksum= (info->s->calc_checksum)(info, record);
3553     else
3554     {
3555       /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
3556       DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
3557     }
3558   }
3559   DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
3560                       (ulong) ma_recordpos_to_page(row->lastpos),
3561                       ma_recordpos_to_dir_entry(row->lastpos),
3562                       row_pos.length));
3563   if (write_block_record(info, (uchar*) 0, record, row,
3564                          blocks, blocks->block->org_bitmap_value != 0,
3565                          &row_pos, undo_lsn, 0))
3566     goto err;
3567   /* Now let checkpoint happen but don't commit */
3568   DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
3569   DBUG_RETURN(0);
3570 
3571 err:
3572   save_my_errno= my_errno;
3573   if (info->non_flushable_state)
3574     _ma_bitmap_flushable(info, -1);
3575   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3576   my_errno= save_my_errno;
3577   DBUG_RETURN(1);
3578 }
3579 
3580 
3581 /*
3582   Write a record and return rowid for it
3583 
3584   SYNOPSIS
3585     _ma_write_init_block_record()
3586     info                Maria handler
3587     record              Record to write
3588 
3589   NOTES
3590     This is done BEFORE we write the keys to the row!
3591 
3592   RETURN
3593     HA_OFFSET_ERROR     Something went wrong
3594     #                   Rowid for row
3595 */
3596 
_ma_write_init_block_record(MARIA_HA * info,const uchar * record)3597 MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
3598                                              const uchar *record)
3599 {
3600   DBUG_ENTER("_ma_write_init_block_record");
3601 
3602   calc_record_size(info, record, &info->cur_row);
3603   if (allocate_and_write_block_record(info, record,
3604                                       &info->cur_row, LSN_ERROR))
3605     DBUG_RETURN(HA_OFFSET_ERROR);
3606   DBUG_RETURN(info->cur_row.lastpos);
3607 }
3608 
3609 
3610 /*
3611   Dummy function for (*info->s->write_record)()
3612 
3613   Nothing to do here, as we already wrote the record in
3614   _ma_write_init_block_record()
3615 */
3616 
_ma_write_block_record(MARIA_HA * info,const uchar * record)3617 my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
3618                                const uchar *record __attribute__ ((unused)))
3619 {
3620   return 0;                                     /* Row already written */
3621 }
3622 
3623 
3624 /**
3625    @brief Remove row written by _ma_write_block_record() and log undo
3626 
3627    @param  info            Maria handler
3628 
3629    @note
3630      This is called in case we got a duplicate unique key while
3631      writing keys.
3632 
3633    @return Operation status
3634      @retval 0      OK
3635      @retval 1      Error
3636 */
3637 
_ma_write_abort_block_record(MARIA_HA * info)3638 my_bool _ma_write_abort_block_record(MARIA_HA *info)
3639 {
3640   my_bool res= 0;
3641   MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3642   MARIA_BITMAP_BLOCK *block, *end;
3643   LSN lsn= LSN_IMPOSSIBLE;
3644   MARIA_SHARE *share= info->s;
3645   DBUG_ENTER("_ma_write_abort_block_record");
3646 
3647   _ma_bitmap_lock(share);  /* Lock bitmap from other insert threads */
3648   if (delete_head_or_tail(info,
3649                           ma_recordpos_to_page(info->cur_row.lastpos),
3650                           ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
3651                           0))
3652     res= 1;
3653   for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
3654        block++)
3655   {
3656     if (block->used & BLOCKUSED_USED)
3657     {
3658       if (block->used & BLOCKUSED_TAIL)
3659       {
3660         /*
3661           block->page_count is set to the tail directory entry number in
3662           write_block_record()
3663         */
3664         if (delete_head_or_tail(info, block->page,
3665                                 block->page_count & ~TAIL_BIT,
3666                                 0, 0))
3667           res= 1;
3668       }
3669       else
3670       {
3671         if (free_full_page_range(info, block->page, block->page_count))
3672           res= 1;
3673       }
3674     }
3675   }
3676   _ma_bitmap_unlock(share);
3677   if (share->now_transactional)
3678   {
3679     if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
3680                       LOGREC_UNDO_ROW_INSERT,
3681                       share->calc_checksum != 0,
3682                       (ha_checksum) 0 - info->cur_row.checksum,
3683                       &lsn, (void*) 0))
3684       res= 1;
3685   }
3686   _ma_unpin_all_pages_and_finalize_row(info, lsn);
3687   DBUG_RETURN(res);
3688 }
3689 
3690 
3691 /*
3692   Update a record
3693 
3694   NOTES
3695     For the moment, we assume that info->curr_row.extents is always updated
3696     when a row is read. In the future we may decide to read this on demand
3697     for rows split into many extents.
3698 */
3699 
_ma_update_block_record2(MARIA_HA * info,MARIA_RECORD_POS record_pos,const uchar * oldrec,const uchar * record,LSN undo_lsn)3700 static my_bool _ma_update_block_record2(MARIA_HA *info,
3701                                         MARIA_RECORD_POS record_pos,
3702                                         const uchar *oldrec,
3703                                         const uchar *record,
3704                                         LSN undo_lsn)
3705 {
3706   MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
3707   uchar *buff;
3708   MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
3709   MARIA_PINNED_PAGE page_link;
3710   uint rownr, org_empty_size, head_length;
3711   uint block_size= info->s->block_size;
3712   uint errpos __attribute__((unused)) = 0;
3713   uchar *dir;
3714   pgcache_page_no_t page;
3715   struct st_row_pos_info row_pos;
3716   my_bool res;
3717   ha_checksum old_checksum;
3718   MARIA_SHARE *share= info->s;
3719   DBUG_ENTER("_ma_update_block_record2");
3720   DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
3721 
3722 #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
3723   DBUG_DUMP("oldrec", oldrec, share->base.reclength);
3724   DBUG_DUMP("newrec", record, share->base.reclength);
3725 #endif
3726 
3727   /*
3728     Checksums of new and old rows were computed by callers already; new
3729     row's was put into cur_row, old row's was put into new_row.
3730   */
3731   old_checksum= new_row->checksum;
3732   new_row->checksum= cur_row->checksum;
3733   calc_record_size(info, record, new_row);
3734   page= ma_recordpos_to_page(record_pos);
3735 
3736   _ma_bitmap_flushable(info, 1);
3737   buff= pagecache_read(share->pagecache,
3738                        &info->dfile, (pgcache_page_no_t) page, 0, 0,
3739                        share->page_type,
3740                        PAGECACHE_LOCK_WRITE, &page_link.link);
3741   page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3742   page_link.changed= buff != 0;
3743   push_dynamic(&info->pinned_pages, (void*) &page_link);
3744   if (!buff)
3745     goto err;
3746 
3747   org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
3748   rownr= ma_recordpos_to_dir_entry(record_pos);
3749   dir= dir_entry_pos(buff, block_size, rownr);
3750 
3751   /*
3752     We can't use cur_row->head_length as the block may have been compacted
3753     since we read it.
3754   */
3755   head_length= uint2korr(dir + 2);
3756 
3757   if ((org_empty_size + head_length) >= new_row->total_length)
3758   {
3759     uint rec_offset, length;
3760     MARIA_BITMAP_BLOCK block;
3761 
3762     DBUG_PRINT("info", ("org_empty_size: %u  org_length: %u  new_length: %lu",
3763                         org_empty_size, head_length,
3764                         new_row->total_length));
3765 
3766     /*
3767       We can fit the new row in the same page as the original head part
3768       of the row
3769     */
3770     block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
3771                                                           org_empty_size);
3772     if (extend_area_on_page(info, buff, dir, rownr,
3773                             new_row->total_length, &org_empty_size,
3774                             &rec_offset, &length, 1))
3775     {
3776       errpos= 1;
3777       goto err;
3778     }
3779 
3780     row_pos.buff= buff;
3781     row_pos.rownr= rownr;
3782     row_pos.empty_space= org_empty_size;
3783     row_pos.dir= dir;
3784     row_pos.data= buff + rec_offset;
3785     row_pos.length= length;
3786     blocks->block= &block;
3787     blocks->count= 1;
3788     block.page= page;
3789     block.sub_blocks= 1;
3790     block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
3791     block.empty_space= row_pos.empty_space;
3792 
3793     if (*cur_row->tail_positions &&
3794         delete_tails(info, cur_row->tail_positions))
3795     {
3796       errpos= 2;
3797       goto err;
3798     }
3799     if (cur_row->extents_count && free_full_pages(info, cur_row))
3800     {
3801       errpos= 3;
3802       goto err;
3803     }
3804     res= write_block_record(info, oldrec, record, new_row, blocks,
3805                             1, &row_pos, undo_lsn, old_checksum);
3806     /* We can't update or delete this without re-reading it again */
3807     info->update&= ~HA_STATE_AKTIV;
3808     DBUG_RETURN(res);
3809   }
3810   /* Delete old row */
3811   if (*cur_row->tail_positions &&
3812       delete_tails(info, cur_row->tail_positions))
3813   {
3814     errpos= 4;
3815     goto err;
3816   }
3817   if (cur_row->extents_count && free_full_pages(info, cur_row))
3818   {
3819     errpos= 5;
3820     goto err;
3821   }
3822 
3823   head_length= uint2korr(dir + 2);
3824   if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
3825                                 org_empty_size, blocks))
3826   {
3827     errpos= 6;
3828     goto err;
3829   }
3830 
3831   /*
3832     Allocate all size in block for record
3833     TODO:
3834     Need to improve this to do compact if we can fit one more blob into
3835     the head page
3836   */
3837   if ((head_length < new_row->space_on_head_page ||
3838        (new_row->total_length <= head_length &&
3839         org_empty_size + head_length >= new_row->total_length)))
3840   {
3841     _ma_compact_block_page(share,
3842                            buff, rownr, 1,
3843                            info->trn->min_read_from,
3844                            share->base.min_block_length);
3845     org_empty_size= 0;
3846     head_length= uint2korr(dir + 2);
3847   }
3848 
3849   row_pos.buff= buff;
3850   row_pos.rownr= rownr;
3851   row_pos.empty_space= org_empty_size + head_length;
3852   row_pos.dir= dir;
3853   row_pos.data= buff + uint2korr(dir);
3854   row_pos.length= head_length;
3855   if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
3856                                &row_pos, undo_lsn, old_checksum)))
3857   {
3858     errpos= 7;
3859     goto err;
3860   }
3861   DBUG_RETURN(0);
3862 
3863 err:
3864   DBUG_ASSERT(!maria_assert_if_crashed_table);
3865   DBUG_PRINT("error", ("errpos: %d", errpos));
3866   if (info->non_flushable_state)
3867     _ma_bitmap_flushable(info, -1);
3868   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
3869   DBUG_RETURN(1);
3870 }
3871 
3872 
3873 /*
  @brief Store new row at its original position
3875 
3876   @note
  This is basically a copy of _ma_update_block_record2
3878   When we have a purge thread for deleted row, we can remove this function
3879   and use _ma_update_block_record2 instead.
3880 
3881   This is the main reason we don't make a lot of subfunctions that are
3882   common between _ma_update_block_record2() and this function.
3883 
3884   Note: If something goes wrong we mark the file crashed
3885 */
3886 
_ma_update_at_original_place(MARIA_HA * info,pgcache_page_no_t page,uint rownr,uint length_on_head_page,uint extent_count,const uchar * extent_info,const uchar * oldrec,const uchar * record,LSN undo_lsn)3887 static my_bool _ma_update_at_original_place(MARIA_HA *info,
3888                                             pgcache_page_no_t page,
3889                                             uint rownr,
3890                                             uint length_on_head_page,
3891                                             uint extent_count,
3892                                             const uchar *extent_info,
3893                                             const uchar *oldrec,
3894                                             const uchar *record,
3895                                             LSN undo_lsn)
3896 {
3897   MARIA_BITMAP_BLOCKS *blocks;
3898   MARIA_BITMAP_BLOCK *block;
3899   MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
3900   MARIA_PINNED_PAGE page_link;
3901   MARIA_SHARE *share= info->s;
3902   ha_checksum old_checksum;
3903   uint org_empty_size, empty_size;
3904   uint block_size= info->s->block_size;
3905   uchar *dir, *buff;
3906   struct st_row_pos_info row_pos;
3907   my_bool res;
3908   uint rec_offset, length;
3909   DBUG_ENTER("_ma_update_at_original_place");
3910 
3911 #ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
3912   DBUG_DUMP("oldrec", oldrec, share->base.reclength);
3913   DBUG_DUMP("newrec", record, share->base.reclength);
3914 #endif
3915 
3916   /*
3917     Checksums of new and old rows were computed by callers already; new
3918     row's was put into cur_row, old row's was put into new_row.
3919   */
3920   old_checksum= new_row->checksum;
3921   new_row->checksum= cur_row->checksum;
3922   calc_record_size(info, record, new_row);
3923 
3924   _ma_bitmap_flushable(info, 1);
3925   buff= pagecache_read(share->pagecache,
3926                        &info->dfile, (pgcache_page_no_t) page, 0, 0,
3927                        share->page_type,
3928                        PAGECACHE_LOCK_WRITE, &page_link.link);
3929   page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
3930   page_link.changed= buff != 0;
3931   push_dynamic(&info->pinned_pages, (void*) &page_link);
3932   if (!buff)
3933     goto err;
3934 
3935   org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
3936   dir= dir_entry_pos(buff, block_size, rownr);
3937 
3938   if ((org_empty_size + cur_row->head_length) < length_on_head_page)
3939   {
3940     DBUG_PRINT("error",
3941                ("org_empty_size: %u  head_length: %u  length_on_page: %u",
3942                 org_empty_size, (uint) cur_row->head_length,
3943                 length_on_head_page));
3944     _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
3945     goto err;
3946   }
3947 
3948   /*
3949     We can fit the new row in the same page as the original head part
3950     of the row
3951   */
3952   empty_size= org_empty_size;
3953   if (extend_area_on_page(info, buff, dir, rownr,
3954                           length_on_head_page, &empty_size,
3955                           &rec_offset, &length, 1))
3956     goto err;
3957 
3958   row_pos.buff= buff;
3959   row_pos.rownr= rownr;
3960   row_pos.empty_space= empty_size;
3961   row_pos.dir= dir;
3962   row_pos.data= buff + rec_offset;
3963 
3964   /* Delete old row */
3965   if (*cur_row->tail_positions &&
3966       delete_tails(info, cur_row->tail_positions))
3967     goto err;
3968   if (cur_row->extents_count && free_full_pages(info, cur_row))
3969     goto err;
3970 
3971   /* Change extent information to be usable by write_block_record() */
3972   blocks= &cur_row->insert_blocks;
3973   if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
3974     goto err;
3975   block= blocks->block;
3976   block->empty_space= row_pos.empty_space;
3977   block->org_bitmap_value=
3978     _ma_free_size_to_head_pattern(&share->bitmap,
3979                                   (enough_free_entries_on_page(share, buff) ?
3980                                    org_empty_size : 0));
3981 
3982   DBUG_ASSERT(block->org_bitmap_value ==
3983               _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
3984   block->used|= BLOCKUSED_USE_ORG_BITMAP;
3985 
3986   /*
3987     We have to use <= below as the new_row may be smaller than the original
3988     row as the new row doesn't have transaction id
3989   */
3990 
3991   DBUG_ASSERT(blocks->count > 1 ||
3992               MY_MAX(new_row->total_length, share->base.min_block_length) <=
3993               length_on_head_page);
3994 
3995   /* Store same amount of data on head page as on original page */
3996   row_pos.length= (length_on_head_page -
3997                    (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
3998   set_if_bigger(row_pos.length, share->base.min_block_length);
3999   if ((res= write_block_record(info, oldrec, record, new_row, blocks,
4000                                1, &row_pos, undo_lsn, old_checksum)))
4001     goto err;
4002   DBUG_RETURN(0);
4003 
4004 err:
4005   DBUG_ASSERT(!maria_assert_if_crashed_table);
4006   _ma_mark_file_crashed(share);
4007   if (info->non_flushable_state)
4008     _ma_bitmap_flushable(info, -1);
4009   _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
4010   DBUG_RETURN(1);
4011 }
4012 
4013 
4014 /* Wrapper for _ma_update_block_record2() used by ma_update() */
4015 
_ma_update_block_record(MARIA_HA * info,MARIA_RECORD_POS record_pos,const uchar * orig_rec,const uchar * new_rec)4016 my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
4017                                 const uchar *orig_rec, const uchar *new_rec)
4018 {
4019   return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
4020                                   LSN_ERROR);
4021 }
4022 
4023 
4024 /*
4025   Delete a directory entry
4026 
4027   SYNOPSIS
4028     delete_dir_entry()
4029     buff		Page buffer
4030     record_number	Record number to delete
4031     empty_space		Empty space on page after delete
4032 
4033   RETURN
4034     -1    Error on page
4035     0     ok
4036     1     Page is now empty
4037 */
4038 
static int delete_dir_entry(MARIA_SHARE *share,
                            uchar *buff, uint record_number,
                            uint *empty_space_res)
{
  uint block_size= share->block_size;
  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
  uint length, empty_space;
  uchar *dir;
  DBUG_ENTER("delete_dir_entry");
  DBUG_PRINT("enter", ("record_number: %u  number_of_records: %u",
                       record_number, number_of_records));

#ifdef SANITY_CHECKS
  /* Reject record numbers outside the directory or beyond what can fit */
  if (record_number >= number_of_records ||
      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
  {
    DBUG_PRINT("error", ("record_number: %u  number_of_records: %u",
                         record_number, number_of_records));

    DBUG_RETURN(-1);
  }
#endif

  check_directory(share, buff, block_size, 0, (uint) -1);
  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
  dir= dir_entry_pos(buff, block_size, record_number);
  length= uint2korr(dir + 2);  /* Length of entry we just deleted */
  DBUG_ASSERT(uint2korr(dir) != 0 && length < block_size);

  if (record_number == number_of_records - 1)
  {
    /* Delete this entry and all following free directory entries */
    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
    number_of_records--;
    dir+= DIR_ENTRY_SIZE;
    empty_space+= DIR_ENTRY_SIZE;

    /* Unlink and free the next empty ones */
    while (dir < end && dir[0] == 0 && dir[1] == 0)
    {
      /*
        Entry is free (stored offset is 0).  Unlink it from the page's
        free-entry list so the directory can shrink: dir[2] holds the
        previous free entry's number and dir[3] the next one;
        END_OF_DIR_FREE_LIST terminates the chain in either direction.
      */
      number_of_records--;
      if (dir[2] == END_OF_DIR_FREE_LIST)
        buff[DIR_FREE_OFFSET]= dir[3];      /* Entry was head of free list */
      else
      {
        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
                    number_of_records);
        prev_entry[3]= dir[3];
      }
      if (dir[3] != END_OF_DIR_FREE_LIST)
      {
        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
                    number_of_records);
        next_entry[2]= dir[2];
      }
      dir+= DIR_ENTRY_SIZE;
      empty_space+= DIR_ENTRY_SIZE;
    }

    if (number_of_records == 0)
    {
      /* All entries on page deleted */
      DBUG_PRINT("info", ("Page marked as unallocated"));
      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
      {
        /* Zero old directory so recovery reproduces byte-identical pages */
        dir= dir_entry_pos(buff, block_size, record_number);
        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
      }
#endif
      *empty_space_res= block_size;
      DBUG_RETURN(1);
    }
    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
  }
  else
  {
    /* Update directory */
    /* Not the last entry: mark it free and push it on the free list */
    dir[0]= dir[1]= 0;
    dir[2]= END_OF_DIR_FREE_LIST;
    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
    {
      /* Relink next entry to point to newly freed entry */
      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
                  next_entry[2] == END_OF_DIR_FREE_LIST);
      next_entry[2]= record_number;
    }
    buff[DIR_FREE_OFFSET]= record_number;
  }
  empty_space+= length;

  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
  /* The deleted data leaves a hole; page becomes a compaction candidate */
  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;

  *empty_space_res= empty_space;

  check_directory(share, buff, block_size, 0, empty_space);
  DBUG_RETURN(0);
}
4142 
4143 
4144 /*
  Delete a head or tail part
4146 
4147   SYNOPSIS
4148     delete_head_or_tail()
4149     info                Maria handler
4150     page                Page (not file offset!) on which the row is
4151     head                1 if this is a head page
4152     from_update		1 if we are called from update. In this case we
4153 			leave the page as write locked as we may put
4154                         the new row into the old position.
4155 
4156   RETURN
4157     0  ok
4158     1  error
4159 */
4160 
static my_bool delete_head_or_tail(MARIA_HA *info,
                                   pgcache_page_no_t page, uint record_number,
                                   my_bool head, my_bool from_update)
{
  MARIA_SHARE *share= info->s;
  uint empty_space;
  int res;
  my_bool page_is_empty;
  uchar *buff;
  LSN lsn;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock_at_write, lock_at_unpin;
  DBUG_ENTER("delete_head_or_tail");
  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
                       (ulong) ma_recordpos(page, record_number),
                       (ulong) page, record_number));

  /*
    Read the page under a write lock.  The page is registered in
    info->pinned_pages even when the read fails (changed == 0 then),
    so the caller's unpin-all logic stays uniform.
  */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0, 0,
                       share->page_type,
                       PAGECACHE_LOCK_WRITE, &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    DBUG_RETURN(1);
  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
              (head ? HEAD_PAGE : TAIL_PAGE));

  if (from_update)
  {
    /* Keep write lock: caller may put the new row into the old position */
    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
  }
  else
  {
    /* Plain delete: downgrade to read lock after the page is modified */
    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
  }

  /* Remove the directory entry; res: -1 error, 0 ok, 1 page now empty */
  res= delete_dir_entry(share, buff, record_number, &empty_space);
  if (res < 0)
    DBUG_RETURN(1);
  if (res == 0) /* after our deletion, page is still not empty */
  {
    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
    page_is_empty= 0;
    if (share->now_transactional)
    {
      /* Log REDO data */
      page_store(log_data + FILEID_STORE_SIZE, page);
      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
                   record_number);

      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
                                       LOGREC_REDO_PURGE_ROW_TAIL),
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
  }
  else /* page is now empty */
  {
    page_is_empty= 1;
    if (share->now_transactional)
    {
      /* Log that the whole page is freed (no dirpos needed) */
      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
      page_store(log_data + FILEID_STORE_SIZE, page);
      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
                                info->trn, info,
                                (translog_size_t) sizeof(log_data),
                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
                                log_data, NULL))
        DBUG_RETURN(1);
    }
    /*
      Mark that this page must be written to disk by page cache, even
      if we could call pagecache_delete() on it.
      This is needed to ensure that repair finds the empty page on disk
      and not old data.
    */
    pagecache_set_write_on_delete_by_link(page_link.link);
    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
  }

  /* Release/downgrade the lock but keep the page pinned for the caller */
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           lock_at_write,
                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 1, FALSE);
  page_link.unlock= lock_at_unpin;
  set_dynamic(&info->pinned_pages, (void*) &page_link,
              info->pinned_pages.elements-1);

  DBUG_PRINT("info", ("empty_space: %u", empty_space));

  /*
    If there is not enough space for all possible tails, mark the
    page full
  */
  if (!head && !page_is_empty && !enough_free_entries(buff, share->block_size,
                                                      1 + share->base.blobs))
    empty_space= 0;

  /* Reflect the new free space for this page in the bitmap */
  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
}
4274 
4275 
4276 /*
4277   delete all tails
4278 
4279   SYNOPSIS
4280     delete_tails()
4281     info                Handler
4282     tails               Pointer to vector of tail positions, ending with 0
4283 
4284   RETURN
4285     0  ok
4286     1  error
4287 */
4288 
delete_tails(MARIA_HA * info,MARIA_RECORD_POS * tails)4289 static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
4290 {
4291   my_bool res= 0;
4292   DBUG_ENTER("delete_tails");
4293   for (; *tails; tails++)
4294   {
4295     if (delete_head_or_tail(info,
4296                             ma_recordpos_to_page(*tails),
4297                             ma_recordpos_to_dir_entry(*tails), 0, 1))
4298       res= 1;
4299   }
4300   DBUG_RETURN(res);
4301 }
4302 
4303 
4304 /*
4305   Delete a record
4306 
4307   NOTES
4308    For the moment, we assume that info->cur_row.extents is always updated
4309    when a row is read. In the future we may decide to read this on demand
4310    for rows with many splits.
4311 */
4312 
my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
{
  pgcache_page_no_t page;
  uint record_number;
  MARIA_SHARE *share= info->s;
  LSN lsn= LSN_IMPOSSIBLE;
  DBUG_ENTER("_ma_delete_block_record");

  /* Split the rowid of the last-read row into page and directory entry */
  page=          ma_recordpos_to_page(info->cur_row.lastpos);
  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
                       (ulong) page, record_number));

  /* Protect the bitmap from being flushed while the row is half-deleted */
  _ma_bitmap_flushable(info, 1);
  /* Delete head part, then all tail parts of the row */
  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  /* Free any full (blob/extent) pages belonging to the row */
  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  if (share->now_transactional)
  {
    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
                   HA_CHECKSUM_STORE_SIZE];
    uchar *log_pos;
    size_t row_length;
    uint row_parts_count, extents_length;
    ha_checksum checksum_delta;

    /* Write UNDO record */
    /*
      Fixed part: previous undo LSN, rowid (page + dirpos), head length
      excluding the row header, and the extent count.
    */
    lsn_store(log_data, info->trn->undo_lsn);
    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
    dirpos_store(log_pos, record_number);
    log_pos+= DIRPOS_STORE_SIZE;
    int2store(log_pos, info->cur_row.head_length -
              info->cur_row.header_length);
    log_pos+= 2;
    pagerange_store(log_pos, info->cur_row.extents_count);
    log_pos+= PAGERANGE_STORE_SIZE;

    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
    /* Checksum delta of the delete is the negated row checksum */
    store_checksum_in_rec(share, checksum_delta,
                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                              0].length);
    /* Second part: the row's extent descriptors */
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
      info->cur_row.extents;
    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;

    /* Remaining parts: the row data itself, so UNDO can re-insert it */
    row_length= fill_insert_undo_parts(info, record,
                                       (info->log_row_parts +
                                        TRANSLOG_INTERNAL_PARTS + 2),
                                       &row_parts_count);

    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
                              info,
                              (translog_size_t)
                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
                                                   0].length + row_length +
                               extents_length),
                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
                              info->log_row_parts,
                              log_data + LSN_STORE_SIZE,
                              &checksum_delta))
      goto err;
  }

  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  DBUG_RETURN(1);
}
4396 
4397 
4398 /****************************************************************************
4399   Reading of records
4400 ****************************************************************************/
4401 
4402 /*
4403   Read position to record from record directory at end of page
4404 
4405   SYNOPSIS
4406    get_record_position()
4407    buff                 page buffer
4408    block_size           block size for page
4409    record_number        Record number in index
4410    end_of_data          pointer to end of data for record
4411 
4412   RETURN
4413     0  Error in data
4414     #  Pointer to start of record.
4415        In this case *end_of_data is set.
4416 */
4417 
get_record_position(MARIA_SHARE * share,uchar * buff,uint record_number,uchar ** end_of_data)4418 static uchar *get_record_position(MARIA_SHARE *share, uchar *buff,
4419                                  uint record_number, uchar **end_of_data)
4420 {
4421   uint block_size= share->block_size;
4422   uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
4423   uchar *dir;
4424   uchar *data;
4425   uint offset, length;
4426 
4427 #ifdef SANITY_CHECKS
4428   if (record_number >= number_of_records ||
4429       record_number > ((block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE)
4430                        / DIR_ENTRY_SIZE))
4431   {
4432     DBUG_PRINT("error",
4433                ("Wrong row number: record_number: %u  number_of_records: %u",
4434                 record_number, number_of_records));
4435     return 0;
4436   }
4437 #endif
4438 
4439   dir= dir_entry_pos(buff, block_size, record_number);
4440   offset= uint2korr(dir);
4441   length= uint2korr(dir + 2);
4442 #ifdef SANITY_CHECKS
4443   if (offset < PAGE_HEADER_SIZE(share) ||
4444       offset + length > (block_size -
4445                          number_of_records * DIR_ENTRY_SIZE -
4446                          PAGE_SUFFIX_SIZE))
4447   {
4448     DBUG_PRINT("error",
4449                ("Wrong row position:  record_number: %u  offset: %u  "
4450                 "length: %u  number_of_records: %u",
4451                 record_number, offset, length, number_of_records));
4452     return 0;
4453   }
4454 #endif
4455   data= buff + offset;
4456   *end_of_data= data + length;
4457   return data;
4458 }
4459 
4460 
4461 /*
4462   Init extent
4463 
4464   NOTES
4465     extent is a cursor over which pages to read
4466 */
4467 
/*
  Position an extent cursor on the first extent of a row.

  extent_info points at the packed extent list (ROW_EXTENT_SIZE bytes
  per entry: page number + page-count word).  The page-count word may
  carry TAIL_BIT (the extent references a tail row) and
  START_EXTENT_BIT, which is masked away here.
*/
static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
                        uint extents, MARIA_RECORD_POS *tail_positions)
{
  uint count_word;
  extent->extent=              extent_info;
  extent->extent_count=        extents;
  extent->tail_positions=      tail_positions;
  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;

  extent->page= page_korr(extent_info);                 /* First extent */
  count_word=   (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
                 ~START_EXTENT_BIT);
  if ((extent->tail= count_word & TAIL_BIT))
  {
    /* A tail reference spans one page; remaining bits are the row number */
    extent->page_count=  1;
    extent->tail_row_nr= count_word & ~TAIL_BIT;
  }
  else
    extent->page_count= count_word;
}
4488 
4489 
4490 /*
4491   Read next extent
4492 
4493   SYNOPSIS
4494     read_next_extent()
4495     info                Maria handler
4496     extent              Pointer to current extent (this is updated to point
4497                         to next)
4498     end_of_data         Pointer to end of data in read block (out)
4499 
4500   NOTES
4501     New block is read into info->buff
4502 
4503   RETURN
4504     0   Error;  my_errno is set
4505     #   Pointer to start of data in read block
4506         In this case end_of_data is updated to point to end of data.
4507 */
4508 
static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
                              uchar **end_of_data)
{
  MARIA_SHARE *share= info->s;
  uchar *buff, *data;
  MARIA_PINNED_PAGE page_link;
  enum pagecache_page_lock lock;
  DBUG_ENTER("read_next_extent");

  if (!extent->page_count)
  {
    /* Current extent exhausted; advance to the next extent descriptor */
    uint page_count;
    if (!--extent->extent_count)
      goto crashed;
    extent->extent+=    ROW_EXTENT_SIZE;
    extent->page=       page_korr(extent->extent);
    page_count=         (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
                         ~START_EXTENT_BIT);
    if (!page_count)
      goto crashed;
    extent->tail=       page_count & TAIL_BIT;
    if (extent->tail)
      extent->tail_row_nr= page_count & ~TAIL_BIT;
    else
      extent->page_count= page_count;
    DBUG_PRINT("info",("New extent.  Page: %lu  page_count: %u  tail_flag: %d",
                       (ulong) extent->page, extent->page_count,
                       extent->tail != 0));
  }
  extent->first_extent= 0;

  /* Tail pages may need a stronger lock (e.g. when reading during UNDO) */
  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
  if (extent->tail)
    lock= extent->lock_for_tail_pages;

  buff= pagecache_read(share->pagecache,
                       &info->dfile, extent->page, 0,
                       info->buff, share->page_type,
                       lock, &page_link.link);
  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
  {
    /* Read during UNDO */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= buff != 0;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  if (!buff)
  {
    /* check if we tried to read over end of file (ie: bad data in record) */
    if ((extent->page + 1) * share->block_size >
        share->state.state.data_file_length)
      goto crashed;
    DBUG_RETURN(0);
  }

  if (!extent->tail)
  {
    /* Full data page */
    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
      goto crashed;
    extent->page++;                             /* point to next page */
    extent->page_count--;
    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
    info->cur_row.full_page_count++;            /* For maria_chk */
    DBUG_RETURN(extent->data_start= buff + FULL_PAGE_HEADER_SIZE(share));
  }

  /* Found tail */
  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
    goto crashed;
  /* Record the tail's position so delete/update can find it later */
  *(extent->tail_positions++)= ma_recordpos(extent->page,
                                            extent->tail_row_nr);
  info->cur_row.tail_count++;                   /* For maria_chk */

  if (!(data= get_record_position(share, buff,
                                  extent->tail_row_nr,
                                  end_of_data)))
    goto crashed;
  extent->data_start= data;
  extent->page_count= 0;                        /* No more data in extent */
  DBUG_RETURN(data);


crashed:
  /* Extent data inconsistent with the page; mark the table corrupted */
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_PRINT("error", ("wrong extent information"));
  DBUG_RETURN(0);
}
4598 
4599 
4600 /*
4601   Read data that may be split over many blocks
4602 
4603   SYNOPSIS
4604     read_long_data()
4605     info                Maria handler
4606     to                  Store result string here (this is allocated)
4607     extent              Pointer to current extent position
4608     data                Current position in buffer
4609     end_of_data         End of data in buffer
4610 
4611   NOTES
4612     When we have to read a new buffer, it's read into info->buff
4613 
4614     This loop is implemented by goto's instead of a for() loop as
    the code is notably smaller and faster this way (and it's not nice
4616     to jump into a for loop() or into a 'then' clause)
4617 
4618   RETURN
4619     0   ok
4620     1   error
4621 */
4622 
read_long_data2(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4623 static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
4624                               MARIA_EXTENT_CURSOR *extent,
4625                               uchar **data, uchar **end_of_data)
4626 {
4627   uint left_length= (uint) (*end_of_data - *data);
4628   DBUG_ENTER("read_long_data2");
4629   DBUG_PRINT("enter", ("length: %lu  left_length: %u",
4630                        length, left_length));
4631   DBUG_ASSERT(*data <= *end_of_data);
4632 
4633   /*
4634     Fields are never split in middle. This means that if length > rest-of-data
4635     we should start reading from the next extent.  The reason we may have
4636     data left on the page is that if the fixed part of the row was less than
4637     min_block_length the head block was extended to min_block_length.
4638 
4639     This may change in the future, which is why we have the loop written
4640     the way it's written.
4641   */
4642   if (extent->first_extent && length > left_length)
4643   {
4644     *end_of_data= *data;
4645     left_length= 0;
4646   }
4647 
4648   for(;;)
4649   {
4650     if (unlikely(left_length >= length))
4651     {
4652       memcpy(to, *data, length);
4653       (*data)+= length;
4654       DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
4655       DBUG_RETURN(0);
4656     }
4657     memcpy(to, *data, left_length);
4658     to+= left_length;
4659     length-= left_length;
4660     if (!(*data= read_next_extent(info, extent, end_of_data)))
4661       break;
4662     left_length= (uint) (*end_of_data - *data);
4663   }
4664   DBUG_RETURN(1);
4665 }
4666 
read_long_data(MARIA_HA * info,uchar * to,ulong length,MARIA_EXTENT_CURSOR * extent,uchar ** data,uchar ** end_of_data)4667 static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
4668                               MARIA_EXTENT_CURSOR *extent,
4669                               uchar **data, uchar **end_of_data)
4670 {
4671   uint left_length= (uint) (*end_of_data - *data);
4672   if (likely(left_length >= length))
4673   {
4674     memcpy(to, *data, length);
4675     (*data)+= length;
4676     return 0;
4677   }
4678   return read_long_data2(info, to, length, extent, data, end_of_data);
4679 }
4680 
4681 
4682 /*
4683   Read a record from page (helper function for _ma_read_block_record())
4684 
4685   SYNOPSIS
4686     _ma_read_block_record2()
4687     info                Maria handler
4688     record              Store record here
4689     data                Start of head data for row
4690     end_of_data         End of data for row
4691 
4692   NOTES
4693     The head page is already read by caller
4694     Following data is update in info->cur_row:
4695 
4696     cur_row.head_length is set to size of entry in head block
4697     cur_row.tail_positions is set to point to all tail blocks
4698     cur_row.extents points to extents data
4699     cur_row.extents_counts contains number of extents
4700     cur_row.empty_bits is set to empty bits
4701     cur_row.field_lengths contains packed length of all fields
4702     cur_row.blob_length contains total length of all blobs
4703     cur_row.checksum contains checksum of read record.
4704 
4705    RETURN
4706      0  ok
4707      #  Error code
4708 */
4709 
_ma_read_block_record2(MARIA_HA * info,uchar * record,uchar * data,uchar * end_of_data)4710 int _ma_read_block_record2(MARIA_HA *info, uchar *record,
4711                            uchar *data, uchar *end_of_data)
4712 {
4713   MARIA_SHARE *share= info->s;
4714   uchar *UNINIT_VAR(field_length_data), *UNINIT_VAR(blob_buffer), *start_of_data;
4715   uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
4716   my_bool found_blob= 0;
4717   MARIA_EXTENT_CURSOR extent;
4718   MARIA_COLUMNDEF *column, *end_column;
4719   MARIA_ROW *cur_row= &info->cur_row;
4720   DBUG_ENTER("_ma_read_block_record2");
4721 
4722   start_of_data= data;
4723   flag= (uint) (uchar) data[0];
4724   cur_null_bytes= share->base.original_null_bytes;
4725   null_bytes=     share->base.null_bytes;
4726   cur_row->head_length= (uint) (end_of_data - data);
4727   cur_row->full_page_count= cur_row->tail_count= 0;
4728   cur_row->blob_length= 0;
4729   /* Number of bytes in header that we don't need to write during undo */
4730   cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
4731 
4732   if (flag & ROW_FLAG_TRANSID)
4733   {
4734     cur_row->trid= transid_korr(data+1);
4735     if (!info->trn)
4736     {
4737       /* File crashed */
4738       DBUG_ASSERT(!maria_assert_if_crashed_table);
4739       _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
4740       DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
4741     }
4742     if (!trnman_can_read_from(info->trn, cur_row->trid))
4743       DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
4744   }
4745 
4746   /* Skip trans header (for now, until we have MVCC csupport) */
4747   data+= cur_row->header_length + 1 ;
4748   if (flag & ROW_FLAG_NULLS_EXTENDED)
4749     cur_null_bytes+= data[-1];
4750 
4751   row_extents= 0;
4752   if (flag & ROW_FLAG_EXTENTS)
4753   {
4754     uint row_extent_size;
4755     /*
4756       Record is split over many data pages.
4757       Get number of extents and first extent
4758     */
4759     get_key_length(row_extents, data);
4760     cur_row->extents_count= row_extents;
4761     row_extent_size= row_extents * ROW_EXTENT_SIZE;
4762     if (cur_row->extents_buffer_length < row_extent_size &&
4763         _ma_alloc_buffer(&cur_row->extents,
4764                          &cur_row->extents_buffer_length,
4765                          row_extent_size))
4766       DBUG_RETURN(my_errno);
4767     memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
4768     data+= ROW_EXTENT_SIZE;
4769     init_extent(&extent, cur_row->extents, row_extents,
4770                 cur_row->tail_positions);
4771   }
4772   else
4773   {
4774     cur_row->extents_count= 0;
4775     (*cur_row->tail_positions)= 0;
4776     extent.page_count= 0;
4777     extent.extent_count= 1;
4778   }
4779   extent.first_extent= 1;
4780 
4781   field_lengths= 0;
4782   if (share->base.max_field_lengths)
4783   {
4784     get_key_length(field_lengths, data);
4785     cur_row->field_lengths_length= field_lengths;
4786 #ifdef SANITY_CHECKS
4787     if (field_lengths > share->base.max_field_lengths)
4788       goto err;
4789 #endif
4790   }
4791 
4792   if (share->calc_checksum)
4793     cur_row->checksum= (uint) (uchar) *data++;
4794   /* data now points on null bits */
4795   memcpy(record, data, cur_null_bytes);
4796   if (unlikely(cur_null_bytes != null_bytes))
4797   {
4798     /*
4799       This only happens if we have added more NULL columns with
4800       ALTER TABLE and are fetching an old, not yet modified old row
4801     */
4802     bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
4803   }
4804   data+= null_bytes;
4805   /* We copy the empty bits to be able to use them for delete/update */
4806   memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
4807   data+= share->base.pack_bytes;
4808 
4809   /* TODO: Use field offsets, instead of just skipping them */
4810   data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
4811 
4812   /*
4813     Read row extents (note that first extent was already read into
4814     cur_row->extents above)
4815   */
4816   if (row_extents > 1)
4817   {
4818     if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
4819                        (row_extents - 1) * ROW_EXTENT_SIZE,
4820                        &extent, &data, &end_of_data))
4821       DBUG_RETURN(my_errno);
4822   }
4823 
4824   /*
4825     Data now points to start of fixed length field data that can't be null
4826     or 'empty'. Note that these fields can't be split over blocks.
4827   */
4828   for (column= share->columndef,
4829          end_column= column + share->base.fixed_not_null_fields;
4830        column < end_column; column++)
4831   {
4832     uint column_length= column->length;
4833     if (data + column_length > end_of_data &&
4834         !(data= read_next_extent(info, &extent, &end_of_data)))
4835       goto err;
4836     memcpy(record + column->offset, data, column_length);
4837     data+= column_length;
4838   }
4839 
4840   /* Read array of field lengths. This may be stored in several extents */
4841   if (field_lengths)
4842   {
4843     field_length_data= cur_row->field_lengths;
4844     if (read_long_data(info, field_length_data, field_lengths, &extent,
4845                        &data, &end_of_data))
4846       DBUG_RETURN(my_errno);
4847   }
4848 
4849   /* Read variable length data. Each of these may be split over many extents */
4850   for (end_column= share->columndef + share->base.fields;
4851        column < end_column; column++)
4852   {
4853     enum en_fieldtype type= column->type;
4854     uchar *field_pos= record + column->offset;
4855     /* First check if field is present in record */
4856     if ((record[column->null_pos] & column->null_bit) ||
4857         (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
4858     {
4859       bfill(record + column->offset, column->fill_length,
4860             type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
4861       continue;
4862     }
4863     switch (type) {
4864     case FIELD_NORMAL:                          /* Fixed length field */
4865     case FIELD_SKIP_PRESPACE:
4866     case FIELD_SKIP_ZERO:                       /* Fixed length field */
4867       if (data + column->length > end_of_data &&
4868           !(data= read_next_extent(info, &extent, &end_of_data)))
4869         goto err;
4870       memcpy(field_pos, data, column->length);
4871       data+= column->length;
4872       break;
4873     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
4874     {
4875       /* Char that is space filled */
4876       uint length;
4877       if (column->length <= 255)
4878         length= (uint) (uchar) *field_length_data++;
4879       else
4880       {
4881         length= uint2korr(field_length_data);
4882         field_length_data+= 2;
4883       }
4884 #ifdef SANITY_CHECKS
4885       if (length > column->length)
4886         goto err;
4887 #endif
4888       if (read_long_data(info, field_pos, length, &extent, &data,
4889                          &end_of_data))
4890         DBUG_RETURN(my_errno);
4891       bfill(field_pos + length, column->length - length, ' ');
4892       break;
4893     }
4894     case FIELD_VARCHAR:
4895     {
4896       ulong length;
4897       if (column->length <= 256)
4898       {
4899         length= (uint) (uchar) (*field_pos++= *field_length_data++);
4900       }
4901       else
4902       {
4903         length= uint2korr(field_length_data);
4904         field_pos[0]= field_length_data[0];
4905         field_pos[1]= field_length_data[1];
4906         field_pos+= 2;
4907         field_length_data+= 2;
4908       }
4909 #ifdef SANITY_CHECKS
4910       if (length > column->length)
4911         goto err;
4912 #endif
4913       if (read_long_data(info, field_pos, length, &extent, &data,
4914                          &end_of_data))
4915         DBUG_RETURN(my_errno);
4916       break;
4917     }
4918     case FIELD_BLOB:
4919     {
4920       uint column_size_length= column->length - portable_sizeof_char_ptr;
4921       ulong blob_length= _ma_calc_blob_length(column_size_length,
4922                                               field_length_data);
4923 
4924       if (!found_blob)
4925       {
4926         /* Calculate total length for all blobs */
4927         ulong blob_lengths= 0;
4928         uchar *length_data= field_length_data;
4929         MARIA_COLUMNDEF *blob_field= column;
4930 
4931         found_blob= 1;
4932         for (; blob_field < end_column; blob_field++)
4933         {
4934           uint size_length;
4935           if ((record[blob_field->null_pos] & blob_field->null_bit) ||
4936               (cur_row->empty_bits[blob_field->empty_pos] &
4937                blob_field->empty_bit))
4938             continue;
4939           size_length= blob_field->length - portable_sizeof_char_ptr;
4940           blob_lengths+= _ma_calc_blob_length(size_length, length_data);
4941           length_data+= size_length;
4942         }
4943         cur_row->blob_length= blob_lengths;
4944         DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
4945         if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
4946                              blob_lengths))
4947           DBUG_RETURN(my_errno);
4948         blob_buffer= info->blob_buff;
4949       }
4950 
4951       memcpy(field_pos, field_length_data, column_size_length);
4952       memcpy(field_pos + column_size_length, (uchar *) &blob_buffer,
4953              sizeof(char*));
4954       field_length_data+= column_size_length;
4955 
4956       /*
4957         After we have read one extent, then each blob is in it's own extent
4958       */
4959       if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
4960         end_of_data= data;                      /* Force read of next extent */
4961 
4962       if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
4963                          &end_of_data))
4964         DBUG_RETURN(my_errno);
4965       blob_buffer+= blob_length;
4966       break;
4967     }
4968     default:
4969 #ifdef EXTRA_DEBUG
4970       DBUG_ASSERT(0);                           /* purecov: deadcode */
4971 #endif
4972       goto err;
4973     }
4974     continue;
4975   }
4976 
4977   if (row_extents)
4978   {
4979     DBUG_PRINT("info", ("Row read:  page_count: %u  extent_count: %u",
4980                         extent.page_count, extent.extent_count));
4981     *extent.tail_positions= 0;                  /* End marker */
4982     if (extent.page_count)
4983       goto err;
4984     if (extent.extent_count > 1)
4985     {
4986       if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
4987                             (extent.extent_count-1) * ROW_EXTENT_SIZE))
4988       {
4989         DBUG_PRINT("error", ("Data in extent is not zero"));
4990         DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
4991                   (extent.extent_count-1) * ROW_EXTENT_SIZE);
4992         goto err;
4993       }
4994     }
4995   }
4996   else
4997   {
4998     DBUG_PRINT("info", ("Row read"));
4999     /*
      data should normally point to end_of_data. The only exception is if
5001       the row is very short in which case we allocated 'min_block_length' data
5002       for allowing the row to expand.
5003     */
5004     if (data != end_of_data && (uint) (end_of_data - start_of_data) >
5005         share->base.min_block_length)
5006       goto err;
5007   }
5008 #ifdef EXTRA_DEBUG
5009   if (share->calc_checksum && !info->in_check_table)
5010   {
    /* Ensure that the row checksum is correct */
5012     DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
5013                 cur_row->checksum);
5014   }
5015 #endif
5016   info->update|= HA_STATE_AKTIV;	/* We have an active record */
5017   DBUG_RETURN(0);
5018 
5019 err:
5020   DBUG_ASSERT(!maria_assert_if_crashed_table);
5021   /* Something was wrong with data on record */
5022   DBUG_PRINT("error", ("Found record with wrong data"));
5023   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
5024   DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
5025 }
5026 
5027 
5028 /** @brief Read positions to tail blocks and full blocks
5029 
5030   @fn    read_row_extent_info()
5031   @param info	Handler
5032 
5033   @notes
5034     This function is a simpler version of _ma_read_block_record2()
5035     The data about the used pages is stored in info->cur_row.
5036 
5037   @return Status
5038   @retval 0   ok
5039   @retval 1   Error. my_errno contains error number
5040 */
5041 
static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
                                    uint record_number)
{
  MARIA_SHARE *share= info->s;
  MARIA_EXTENT_CURSOR extent;
  MARIA_RECORD_POS *tail_pos;
  uchar *data, *end_of_data;
  uint flag, row_extents, row_extents_size;
  uint field_lengths __attribute__ ((unused));
  uchar *extents, *end;
  DBUG_ENTER("read_row_extent_info");

  /* Locate the row inside the page buffer via its directory entry */
  if (!(data= get_record_position(share, buff,
                                  record_number, &end_of_data)))
    DBUG_RETURN(1);                             /* Wrong in record */

  /* First header byte holds the row flags */
  flag= (uint) (uchar) data[0];
  /* Skip trans header */
  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];

  row_extents= 0;
  row_extents_size= 0;
  if (flag & ROW_FLAG_EXTENTS)
  {
    /*
      Record is split over many data pages.
      Get number of extents and first extent
    */
    get_key_length(row_extents, data);
    row_extents_size= row_extents * ROW_EXTENT_SIZE;
    /* Grow the extent buffer if the current one is too small */
    if (info->cur_row.extents_buffer_length < row_extents_size &&
        _ma_alloc_buffer(&info->cur_row.extents,
                         &info->cur_row.extents_buffer_length,
                         row_extents_size))
      DBUG_RETURN(1);
    /* The first extent is stored inline in the row header */
    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
    data+= ROW_EXTENT_SIZE;
    init_extent(&extent, info->cur_row.extents, row_extents,
                info->cur_row.tail_positions);
    extent.first_extent= 1;
  }
  info->cur_row.extents_count= row_extents;

  /*
    field_lengths itself is unused; get_key_length() is called here only
    for its side effect of advancing 'data' past the stored length value,
    which is required as 'data' is used later.
  */
  if (share->base.max_field_lengths)
    get_key_length(field_lengths, data);

  if (share->calc_checksum)
    info->cur_row.checksum= (uint) (uchar) *data++;
  if (row_extents > 1)
  {
    /* Skip null bits, empty bits and field offsets to reach extent data */
    data+= share->base.null_bytes;
    data+= share->base.pack_bytes;
    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;

    /*
      Read row extents (note that first extent was already read into
      info->cur_row.extents above)
      Lock tails with write lock as we will delete them later.
    */
    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
                       row_extents_size - ROW_EXTENT_SIZE,
                       &extent, &data, &end_of_data))
      DBUG_RETURN(1);
  }

  /* Update tail_positions with pointer to tails */
  tail_pos= info->cur_row.tail_positions;
  for (extents= info->cur_row.extents, end= extents + row_extents_size;
       extents < end;
       extents+= ROW_EXTENT_SIZE)
  {
    /* Each extent is a 5-byte page number followed by a 2-byte page count */
    pgcache_page_no_t page=  uint5korr(extents);
    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
    /* Only extents with TAIL_BIT set point at tail pages */
    if (page_count & TAIL_BIT)
      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
                                                         START_EXTENT_BIT)));
  }
  *tail_pos= 0;                               /* End marker */
  DBUG_RETURN(0);
}
5127 
5128 
5129 /*
5130   Read a record based on record position
5131 
5132   @fn     _ma_read_block_record()
5133   @param info                Maria handler
5134   @param record              Store record here
5135   @param record_pos          Record position
5136 
5137   @return Status
5138   @retval 0  ok
5139   @retval #  Error number
5140 */
5141 
int _ma_read_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos)
{
  MARIA_SHARE *share= info->s;
  uchar *page_buff, *row_data, *row_end;
  pgcache_page_no_t page;
  uint dir_entry, page_type;
  DBUG_ENTER("_ma_read_block_record");

  page=      ma_recordpos_to_page(record_pos);
  dir_entry= ma_recordpos_to_dir_entry(record_pos);
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) record_pos, (ulong) page, dir_entry));

  /* Fetch the head page that the rowid points into */
  page_buff= pagecache_read(share->pagecache, &info->dfile, page, 0,
                            info->buff, share->page_type,
                            PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
  if (!page_buff)
    DBUG_RETURN(my_errno);

  /*
    Unallocated page access can happen if this is an access to a page where
    all rows where deleted as part of this statement.
  */
  page_type= (page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
  DBUG_ASSERT(page_type == HEAD_PAGE || page_type == UNALLOCATED_PAGE);

  if (page_type == UNALLOCATED_PAGE ||
      !(row_data= get_record_position(share, page_buff, dir_entry, &row_end)))
  {
    DBUG_ASSERT(!maria_assert_if_crashed_table);
    DBUG_PRINT("warning", ("Wrong directory entry in data block"));
    my_errno= HA_ERR_RECORD_DELETED;           /* File crashed */
    DBUG_RETURN(HA_ERR_RECORD_DELETED);
  }
  /* Unpack the row into 'record' */
  DBUG_RETURN(_ma_read_block_record2(info, record, row_data, row_end));
}
5181 
5182 
5183 /* compare unique constraint between stored rows */
5184 
_ma_cmp_block_unique(MARIA_HA * info,MARIA_UNIQUEDEF * def,const uchar * record,MARIA_RECORD_POS pos)5185 my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
5186                              const uchar *record, MARIA_RECORD_POS pos)
5187 {
5188   uchar *org_rec_buff, *old_record;
5189   size_t org_rec_buff_size;
5190   int error;
5191   DBUG_ENTER("_ma_cmp_block_unique");
5192 
5193   /*
5194     Don't allocate more than 16K on the stack to ensure we don't get
5195     stack overflow.
5196   */
5197   if (!(old_record= my_safe_alloca(info->s->base.reclength)))
5198     DBUG_RETURN(1);
5199 
5200   /* Don't let the compare destroy blobs that may be in use */
5201   org_rec_buff=      info->rec_buff;
5202   org_rec_buff_size= info->rec_buff_size;
5203   if (info->s->base.blobs)
5204   {
5205     /* Force realloc of record buffer*/
5206     info->rec_buff= 0;
5207     info->rec_buff_size= 0;
5208   }
5209   error= _ma_read_block_record(info, old_record, pos);
5210   if (!error)
5211     error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
5212   if (info->s->base.blobs)
5213   {
5214     my_free(info->rec_buff);
5215     info->rec_buff=      org_rec_buff;
5216     info->rec_buff_size= org_rec_buff_size;
5217   }
5218   DBUG_PRINT("exit", ("result: %d", error));
5219   my_safe_afree(old_record, info->s->base.reclength);
5220   DBUG_RETURN(error != 0);
5221 }
5222 
5223 
5224 /****************************************************************************
5225   Table scan
5226 ****************************************************************************/
5227 
5228 /*
5229   Allocate buffers for table scan
5230 
5231   SYNOPSIS
5232    _ma_scan_init_block_record(MARIA_HA *info)
5233 
5234   IMPLEMENTATION
5235     We allocate one buffer for the current bitmap and one buffer for the
5236     current page
5237 
5238   RETURN
5239     0  ok
5240     1  error (couldn't allocate memory or disk error)
5241 */
5242 
_ma_scan_init_block_record(MARIA_HA * info)5243 my_bool _ma_scan_init_block_record(MARIA_HA *info)
5244 {
5245   MARIA_SHARE *share= info->s;
5246   DBUG_ENTER("_ma_scan_init_block_record");
5247   DBUG_ASSERT(info->dfile.file == share->bitmap.file.file);
5248 
5249   /*
5250     bitmap_buff may already be allocated if this is the second call to
5251     rnd_init() without a rnd_end() in between, see sql/handler.h
5252   */
5253   if (!(info->scan.bitmap_buff ||
5254         ((info->scan.bitmap_buff=
5255           (uchar *) my_malloc(share->block_size * 2, MYF(MY_WME))))))
5256     DBUG_RETURN(1);
5257   info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
5258   info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.max_total_size;
5259 
5260   /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
5261   info->scan.number_of_rows= 0;
5262   info->scan.bitmap_pos= info->scan.bitmap_end;
5263   info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
5264   info->scan.max_page= share->state.state.data_file_length / share->block_size;
5265   /*
5266     We need to flush what's in memory (bitmap.map) to page cache otherwise, as
5267     we are going to read bitmaps from page cache in table scan (see
5268     _ma_scan_block_record()), we may miss recently inserted rows (bitmap page
5269     in page cache would be too old).
5270   */
5271   DBUG_RETURN(_ma_bitmap_flush(info->s));
5272 }
5273 
5274 
5275 /* Free buffers allocated by _ma_scan_block_init() */
5276 
void _ma_scan_end_block_record(MARIA_HA *info)
{
  DBUG_ENTER("_ma_scan_end_block_record");
  /* Release the combined bitmap + page buffer from _ma_scan_init_block_record */
  my_free(info->scan.bitmap_buff);
  info->scan.bitmap_buff= NULL;
  /* Drop any position saved by _ma_scan_remember_block_record() */
  if (info->scan_save != NULL)
  {
    my_free(info->scan_save);
    info->scan_save= NULL;
  }
  DBUG_VOID_RETURN;
}
5289 
5290 
5291 /**
5292   @brief Save current scan position
5293 
5294   @note
5295   For the moment we can only remember one position, but this is
5296   good enough for MySQL usage
5297 
5298   @return
5299   @retval 0			  ok
5300   @retval HA_ERR_WRONG_IN_RECORD  Could not allocate memory to hold position
5301 */
5302 
int _ma_scan_remember_block_record(MARIA_HA *info,
                                   MARIA_RECORD_POS *lastpos)
{
  uchar *save_buff;
  DBUG_ENTER("_ma_scan_remember_block_record");

  /*
    Lazily allocate the save area: the scan struct followed (in the same
    allocation) by room for the bitmap and page buffers.
  */
  if (!info->scan_save)
  {
    size_t struct_size= ALIGN_SIZE(sizeof(*info->scan_save));
    if (!(info->scan_save= my_malloc(struct_size + info->s->block_size * 2,
                                     MYF(MY_WME))))
      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
    info->scan_save->bitmap_buff= (uchar*) info->scan_save + struct_size;
  }
  /* For checking if pages have changed since we last read it */
  info->scan.row_changes= info->row_changes;

  /*
    Snapshot the scan state (used bitmap and used head page); the save
    area keeps its own buffer pointer, so restore it after the memcpy.
  */
  save_buff= info->scan_save->bitmap_buff;
  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
  info->scan_save->bitmap_buff= save_buff;
  memcpy(save_buff, info->scan.bitmap_buff, info->s->block_size * 2);

  /* Point to the last read row */
  *lastpos= info->cur_row.nextpos - 1;
  info->scan_save->dir+= DIR_ENTRY_SIZE;
  DBUG_RETURN(0);
}
5331 
5332 
5333 /**
   @brief restore scan block to its original values
5335 
5336    @return
5337    0 ok
5338    # error
5339 
5340    @note
5341    In theory we could swap bitmap buffers instead of copy them.
5342    For the moment we don't do that because there are variables pointing
5343    inside the buffers and it's a bit of hassle to either make them relative
5344    or repoint them.
5345 
5346    If the data file has changed, we will re-read the new block record
5347    to ensure that when we continue scanning we can ignore any deleted rows.
5348 */
5349 
int _ma_scan_restore_block_record(MARIA_HA *info,
                                  MARIA_RECORD_POS lastpos)
{
  uchar *keep_buff;
  DBUG_ENTER("_ma_scan_restore_block_record");

  info->cur_row.nextpos= lastpos;

  /*
    Copy the saved scan state back, but keep (and refill) the live
    bitmap/page buffer instead of adopting the save area's pointer.
  */
  keep_buff= info->scan.bitmap_buff;
  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
  info->scan.bitmap_buff= keep_buff;
  memcpy(keep_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);

  if (info->scan.row_changes != info->row_changes)
  {
    /*
      Table has been changed. We have to re-read the current page block as
      data may have changed on it that we have to see.
    */
    uchar *page= pagecache_read(info->s->pagecache,
                                &info->dfile,
                                ma_recordpos_to_page(info->scan.row_base_page),
                                0, info->scan.page_buff,
                                info->s->page_type,
                                PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
    if (!page)
      DBUG_RETURN(my_errno);
    /* Refresh row count and directory end from the re-read page */
    info->scan.number_of_rows=
      (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET];
    info->scan.dir_end= (info->scan.page_buff + info->s->block_size -
                         PAGE_SUFFIX_SIZE -
                         info->scan.number_of_rows * DIR_ENTRY_SIZE);
  }
  DBUG_RETURN(0);
}
5383 
5384 
5385 /*
5386   Read next record while scanning table
5387 
5388   SYNOPSIS
5389     _ma_scan_block_record()
5390     info                Maria handler
5391     record              Store found here
5392     record_pos          Value stored in info->cur_row.next_pos after last call
5393                         This is offset inside the current pagebuff
5394     skip_deleted
5395 
5396   NOTES
5397     - One must have called mi_scan() before this
    - In this version, we don't actually need record_pos; we could as easily
      use a variable in info->scan
5400 
5401   IMPLEMENTATION
5402     Current code uses a lot of goto's to separate the different kind of
5403     states we may be in. This gives us a minimum of executed if's for
5404     the normal cases.  I tried several different ways to code this, but
5405     the current one was in the end the most readable and fastest.
5406 
5407   RETURN
5408     0   ok
5409     #   Error code  (Normally HA_ERR_END_OF_FILE)
5410 */
5411 
int _ma_scan_block_record(MARIA_HA *info, uchar *record,
                          MARIA_RECORD_POS record_pos,
                          my_bool skip_deleted __attribute__ ((unused)))
{
  uint block_size;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("_ma_scan_block_record");

restart_record_read:
  /* Find next row in current page */
  while (likely(record_pos < info->scan.number_of_rows))
  {
    uint length, offset;
    uchar *data, *end_of_data;
    int error;

    /* Ensure that scan.dir and record_pos are in sync */
    DBUG_ASSERT(info->scan.dir == dir_entry_pos(info->scan.page_buff,
                                                share->block_size,
                                                (uint) record_pos));

    /* Search for a valid directory entry (not 0); 0 means a deleted slot */
    while (!(offset= uint2korr(info->scan.dir)))
    {
      info->scan.dir-= DIR_ENTRY_SIZE;
      record_pos++;
#ifdef SANITY_CHECKS
      /* We should never walk past the end of the directory */
      if (info->scan.dir < info->scan.dir_end)
      {
        DBUG_ASSERT(!maria_assert_if_crashed_table);
        goto err;
      }
#endif
    }
    /*
      This should always be true as the directory should always start with
      a valid entry.
    */
    DBUG_ASSERT(info->scan.dir >= info->scan.dir_end);

    /* Found row; build its rowid and remember where to continue next call */
    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
    info->cur_row.nextpos= record_pos + 1;
    data= info->scan.page_buff + offset;
    length= uint2korr(info->scan.dir + 2);
    end_of_data= data + length;
    info->scan.dir-= DIR_ENTRY_SIZE;      /* Point to next row to process */
#ifdef SANITY_CHECKS
    /* Validate that the directory entry points inside the page's data area */
    if (end_of_data > info->scan.dir_end ||
        offset < PAGE_HEADER_SIZE(share) ||
        length < share->base.min_block_length)
    {
      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE(share)));
      DBUG_ASSERT(!(length < share->base.min_block_length));
      goto err;
    }
#endif
    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
    error= _ma_read_block_record2(info, record, data, end_of_data);
    /* Rows not visible to this transaction are silently skipped */
    if (error != HA_ERR_ROW_NOT_VISIBLE)
      DBUG_RETURN(error);
    record_pos++;
  }

  /* Find next head page in current bitmap */
restart_bitmap_scan:
  block_size= share->block_size;
  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
  {
    uchar *data=    info->scan.bitmap_pos;
    longlong bits= info->scan.bits;
    uint bit_pos=  info->scan.bit_pos;

    do
    {
      /* Each page is described by 3 bits in the bitmap word */
      while (likely(bits))
      {
        uint pattern= (uint) (bits & 7);
        bits >>= 3;
        bit_pos++;
        /* Patterns 1-4 mark head pages (with varying amounts of free space) */
        if (pattern > 0 && pattern <= 4)
        {
          /* Found head page; Read it */
          pgcache_page_no_t page;
          info->scan.bitmap_pos= data;
          info->scan.bits= bits;
          info->scan.bit_pos= bit_pos;
          /* 6 bitmap bytes cover 16 pages (16 * 3 bits = 48 bits) */
          page= (info->scan.bitmap_page + 1 +
                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
          info->scan.row_base_page= ma_recordpos(page, 0);
          if (page >= info->scan.max_page)
          {
            DBUG_PRINT("info", ("Found end of file"));
            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
          }
          if (!(pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, info->scan.page_buff,
                               share->page_type,
                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
            DBUG_RETURN(my_errno);
          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
               HEAD_PAGE))
          {
            /*
              This may happen if someone has been deleting all rows
              from a page since we read the bitmap, so it may be ok.
              Print warning in debug log and continue.
            */
            DBUG_PRINT("warning",
                       ("Found page of type %d when expecting head page",
                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
                         PAGE_TYPE_MASK)));
            continue;
          }
          /* A head page with zero rows indicates a corrupted page header */
          if ((info->scan.number_of_rows=
               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
          {
            DBUG_PRINT("error", ("Wrong page header"));
            _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
            DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
          }
          DBUG_PRINT("info", ("Page %lu has %u rows",
                              (ulong) page, info->scan.number_of_rows));
          /* Directory grows downwards from the end of the page */
          info->scan.dir= (info->scan.page_buff + block_size -
                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
          info->scan.dir_end= (info->scan.dir -
                               (info->scan.number_of_rows - 1) *
                               DIR_ENTRY_SIZE);
          record_pos= 0;
          goto restart_record_read;
        }
      }
      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
      {
        bits= uint6korr(data);
        /* Skip not allocated pages and blob / full tail pages */
        if (bits && bits != 07777777777777777LL)
          break;
      }
      bit_pos= 0;
    } while (data < info->scan.bitmap_end);
  }

  /* Read next bitmap */
  info->scan.bitmap_page+= share->bitmap.pages_covered;
  if (unlikely(info->scan.bitmap_page >= info->scan.max_page))
  {
    DBUG_PRINT("info", ("Found end of file"));
    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
  }
  DBUG_PRINT("info", ("Reading bitmap at %lu",
                      (ulong) info->scan.bitmap_page));
  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
                       info->scan.bitmap_page,
                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
    DBUG_RETURN(my_errno);
  /* Skip scanning 'bits' in bitmap scan code */
  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
  info->scan.bits= 0;
  goto restart_bitmap_scan;

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_PRINT("error", ("Wrong data on page"));
  _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
  DBUG_RETURN(HA_ERR_WRONG_IN_RECORD);
}
5582 
5583 
5584 /*
5585   Compare a row against a stored one
5586 
5587   NOTES
5588     Not implemented, as block record is not supposed to be used in a shared
5589     global environment
5590 */
5591 
my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
                                 const uchar *record __attribute__ ((unused)))
{
  /* Intentional no-op: always reports "rows are equal" (see note above) */
  return 0;
}
5597 
5598 
5599 /*
5600   Store an integer with simple packing
5601 
5602   SYNOPSIS
5603     ma_store_integer()
5604     to                  Store the packed integer here
5605     nr                  Integer to store
5606 
5607   NOTES
5608     This is mostly used to store field numbers and lengths of strings.
    We have to cast the result for the LL() because of a bug in Forte CC
    compiler.
5611 
5612     Packing used is:
5613     nr < 251 is stored as is (in 1 byte)
5614     Numbers that require 1-4 bytes are stored as char(250+byte_length), data
5615     Bigger numbers are stored as 255, data as ulonglong (not yet done).
5616 
5617   RETURN
5618     Position in 'to' after the packed length
5619 */
5620 
uchar *ma_store_length(uchar *to, ulong nr)
{
  /* 1-byte form: values below 251 are stored verbatim */
  if (nr < 251)
  {
    to[0]= (uchar) nr;
    return to + 1;
  }
  /* Marker 251 followed by one length byte, for 251..255 */
  if (nr <= 255)
  {
    to[0]= (uchar) 251;
    to[1]= (uchar) nr;
    return to + 2;
  }
  /* Marker 252 followed by 2 bytes, for values up to 65535 */
  if (nr < 65536)
  {
    to[0]= (uchar) 252;
    int2store(to + 1, nr);
    return to + 3;
  }
  /* Marker 253 followed by 3 bytes, for values up to 2^24 - 1 */
  if (nr < 16777216)
  {
    to[0]= (uchar) 253;
    int3store(to + 1, nr);
    return to + 4;
  }
  /* Marker 254 followed by 4 bytes, for everything larger */
  to[0]= (uchar) 254;
  int4store(to + 1, nr);
  return to + 5;
}
5650 
5651 
5652 /* Calculate how many bytes needed to store a number */
5653 
ma_calc_length_for_store_length(ulong nr)5654 uint ma_calc_length_for_store_length(ulong nr)
5655 {
5656   if (nr < 251)
5657     return 1;
5658   if (nr < 65536)
5659   {
5660     if (nr <= 255)
5661       return 2;
5662     return 3;
5663   }
5664   if (nr < 16777216)
5665     return 4;
5666   return 5;
5667 }
5668 
5669 
/* Retrieve a stored number */
5671 
ma_get_length(const uchar ** packet)5672 static ulong ma_get_length(const uchar **packet)
5673 {
5674   reg1 const uchar *pos= *packet;
5675   if (*pos < 251)
5676   {
5677     (*packet)++;
5678     return (ulong) *pos;
5679   }
5680   if (*pos == 251)
5681   {
5682     (*packet)+= 2;
5683     return (ulong) pos[1];
5684   }
5685   if (*pos == 252)
5686   {
5687     (*packet)+= 3;
5688     return (ulong) uint2korr(pos+1);
5689   }
5690   if (*pos == 253)
5691   {
5692     (*packet)+= 4;
5693     return (ulong) uint3korr(pos+1);
5694   }
5695   DBUG_ASSERT(*pos == 254);
5696   (*packet)+= 5;
5697   return (ulong) uint4korr(pos+1);
5698 }
5699 
5700 
5701 /*
5702   Fill array with pointers to field parts to be stored in log for insert
5703 
5704   SYNOPSIS
5705     fill_insert_undo_parts()
5706     info                Maria handler
5707     record              Inserted row
5708     log_parts           Store pointers to changed memory areas here
5709     log_parts_count     See RETURN
5710 
5711   NOTES
5712     We have information in info->cur_row about the read row.
5713 
5714   RETURN
5715     length of data in log_parts.
5716     log_parts_count contains number of used log_parts
5717 */
5718 
fill_insert_undo_parts(MARIA_HA * info,const uchar * record,LEX_CUSTRING * log_parts,uint * log_parts_count)5719 static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
5720                                      LEX_CUSTRING *log_parts,
5721                                      uint *log_parts_count)
5722 {
5723   MARIA_SHARE *share= info->s;
5724   MARIA_COLUMNDEF *column, *end_column;
5725   uchar *field_lengths= info->cur_row.field_lengths;
5726   size_t row_length;
5727   MARIA_ROW *cur_row= &info->cur_row;
5728   LEX_CUSTRING *start_log_parts;
5729   DBUG_ENTER("fill_insert_undo_parts");
5730 
5731   start_log_parts= log_parts;
5732 
5733   /* Store null bits */
5734   log_parts->str=      record;
5735   log_parts->length=   share->base.null_bytes;
5736   row_length=          log_parts->length;
5737   log_parts++;
5738 
5739   /* Stored bitmap over packed (zero length or all-zero fields) */
5740   log_parts->str= info->cur_row.empty_bits;
5741   log_parts->length= share->base.pack_bytes;
5742   row_length+=       log_parts->length;
5743   log_parts++;
5744 
5745   if (share->base.max_field_lengths)
5746   {
5747     /* Store length of all not empty char, varchar and blob fields */
5748     log_parts->str= field_lengths - 2;
5749     log_parts->length=   info->cur_row.field_lengths_length+2;
5750     int2store(log_parts->str, info->cur_row.field_lengths_length);
5751     row_length+= log_parts->length;
5752     log_parts++;
5753   }
5754 
5755   if (share->base.blobs)
5756   {
5757     /*
5758       Store total blob length to make buffer allocation easier during UNDO
5759      */
5760     log_parts->str=  info->length_buff;
5761     log_parts->length= (uint) (ma_store_length(info->length_buff,
5762                                                  info->cur_row.blob_length) -
5763                                  (uchar*) log_parts->str);
5764     row_length+=          log_parts->length;
5765     log_parts++;
5766   }
5767 
5768   /* Handle constant length fields that are always present */
5769   for (column= share->columndef,
5770        end_column= column+ share->base.fixed_not_null_fields;
5771        column < end_column;
5772        column++)
5773   {
5774     log_parts->str= record + column->offset;
5775     log_parts->length= column->length;
5776     row_length+= log_parts->length;
5777     log_parts++;
5778   }
5779 
5780   /* Handle NULL fields and CHAR/VARCHAR fields */
5781   for (end_column= share->columndef + share->base.fields - share->base.blobs;
5782        column < end_column;
5783        column++)
5784   {
5785     const uchar *column_pos;
5786     size_t column_length;
5787     if ((record[column->null_pos] & column->null_bit) ||
5788         cur_row->empty_bits[column->empty_pos] & column->empty_bit)
5789       continue;
5790 
5791     column_pos=    record+ column->offset;
5792     column_length= column->length;
5793 
5794     switch (column->type) {
5795     case FIELD_CHECK:
5796     case FIELD_NORMAL:                          /* Fixed length field */
5797     case FIELD_ZERO:
5798     case FIELD_SKIP_PRESPACE:                   /* Not packed */
5799     case FIELD_SKIP_ZERO:                       /* Fixed length field */
5800       break;
5801     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
5802     {
5803       if (column->length <= 255)
5804         column_length= *field_lengths++;
5805       else
5806       {
5807         column_length= uint2korr(field_lengths);
5808         field_lengths+= 2;
5809       }
5810       break;
5811     }
5812     case FIELD_VARCHAR:
5813     {
5814       if (column->fill_length == 1)
5815         column_length= *field_lengths;
5816       else
5817         column_length= uint2korr(field_lengths);
5818       field_lengths+= column->fill_length;
5819       column_pos+= column->fill_length;
5820       break;
5821     }
5822     default:
5823       DBUG_ASSERT(0);
5824     }
5825     log_parts->str= column_pos;
5826     log_parts->length= column_length;
5827     row_length+= log_parts->length;
5828     log_parts++;
5829   }
5830 
5831   /* Add blobs */
5832   for (end_column+= share->base.blobs; column < end_column; column++)
5833   {
5834     const uchar *field_pos= record + column->offset;
5835     uint size_length= column->length - portable_sizeof_char_ptr;
5836     ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
5837 
5838     /*
5839       We don't have to check for null, as blob_length is guranteed to be 0
5840       if the blob is null
5841     */
5842     if (blob_length)
5843     {
5844       uchar *blob_pos;
5845       memcpy(&blob_pos, record + column->offset + size_length,
5846              sizeof(blob_pos));
5847       log_parts->str= blob_pos;
5848       log_parts->length= blob_length;
5849       row_length+= log_parts->length;
5850       log_parts++;
5851     }
5852   }
5853   *log_parts_count= (uint) (log_parts - start_log_parts);
5854   DBUG_RETURN(row_length);
5855 }
5856 
5857 
5858 /*
5859    Fill array with pointers to field parts to be stored in log for update
5860 
5861   SYNOPSIS
5862     fill_update_undo_parts()
5863     info                Maria handler
5864     oldrec		Original row
5865     newrec              New row
5866     log_parts           Store pointers to changed memory areas here
5867     log_parts_count     See RETURN
5868 
5869   IMPLEMENTATION
5870     Format of undo record:
5871 
5872     Fields are stored in same order as the field array.
5873 
5874     Offset to changed field data (packed)
5875 
5876     For each changed field
5877       Fieldnumber (packed)
5878       Length, if variable length field (packed)
5879 
5880     For each changed field
5881      Data
5882 
   Packing is done using ma_store_length()
5884 
   The reason we store field numbers & lengths separately from the data (ie,
   not after each other) is to get better cpu caching when we loop over
   fields (as we probably don't have to access the data for each field when
   we want to read an old row through the undo log record).
5889 
5890    As a special case, we use '255' for the field number of the null bitmap.
5891 
5892   RETURN
5893     length of data in log_parts.
5894     log_parts_count contains number of used log_parts
5895 */
5896 
fill_update_undo_parts(MARIA_HA * info,const uchar * oldrec,const uchar * newrec,LEX_CUSTRING * log_parts,uint * log_parts_count)5897 static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
5898                                      const uchar *newrec,
5899                                      LEX_CUSTRING *log_parts,
5900                                      uint *log_parts_count)
5901 {
5902   MARIA_SHARE *share= info->s;
5903   MARIA_COLUMNDEF *column, *end_column;
5904   MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
5905   uchar *field_data, *start_field_data, *length_str;
5906   uchar *old_field_lengths= old_row->field_lengths;
5907   uchar *new_field_lengths= new_row->field_lengths;
5908   size_t row_length= 0;
5909   uint field_lengths;
5910   LEX_CUSTRING *start_log_parts;
5911   my_bool new_column_is_empty;
5912   DBUG_ENTER("fill_update_undo_parts");
5913 
5914   start_log_parts= log_parts;
5915 
5916   /*
5917     First log part is for number of fields, field numbers and lengths
5918     The +4 is to reserve place for the number of changed fields.
5919   */
5920   start_field_data= field_data= info->update_field_data + 4;
5921   log_parts++;
5922 
5923   if (memcmp(oldrec, newrec, share->base.null_bytes))
5924   {
5925     /* Store changed null bits */
5926     *field_data++=       (uchar) 255;           /* Special case */
5927     log_parts->str=      oldrec;
5928     log_parts->length=   share->base.null_bytes;
5929     row_length=          log_parts->length;
5930     log_parts++;
5931   }
5932 
5933   /* Handle constant length fields */
5934   for (column= share->columndef,
5935        end_column= column+ share->base.fixed_not_null_fields;
5936        column < end_column;
5937        column++)
5938   {
5939     if (memcmp(oldrec + column->offset, newrec + column->offset,
5940                column->length))
5941     {
5942       field_data= ma_store_length(field_data,
5943                                   (uint) (column - share->columndef));
5944       log_parts->str= oldrec + column->offset;
5945       log_parts->length= column->length;
5946       row_length+=       column->length;
5947       log_parts++;
5948     }
5949   }
5950 
5951   /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
5952   for (end_column= share->columndef + share->base.fields;
5953        column < end_column;
5954        column++)
5955   {
5956     const uchar *new_column_pos, *old_column_pos;
5957     size_t new_column_length, old_column_length;
5958 
5959     /* First check if old column is null or empty */
5960     if (oldrec[column->null_pos] & column->null_bit)
5961     {
5962       /*
5963         It's safe to skip this one as either the new column is also null
5964         (no change) or the new_column is not null, in which case the null-bit
5965         maps differed and we have already stored the null bitmap.
5966       */
5967       continue;
5968     }
5969     if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
5970     {
5971       if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
5972         continue;                               /* Both are empty; skip */
5973 
5974       /* Store null length column */
5975       field_data= ma_store_length(field_data,
5976                                   (uint) (column - share->columndef));
5977       field_data= ma_store_length(field_data, 0);
5978       continue;
5979     }
5980     /*
5981       Remember if the 'new' value is empty (as in this case we must always
5982       log the original value
5983     */
5984     new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
5985                           (new_row->empty_bits[column->empty_pos] &
5986                            column->empty_bit));
5987 
5988     old_column_pos=      oldrec + column->offset;
5989     new_column_pos=      newrec + column->offset;
5990     old_column_length= new_column_length= column->length;
5991 
5992     switch (column->type) {
5993     case FIELD_CHECK:
5994     case FIELD_NORMAL:                          /* Fixed length field */
5995     case FIELD_ZERO:
5996     case FIELD_SKIP_PRESPACE:                   /* Not packed */
5997     case FIELD_SKIP_ZERO:                       /* Fixed length field */
5998       break;
5999     case FIELD_VARCHAR:
6000       new_column_length--;                      /* Skip length prefix */
6001       old_column_pos+= column->fill_length;
6002       new_column_pos+= column->fill_length;
6003       /* Fall through */
6004     case FIELD_SKIP_ENDSPACE:                   /* CHAR */
6005     {
6006       if (new_column_length <= 255)
6007       {
6008         old_column_length= *old_field_lengths++;
6009         if (!new_column_is_empty)
6010           new_column_length= *new_field_lengths++;
6011       }
6012       else
6013       {
6014         old_column_length= uint2korr(old_field_lengths);
6015         old_field_lengths+= 2;
6016         if (!new_column_is_empty)
6017         {
6018           new_column_length= uint2korr(new_field_lengths);
6019           new_field_lengths+= 2;
6020         }
6021       }
6022       break;
6023     }
6024     case FIELD_BLOB:
6025     {
6026       uint size_length= column->length - portable_sizeof_char_ptr;
6027       old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
6028       memcpy((void*) &old_column_pos, oldrec + column->offset + size_length,
6029              sizeof(old_column_pos));
6030       if (!new_column_is_empty)
6031       {
6032         new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
6033         memcpy((void*) &new_column_pos, newrec + column->offset + size_length,
6034                sizeof(old_column_pos));
6035       }
6036       break;
6037     }
6038     default:
6039       DBUG_ASSERT(0);
6040     }
6041 
6042     if (new_column_is_empty || new_column_length != old_column_length ||
6043         memcmp(old_column_pos, new_column_pos, new_column_length))
6044     {
6045       field_data= ma_store_length(field_data,
6046                                   (ulong) (column - share->columndef));
6047       field_data= ma_store_length(field_data, (ulong) old_column_length);
6048 
6049       log_parts->str=     old_column_pos;
6050       log_parts->length=  old_column_length;
6051       row_length+=        old_column_length;
6052       log_parts++;
6053     }
6054   }
6055 
6056   *log_parts_count= (uint) (log_parts - start_log_parts);
6057 
6058   /* Store length of field length data before the field/field_lengths */
6059   field_lengths= (uint) (field_data - start_field_data);
6060   length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
6061   start_log_parts->str= length_str;
6062   ma_store_length(length_str, field_lengths);
6063   start_log_parts->length= (size_t) (field_data - start_log_parts->str);
6064   row_length+= start_log_parts->length;
6065   DBUG_RETURN(row_length);
6066 }
6067 
6068 /***************************************************************************
6069   In-write hooks called under log's lock when log record is written
6070 ***************************************************************************/
6071 
6072 /**
6073    @brief Sets transaction's rec_lsn if needed
6074 
6075    A transaction sometimes writes a REDO even before the page is in the
6076    pagecache (example: brand new head or tail pages; full pages). So, if
6077    Checkpoint happens just after the REDO write, it needs to know that the
6078    REDO phase must start before this REDO. Scanning the pagecache cannot
6079    tell that as the page is not in the cache. So, transaction sets its rec_lsn
6080    to the REDO's LSN or somewhere before, and Checkpoint reads the
6081    transaction's rec_lsn.
6082 
6083    @return Operation status, always 0 (success)
6084 */
6085 
write_hook_for_redo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6086 my_bool write_hook_for_redo(enum translog_record_type type
6087                             __attribute__ ((unused)),
6088                             TRN *trn, MARIA_HA *tbl_info
6089                             __attribute__ ((unused)),
6090                             LSN *lsn, void *hook_arg
6091                             __attribute__ ((unused)))
6092 {
6093   /*
6094     Users of dummy_transaction_object must keep this TRN clean as it
6095     is used by many threads (like those manipulating non-transactional
6096     tables). It might be dangerous if one user sets rec_lsn or some other
6097     member and it is picked up by another user (like putting this rec_lsn into
6098     a page of a non-transactional table); it's safer if all members stay 0. So
6099     non-transactional log records (REPAIR, CREATE, RENAME, DROP) should not
6100     call this hook; we trust them but verify ;)
6101   */
6102   DBUG_ASSERT(trn->trid != 0);
6103   /*
6104     If the hook stays so simple, it would be faster to pass
6105     !trn->rec_lsn ? trn->rec_lsn : some_dummy_lsn
6106     to translog_write_record(), like Monty did in his original code, and not
6107     have a hook. For now we keep it like this.
6108   */
6109   if (trn->rec_lsn == 0)
6110     trn->rec_lsn= *lsn;
6111   return 0;
6112 }
6113 
6114 
6115 /**
6116    @brief Sets transaction's undo_lsn, first_undo_lsn if needed
6117 
6118    @return Operation status, always 0 (success)
6119 */
6120 
write_hook_for_undo(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6121 my_bool write_hook_for_undo(enum translog_record_type type
6122                             __attribute__ ((unused)),
6123                             TRN *trn, MARIA_HA *tbl_info
6124                             __attribute__ ((unused)),
6125                             LSN *lsn, void *hook_arg
6126                             __attribute__ ((unused)))
6127 {
6128   DBUG_ASSERT(trn->trid != 0);
6129   trn->undo_lsn= *lsn;
6130   if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
6131     trn->first_undo_lsn=
6132       trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
6133   return 0;
6134   /*
6135     when we implement purging, we will specialize this hook: UNDO_PURGE
6136     records will additionally set trn->undo_purge_lsn
6137   */
6138 }
6139 
6140 
6141 /**
6142    @brief Sets the table's records count and checksum and others to 0, then
6143    calls the generic REDO hook.
6144 
6145    @return Operation status, always 0 (success)
6146 */
6147 
write_hook_for_redo_delete_all(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6148 my_bool write_hook_for_redo_delete_all(enum translog_record_type type
6149                                        __attribute__ ((unused)),
6150                                        TRN *trn, MARIA_HA *tbl_info
6151                                        __attribute__ ((unused)),
6152                                        LSN *lsn, void *hook_arg)
6153 {
6154   _ma_reset_status(tbl_info);
6155   return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
6156 }
6157 
6158 
6159 /**
6160    @brief Updates "records" and "checksum" and calls the generic UNDO hook
6161 
6162    @return Operation status, always 0 (success)
6163 */
6164 
write_hook_for_undo_row_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6165 my_bool write_hook_for_undo_row_insert(enum translog_record_type type
6166                                        __attribute__ ((unused)),
6167                                        TRN *trn, MARIA_HA *tbl_info,
6168                                        LSN *lsn, void *hook_arg)
6169 {
6170   MARIA_SHARE *share= tbl_info->s;
6171   share->state.state.records++;
6172   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6173   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6174 }
6175 
6176 
6177 /**
   @brief Updates "records" and "checksum" and calls the generic UNDO hook
6179 
6180    @return Operation status, always 0 (success)
6181 */
6182 
write_hook_for_undo_row_delete(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6183 my_bool write_hook_for_undo_row_delete(enum translog_record_type type
6184                                        __attribute__ ((unused)),
6185                                        TRN *trn, MARIA_HA *tbl_info,
6186                                        LSN *lsn, void *hook_arg)
6187 {
6188   MARIA_SHARE *share= tbl_info->s;
6189   share->state.state.records--;
6190   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6191   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6192 }
6193 
6194 
6195 /**
   @brief Updates "checksum" and calls the generic UNDO hook
6197 
6198    @return Operation status, always 0 (success)
6199 */
6200 
write_hook_for_undo_row_update(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6201 my_bool write_hook_for_undo_row_update(enum translog_record_type type
6202                                        __attribute__ ((unused)),
6203                                        TRN *trn, MARIA_HA *tbl_info,
6204                                        LSN *lsn, void *hook_arg)
6205 {
6206   MARIA_SHARE *share= tbl_info->s;
6207   share->state.state.checksum+= *(ha_checksum *)hook_arg;
6208   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6209 }
6210 
6211 
write_hook_for_undo_bulk_insert(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6212 my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
6213                                         __attribute__ ((unused)),
6214                                         TRN *trn, MARIA_HA *tbl_info,
6215                                         LSN *lsn, void *hook_arg)
6216 {
6217   /*
6218     We are going to call maria_delete_all_rows(), but without logging and
6219     syncing, as an optimization (if we crash before commit, the UNDO will
6220     empty; if we crash after commit, we have flushed and forced the files).
6221     Status still needs to be reset under log mutex, in case of a concurrent
6222     checkpoint.
6223   */
6224   _ma_reset_status(tbl_info);
6225   return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
6226 }
6227 
6228 
6229 /**
6230    @brief Updates table's lsn_of_file_id.
6231 
6232    @return Operation status, always 0 (success)
6233 */
6234 
write_hook_for_file_id(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6235 my_bool write_hook_for_file_id(enum translog_record_type type
6236                                __attribute__ ((unused)),
6237                                TRN *trn
6238                                __attribute__ ((unused)),
6239                                MARIA_HA *tbl_info,
6240                                LSN *lsn,
6241                                void *hook_arg
6242                                __attribute__ ((unused)))
6243 {
6244   DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, *lsn) < 0);
6245   tbl_info->s->lsn_of_file_id= *lsn;
6246   return 0;
6247 }
6248 
6249 
6250 /**
6251    Updates transaction's rec_lsn when committing.
6252 
6253    A transaction writes its commit record before being committed in trnman, so
6254    if Checkpoint happens just between the COMMIT record log write and the
6255    commit in trnman, it will record that transaction is not committed. Assume
6256    the transaction (trn1) did an INSERT; after the checkpoint, a second
6257    transaction (trn2) does a DELETE of what trn1 has inserted. Then crash,
6258    Checkpoint record says that trn1 was not committed, and REDO phase starts
6259    from Checkpoint record's LSN. So it will not find the COMMIT record of
6260    trn1, will want to roll back trn1, which will fail because the row/key
6261    which it wants to delete does not exist anymore.
6262    To avoid this, Checkpoint needs to know that the REDO phase must start
6263    before this COMMIT record, so transaction sets its rec_lsn to the COMMIT's
6264    record LSN, and as Checkpoint reads the transaction's rec_lsn, Checkpoint
6265    will know.
6266 
6267    @note so after commit trn->rec_lsn is a "commit LSN", which could be of
6268    use later.
6269 
6270    @return Operation status, always 0 (success)
6271 */
6272 
write_hook_for_commit(enum translog_record_type type,TRN * trn,MARIA_HA * tbl_info,LSN * lsn,void * hook_arg)6273 my_bool write_hook_for_commit(enum translog_record_type type
6274                               __attribute__ ((unused)),
6275                               TRN *trn,
6276                               MARIA_HA *tbl_info
6277                               __attribute__ ((unused)),
6278                               LSN *lsn,
6279                               void *hook_arg
6280                               __attribute__ ((unused)))
6281 {
6282   trn->rec_lsn= *lsn;
6283   return 0;
6284 }
6285 
6286 
6287 /***************************************************************************
6288   Applying of REDO log records
6289 ***************************************************************************/
6290 
6291 /*
6292   Apply changes to head and tail pages
6293 
6294   SYNOPSIS
6295     _ma_apply_redo_insert_row_head_or_tail()
6296     info		Maria handler
6297     lsn			LSN to put on page
6298     page_type		HEAD_PAGE or TAIL_PAGE
6299     new_page		True if this is first entry on page
6300     header		Header (without FILEID)
6301     data		Data to be put on page
6302     data_length		Length of data
6303 
6304   NOTE
6305     Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
6306     LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
6307 
6308   RETURN
6309     0   ok
6310     #   Error number
6311 */
6312 
_ma_apply_redo_insert_row_head_or_tail(MARIA_HA * info,LSN lsn,uint page_type,my_bool new_page,const uchar * header,const uchar * data,size_t data_length)6313 uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
6314                                             uint page_type,
6315                                             my_bool new_page,
6316                                             const uchar *header,
6317                                             const uchar *data,
6318                                             size_t data_length)
6319 {
6320   MARIA_SHARE *share= info->s;
6321   pgcache_page_no_t page;
6322   uint      rownr, empty_space;
6323   uint      block_size= share->block_size;
6324   uint      rec_offset;
6325   uchar      *buff, *dir;
6326   uint      result;
6327   MARIA_PINNED_PAGE page_link;
6328   enum pagecache_page_lock lock_method;
6329   enum pagecache_page_pin pin_method;
6330   my_off_t end_of_page;
6331   uint error;
6332   DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
6333 
6334   page=  page_korr(header);
6335   rownr= dirpos_korr(header + PAGE_STORE_SIZE);
6336 
6337   DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u  data_length: %u",
6338                        (ulong) ma_recordpos(page, rownr),
6339                        (ulong) page, rownr, (uint) data_length));
6340 
6341   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6342                           STATE_NOT_MOVABLE);
6343 
6344   end_of_page= (page + 1) * share->block_size;
6345   if (end_of_page > share->state.state.data_file_length)
6346   {
6347     DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
6348                         (ulong) share->state.state.data_file_length,
6349                         (ulong) end_of_page));
6350     /*
6351       New page at end of file. Note that the test above is also positive if
6352       data_file_length is not a multiple of block_size (system crashed while
6353       writing the last page): in this case we just extend the last page and
6354       fill it entirely with zeroes, then the REDO will put correct data on
6355       it.
6356     */
6357     lock_method= PAGECACHE_LOCK_WRITE;
6358     pin_method=  PAGECACHE_PIN;
6359 
6360     DBUG_ASSERT(rownr == 0 && new_page);
6361     if (rownr != 0 || !new_page)
6362       goto crashed_file;
6363 
6364     buff= info->keyread_buff;
6365     info->keyread_buff_used= 1;
6366     make_empty_page(info, buff, page_type, 1);
6367     empty_space= (block_size - PAGE_OVERHEAD_SIZE(share));
6368     rec_offset= PAGE_HEADER_SIZE(share);
6369     dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
6370   }
6371   else
6372   {
6373     lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
6374     pin_method=  PAGECACHE_PIN_LEFT_PINNED;
6375 
6376     share->pagecache->readwrite_flags&= ~MY_WME;
6377     buff= pagecache_read(share->pagecache, &info->dfile,
6378                          page, 0, 0,
6379                          PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
6380                          &page_link.link);
6381     share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
6382     if (!buff)
6383     {
6384       /* Skip errors when reading outside of file and uninitialized pages */
6385       if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
6386                         my_errno != HA_ERR_WRONG_CRC))
6387       {
6388         DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
6389         goto err;
6390       }
6391       /* Create new page */
6392       buff= pagecache_block_link_to_buffer(page_link.link);
6393       buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
6394     }
6395     else if (lsn_korr(buff) >= lsn)           /* Test if already applied */
6396     {
6397       check_skipped_lsn(info, lsn_korr(buff), 1, page);
6398       /* Fix bitmap, just in case */
6399       empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6400       if (!enough_free_entries_on_page(share, buff))
6401         empty_space= 0;                         /* Page is full */
6402 
6403       if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6404         goto err;
6405       pagecache_unlock_by_link(share->pagecache, page_link.link,
6406                                PAGECACHE_LOCK_WRITE_UNLOCK,
6407                                PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6408                                LSN_IMPOSSIBLE, 0, FALSE);
6409       DBUG_RETURN(0);
6410     }
6411 
6412     if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
6413     {
6414       /*
6415         This is a page that has been freed before and now should be
6416         changed to new type.
6417       */
6418       if (!new_page)
6419       {
6420         DBUG_PRINT("error",
6421                    ("Found page of wrong type: %u, should have been %u",
6422                     (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
6423                     page_type));
6424         goto crashed_file;
6425       }
6426       make_empty_page(info, buff, page_type, 0);
6427       empty_space= block_size - PAGE_HEADER_SIZE(share) - PAGE_SUFFIX_SIZE;
6428       (void) extend_directory(info, buff, block_size, 0, rownr, &empty_space,
6429                               page_type == HEAD_PAGE);
6430       rec_offset= PAGE_HEADER_SIZE(share);
6431       dir= dir_entry_pos(buff, block_size, rownr);
6432       empty_space+= uint2korr(dir+2);
6433     }
6434     else
6435     {
6436       uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
6437       uint length;
6438 
6439       DBUG_ASSERT(!new_page);
6440       dir= dir_entry_pos(buff, block_size, rownr);
6441       empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
6442 
6443       if (max_entry <= rownr)
6444       {
6445         /* Add directory entry first in directory and data last on page */
6446         if (extend_directory(info, buff, block_size, max_entry, rownr,
6447                              &empty_space, page_type == HEAD_PAGE))
6448           goto crashed_file;
6449       }
6450       if (extend_area_on_page(info, buff, dir, rownr,
6451                               (uint) data_length, &empty_space,
6452                               &rec_offset, &length, page_type == HEAD_PAGE))
6453         goto crashed_file;
6454     }
6455   }
6456   /* Copy data */
6457   int2store(dir+2, data_length);
6458   memcpy(buff + rec_offset, data, data_length);
6459   empty_space-= (uint) data_length;
6460   int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
6461 
6462   /* Fix bitmap */
6463   if (!enough_free_entries_on_page(share, buff))
6464     empty_space= 0;                         /* Page is full */
6465   if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
6466     goto err;
6467 
6468   /*
6469     If page was not read before, write it but keep it pinned.
6470     We don't update its LSN When we have processed all REDOs for this page
6471     in the current REDO's group, we will stamp page with UNDO's LSN
6472     (if we stamped it now, a next REDO, in
6473     this group, for this page, would be skipped) and unpin then.
6474   */
6475   result= 0;
6476   if (lock_method == PAGECACHE_LOCK_WRITE &&
6477       pagecache_write(share->pagecache,
6478                       &info->dfile, page, 0,
6479                       buff, PAGECACHE_PLAIN_PAGE,
6480                       lock_method, pin_method,
6481                       PAGECACHE_WRITE_DELAY, &page_link.link,
6482                       LSN_IMPOSSIBLE))
6483     result= my_errno;
6484 
6485   page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
6486   page_link.changed= 1;
6487   push_dynamic(&info->pinned_pages, (void*) &page_link);
6488 
6489   /*
6490     Data page and bitmap page are in place, we can update data_file_length in
6491     case we extended the file. We could not do it earlier: bitmap code tests
6492     data_file_length to know if it has to create a new page or not.
6493   */
6494   set_if_bigger(share->state.state.data_file_length, end_of_page);
6495   DBUG_RETURN(result);
6496 
6497 crashed_file:
6498   _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
6499 err:
6500   error= my_errno;
6501   if (lock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED)
6502     pagecache_unlock_by_link(share->pagecache, page_link.link,
6503                              PAGECACHE_LOCK_WRITE_UNLOCK,
6504                              PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
6505                              LSN_IMPOSSIBLE, 0, FALSE);
6506   _ma_mark_file_crashed(share);
6507   DBUG_ASSERT(!maria_assert_if_crashed_table); /* catch recovery error early */
6508   DBUG_RETURN((my_errno= error));
6509 }
6510 
6511 
6512 /*
6513   Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
6514 
6515   SYNOPSIS
6516     _ma_apply_redo_purge_row_head_or_tail()
6517     info		Maria handler
6518     lsn			LSN to put on page
6519     page_type		HEAD_PAGE or TAIL_PAGE
6520     header		Header (without FILEID)
6521 
6522   NOTES
6523     This function is very similar to delete_head_or_tail()
6524 
6525   RETURN
6526     0   ok
6527     #   Error number
6528 */
6529 
uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
                                           uint page_type,
                                           const uchar *header)
{
  MARIA_SHARE *share= info->s;
  pgcache_page_no_t page;
  uint      rownr, empty_space;
  uchar     *buff;
  int result;
  uint error;
  MARIA_PINNED_PAGE page_link;
  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");

  /* Header layout: page number, then the row's directory position */
  page=  page_korr(header);
  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* Applying a redo marks the table as changed/not zerofilled/not movable */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Read and write-lock the data page holding the row */
  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
                             &page_link.link)))
    goto err;

  if (lsn_korr(buff) >= lsn)
  {
    /*
      Already applied
      Note that in case the page is not anymore a head or tail page
      a future redo will fix the bitmap.
    */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
    {
      /* Page still has the expected type; refresh its bitmap entry */
      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
      if (!enough_free_entries_on_page(share, buff))
        empty_space= 0;                         /* Page is full */
      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
                         empty_space))
        goto err;
    }
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    DBUG_RETURN(0);
  }

  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);

  /* Remove the row's directory entry; a negative result means corruption */
  if (delete_dir_entry(share, buff, rownr, &empty_space) < 0)
  {
    _ma_set_fatal_error(share, HA_ERR_WRONG_IN_RECORD);
    goto err;
  }

  /*
    Keep the modified page pinned; it is unpinned (and stamped with the
    final LSN) once the whole redo group has been processed.
  */
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= 1;
  push_dynamic(&info->pinned_pages, (void*) &page_link);

  result= 0;
  if (!enough_free_entries_on_page(share, buff))
    empty_space= 0;                         /* Page is full */
  /* This will work even if the page was marked as UNALLOCATED_PAGE */
  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
    result= my_errno;

  DBUG_RETURN(result);

err:
  /* Save errno first: the cleanup calls below may overwrite it */
  error= my_errno;
  pagecache_unlock_by_link(share->pagecache, page_link.link,
                           PAGECACHE_LOCK_WRITE_UNLOCK,
                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                           LSN_IMPOSSIBLE, 0, FALSE);
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN((my_errno= error));

}
6614 
6615 
6616 /**
6617    @brief Apply LOGREC_REDO_FREE_BLOCKS
6618 
6619    @param  info            Maria handler
6620    @param  header          Header (without FILEID)
6621 
6622    Mark the pages free in the bitmap.
6623 
6624    We have to check against _ma_redo_not_needed_for_page()
6625    to guard against the case where we first clear a block and after
6626    that insert new data into the blocks.  If we would unconditionally
6627    clear the bitmap here, future changes would be ignored for the page
6628    if it's not in the dirty list (ie, it would be flushed).
6629 
6630    @return Operation status
6631      @retval 0      OK
6632      @retval 1      Error
6633 */
6634 
_ma_apply_redo_free_blocks(MARIA_HA * info,LSN lsn,LSN redo_lsn,const uchar * header)6635 uint _ma_apply_redo_free_blocks(MARIA_HA *info,
6636                                 LSN lsn __attribute__((unused)),
6637                                 LSN redo_lsn,
6638                                 const uchar *header)
6639 {
6640   MARIA_SHARE *share= info->s;
6641   uint ranges;
6642   uint16 sid;
6643   DBUG_ENTER("_ma_apply_redo_free_blocks");
6644 
6645   share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
6646                           STATE_NOT_MOVABLE);
6647 
6648   sid= fileid_korr(header);
6649   header+= FILEID_STORE_SIZE;
6650   ranges= pagerange_korr(header);
6651   header+= PAGERANGE_STORE_SIZE;
6652   DBUG_ASSERT(ranges > 0);
6653 
6654   /** @todo leave bitmap lock to the bitmap code... */
6655   mysql_mutex_lock(&share->bitmap.bitmap_lock);
6656   while (ranges--)
6657   {
6658     my_bool res;
6659     uint page_range;
6660     pgcache_page_no_t page, start_page;
6661 
6662     start_page= page= page_korr(header);
6663     header+= PAGE_STORE_SIZE;
6664     /* Page range may have this bit set to indicate a tail page */
6665     page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
6666     DBUG_ASSERT(page_range > 0);
6667 
6668     header+= PAGERANGE_STORE_SIZE;
6669 
6670     DBUG_PRINT("info", ("page: %lu  pages: %u", (long) page, page_range));
6671 
6672     for ( ; page_range-- ; start_page++)
6673     {
6674       if (_ma_redo_not_needed_for_page(sid, redo_lsn, start_page, FALSE))
6675         continue;
6676       res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, start_page,
6677                                            1);
6678       if (res)
6679       {
6680         mysql_mutex_unlock(&share->bitmap.bitmap_lock);
6681         _ma_mark_file_crashed(share);
6682         DBUG_ASSERT(!maria_assert_if_crashed_table);
6683         DBUG_RETURN(res);
6684       }
6685     }
6686   }
6687   mysql_mutex_unlock(&share->bitmap.bitmap_lock);
6688   DBUG_RETURN(0);
6689 }
6690 
6691 
6692 /**
6693    @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
6694 
6695    @param  info            Maria handler
6696    @param  header          Header (without FILEID)
6697 
6698    @note It marks the page free in the bitmap, and sets the directory's count
6699    to 0.
6700 
6701    @return Operation status
6702      @retval 0      OK
6703      @retval 1      Error
6704 */
6705 
uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
                                      const uchar *header)
{
  MARIA_SHARE *share= info->s;
  uchar *buff;
  pgcache_page_no_t page;
  MARIA_PINNED_PAGE page_link;
  my_bool res;
  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");

  /* Applying a redo marks the table as changed/not zerofilled/not movable */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Header contains only the page number of the page to free */
  page= page_korr(header);

  if (!(buff= pagecache_read(share->pagecache,
                             &info->dfile,
                             page, 0, 0,
                             PAGECACHE_PLAIN_PAGE,
                             PAGECACHE_LOCK_WRITE, &page_link.link)))
  {
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
    goto err;
  }
  if (lsn_korr(buff) >= lsn)
  {
    /* Already applied */
    check_skipped_lsn(info, lsn_korr(buff), 1, page);
    pagecache_unlock_by_link(share->pagecache, page_link.link,
                             PAGECACHE_LOCK_WRITE_UNLOCK,
                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                             LSN_IMPOSSIBLE, 0, FALSE);
  }
  else
  {
    /* Mark the page as free; the bitmap update below does the real work */
    buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
    /* Also wipe the directory so recovered pages compare byte-identical */
    {
      uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
      uchar *dir= dir_entry_pos(buff, share->block_size,
                                number_of_records-1);
      buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
    }
#endif

    /* Keep page pinned until the redo group is done */
    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
    page_link.changed= 1;
    push_dynamic(&info->pinned_pages, (void*) &page_link);
  }
  /** @todo leave bitmap lock to the bitmap code... */
  mysql_mutex_lock(&share->bitmap.bitmap_lock);
  res= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
  mysql_mutex_unlock(&share->bitmap.bitmap_lock);
  if (res)
    goto err;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6772 
6773 
6774 /**
6775    @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
6776 
6777    @param  info            Maria handler
   @param  lsn             LSN to put on pages
6779    @param  header          Header (with FILEID)
6780    @param  redo_lsn        REDO record's LSN
6781    @param[out] number_of_blobs Number of blobs found in log record
6782    @param[out] number_of_ranges Number of ranges found
6783    @param[out] first_page  First page touched
6784    @param[out] last_page   Last page touched
6785 
6786    @note Write full pages (full head & blob pages)
6787 
6788    @return Operation status
6789      @retval 0      OK
6790      @retval !=0    Error
6791 */
6792 
uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
                                     LSN lsn, const uchar *header,
                                     LSN redo_lsn,
                                     uint * const number_of_blobs,
                                     uint * const number_of_ranges,
                                     pgcache_page_no_t * const first_page,
                                     pgcache_page_no_t * const last_page)
{
  MARIA_SHARE *share= info->s;
  const uchar *data;
  uint      data_size= FULL_PAGE_SIZE(share);
  uint      blob_count, ranges;
  uint16    sid;
  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
  DBUG_ENTER("_ma_apply_redo_insert_row_blobs");

  /* Applying a redo marks the table as changed/not zerofilled/not movable */
  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
                          STATE_NOT_MOVABLE);

  /* Header layout: file id, total range count, then blob count */
  sid= fileid_korr(header);
  header+= FILEID_STORE_SIZE;
  *number_of_ranges= ranges= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  *number_of_blobs= blob_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_ASSERT(ranges >= blob_count);

  /* The page payloads follow all the extent and sub-range descriptors */
  data= (header + ranges * ROW_EXTENT_SIZE +
         blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));

  while (blob_count--)
  {
    uint sub_ranges, empty_space;

    /* Per-blob descriptor: its sub-range count and last-page fill gap */
    sub_ranges=  uint2korr(header);
    header+= SUB_RANGE_SIZE;
    empty_space= uint2korr(header);
    header+= BLOCK_FILLER_SIZE;
    DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
    ranges-= sub_ranges;

    while (sub_ranges--)
    {
      uint i;
      uint      res;
      uint      page_range;
      pgcache_page_no_t page;
      uchar     *buff;
      uint	data_on_page= data_size;

      /* Each sub-range is a starting page plus a consecutive page count */
      page= page_korr(header);
      header+= PAGE_STORE_SIZE;
      page_range= pagerange_korr(header);
      header+= PAGERANGE_STORE_SIZE;

      for (i= page_range; i-- > 0 ; page++, data+= data_on_page)
      {
        MARIA_PINNED_PAGE page_link;
        enum pagecache_page_lock unlock_method;
        enum pagecache_page_pin unpin_method;

        /* Track touched page span for the caller */
        set_if_smaller(first_page2, page);
        set_if_bigger(last_page2, page);
        if (i == 0 && sub_ranges == 0)
          data_on_page= data_size - empty_space; /* data on last page */
        if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
          continue;

        if (((page + 1) * share->block_size) >
            share->state.state.data_file_length)
        {
          /* New page or half written page at end of file */
          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
                              (ulong) share->state.state.data_file_length,
                              (ulong) ((page + 1 ) * share->block_size)));
          share->state.state.data_file_length= (page + 1) * share->block_size;
          buff= info->keyread_buff;
          info->keyread_buff_used= 1;
          make_empty_page(info, buff, BLOB_PAGE, 0);
          /* Page never existed in the cache: nothing to unlock/unpin */
          unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
          unpin_method=  PAGECACHE_PIN_LEFT_UNPINNED;
        }
        else
        {
          /* Suppress error popups: a too-short read is handled below */
          share->pagecache->readwrite_flags&= ~MY_WME;
          buff= pagecache_read(share->pagecache,
                               &info->dfile,
                               page, 0, 0,
                               PAGECACHE_PLAIN_PAGE,
                               PAGECACHE_LOCK_WRITE, &page_link.link);
          share->pagecache->readwrite_flags= share->pagecache->
            org_readwrite_flags;
          if (!buff)
          {
            if (my_errno != HA_ERR_FILE_TOO_SHORT &&
                my_errno != HA_ERR_WRONG_CRC)
            {
              /* If not read outside of file */
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              goto err;
            }
            /*
              Physical file was too short, create new page. It can be that
              recovery started with a file with N pages, wrote page N+2 into
              pagecache (increased data_file_length but not physical file
              length), now reads page N+1: the read fails.
            */
            buff= pagecache_block_link_to_buffer(page_link.link);
            make_empty_page(info, buff, BLOB_PAGE, 0);
          }
          else
          {
#ifdef DBUG_ASSERT_EXISTS
            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
#endif
            if (lsn_korr(buff) >= lsn)
            {
              /* Already applied */
              check_skipped_lsn(info, lsn_korr(buff), 1, page);
              pagecache_unlock_by_link(share->pagecache, page_link.link,
                                       PAGECACHE_LOCK_WRITE_UNLOCK,
                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
                                       LSN_IMPOSSIBLE, 0, FALSE);
              /* Still ensure the bitmap marks the page as a full blob page */
              goto fix_bitmap;
            }
            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
                        (found_page_type == (uchar) UNALLOCATED_PAGE));
          }
          unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
          unpin_method=  PAGECACHE_UNPIN;
        }

        /*
          Blob pages are never updated twice in same redo-undo chain, so
          it's safe to update lsn for them here
        */
        lsn_store(buff, lsn);
        buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
        /* Clear the rest of the full-page header after LSN and page type */
        bzero(buff + LSN_SIZE + PAGE_TYPE_SIZE,
              FULL_PAGE_HEADER_SIZE(share) - (LSN_SIZE + PAGE_TYPE_SIZE));

        if (data_on_page != data_size)
        {
          /*
            Last page may be only partly filled. We zero the rest, like
            write_full_pages() does.
          */
          bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
                empty_space);
        }
        memcpy(buff + FULL_PAGE_HEADER_SIZE(share), data, data_on_page);
        if (pagecache_write(share->pagecache,
                            &info->dfile, page, 0,
                            buff, PAGECACHE_PLAIN_PAGE,
                            unlock_method, unpin_method,
                            PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
          goto err;

    fix_bitmap:
      /** @todo leave bitmap lock to the bitmap code... */
        mysql_mutex_lock(&share->bitmap.bitmap_lock);
        res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, page,
                                           1);
        mysql_mutex_unlock(&share->bitmap.bitmap_lock);
        if (res)
          goto err;
      }
    }
  }
  /* Report the touched page span back to the caller */
  *first_page= first_page2;
  *last_page=  last_page2;
  DBUG_RETURN(0);

err:
  _ma_mark_file_crashed(share);
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  DBUG_RETURN(1);
}
6974 
6975 
6976 /****************************************************************************
6977  Applying of UNDO entries
6978 ****************************************************************************/
6979 
6980 /** Execute undo of a row insert (delete the inserted row) */
6981 
my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header)
{
  pgcache_page_no_t page;
  uint rownr;
  uchar *buff;
  my_bool res;
  MARIA_PINNED_PAGE page_link;
  MARIA_SHARE *share= info->s;
  ha_checksum checksum;
  LSN lsn;
  DBUG_ENTER("_ma_apply_undo_row_insert");

  /* Header layout: head page number, then directory position of the row */
  page=  page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  /* Read and write-lock the head page of the row to be deleted */
  buff= pagecache_read(share->pagecache,
                       &info->dfile, page, 0,
                       0, share->page_type,
                       PAGECACHE_LOCK_WRITE,
                       &page_link.link);
  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
  page_link.changed= buff != 0;
  /* Register the page even on read failure so finalize_row unpins it */
  push_dynamic(&info->pinned_pages, (void*) &page_link);
  if (!buff)
    goto err;

  /* Collect the row's tail positions and extents into info->cur_row */
  if (read_row_extent_info(info, buff, rownr))
    goto err;

  /* Keep the bitmap non-flushable while the row parts are being removed */
  _ma_bitmap_flushable(info, 1);
  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
      delete_tails(info, info->cur_row.tail_positions))
    goto err;

  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
    goto err;

  /* The CLR must log the checksum delta of the removed row, if any */
  checksum= 0;
  if (share->calc_checksum)
    checksum= (ha_checksum) 0 - ha_checksum_korr(header);
  info->last_auto_increment= ~ (ulonglong) 0;
  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
                    share->calc_checksum != 0, checksum, &lsn, (void*) 0))
    goto err;

  res= 0;
end:
  /* The following is true only if _ma_bitmap_flushable() was called earlier */
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, lsn);
  DBUG_RETURN(res);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  res= 1;
  _ma_mark_file_crashed(share);
  /*
    Don't write a new LSN on the used pages. Not important as the file is
    marked as crashed and need to be repaired before it can be used.
  */
  lsn= LSN_IMPOSSIBLE;
  goto end;
}
7052 
7053 
7054 /** Execute undo of a row delete (insert the row back where it was) */
7055 
my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header, size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_ROW row;
  MARIA_COLUMNDEF *column, *end_column;
  MARIA_BITMAP_BLOCKS *blocks;
  struct st_row_pos_info row_pos;
  uchar *record;
  const uchar *null_bits, *field_length_data, *extent_info;
  pgcache_page_no_t page;
  ulong *blob_lengths;
  uint *null_field_lengths, extent_count, rownr, length_on_head_page;
  DBUG_ENTER("_ma_apply_undo_row_delete");

  /*
    Use cur row as a base;  We need to make a copy as we will change
    some buffers to point directly to 'header'
  */
  memcpy(&row, &info->cur_row, sizeof(row));

  /*
    Header layout: page number, directory position, length on head page,
    extent count, optional checksum, extent list, then the row image.
  */
  page=  page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;
  length_on_head_page= uint2korr(header);
  header+= 2;
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) ma_recordpos(page, rownr),
                       (ulong) page, rownr));

  if (share->calc_checksum)
  {
    /*
      We extract the checksum delta here, saving a recomputation in
      allocate_and_write_block_record(). It's only an optimization.
    */
    row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  null_field_lengths= row.null_field_lengths;
  blob_lengths= row.blob_lengths;

  /*
    Fill in info->cur_row with information about the row, like in
    calc_record_size(), to be used by write_block_record()
  */

  row.normal_length= row.char_length= row.varchar_length=
    row.blob_length= row.extents_count= row.field_lengths_length= 0;

  /* The row image starts with null bits, empty bits and field lengths */
  null_bits= header;
  header+= share->base.null_bytes;
  /* This will not be changed */
  row.empty_bits= (uchar*) header;
  header+= share->base.pack_bytes;
  if (share->base.max_field_lengths)
  {
    row.field_lengths_length= uint2korr(header);
    row.field_lengths= (uchar*) header + 2 ;
    header+= 2 + row.field_lengths_length;
  }
  if (share->base.blobs)
    row.blob_length= ma_get_length(&header);

  /* We need to build up a record (without blobs) in rec_buff */
  if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
    DBUG_RETURN(1);

  memcpy(record, null_bits, share->base.null_bytes);

  /* Copy field information from header to record */

  /* Handle constant length fields that are always present */
  for (column= share->columndef,
         end_column= column+ share->base.fixed_not_null_fields;
       column < end_column;
       column++)
  {
    memcpy(record + column->offset, header, column->length);
    header+= column->length;
  }

  /* Handle NULL fields and CHAR/VARCHAR fields */
  field_length_data= row.field_lengths;
  for (end_column= share->columndef + share->base.fields;
       column < end_column;
       column++, null_field_lengths++)
  {
    /* NULL or empty fields have no data in the row image */
    if ((record[column->null_pos] & column->null_bit) ||
        row.empty_bits[column->empty_pos] & column->empty_bit)
    {
      if (column->type != FIELD_BLOB)
        *null_field_lengths= 0;
      else
        *blob_lengths++= 0;
      /* Fill with a deterministic pattern so checksums match */
      if (share->calc_checksum)
        bfill(record + column->offset, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      continue;
    }
    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
    case FIELD_SKIP_ZERO:                       /* Fixed length field */
      row.normal_length+= column->length;
      *null_field_lengths= column->length;
      memcpy(record + column->offset, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint length;
      /* Stored length uses 1 or 2 bytes depending on the column width */
      if (column->length <= 255)
        length= (uint) *field_length_data++;
      else
      {
        length= uint2korr(field_length_data);
        field_length_data+= 2;
      }
      row.char_length+= length;
      *null_field_lengths= length;
      memcpy(record + column->offset, header, length);
      /* Restore the stripped trailing spaces for checksum consistency */
      if (share->calc_checksum)
        bfill(record + column->offset + length, (column->length - length),
              ' ');
      header+= length;
      break;
    }
    case FIELD_VARCHAR:
    {
      uint length;
      uchar *field_pos= record + column->offset;

      /* 256 is correct as this includes the length uchar */
      if (column->fill_length == 1)
      {
        field_pos[0]= *field_length_data;
        length= (uint) *field_length_data;
      }
      else
      {
        field_pos[0]= field_length_data[0];
        field_pos[1]= field_length_data[1];
        length= uint2korr(field_length_data);
      }
      field_length_data+= column->fill_length;
      field_pos+= column->fill_length;
      row.varchar_length+= length;
      *null_field_lengths= length;
      memcpy(field_pos, header, length);
      header+= length;
      break;
    }
    case FIELD_BLOB:
    {
      /* Copy length of blob and pointer to blob data to record */
      uchar *field_pos= record + column->offset;
      uint size_length= column->length - portable_sizeof_char_ptr;
      ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);

      memcpy(field_pos, field_length_data, size_length);
      field_length_data+= size_length;
      /* The blob pointer points straight into the log record's data */
      memcpy(field_pos + size_length, &header, sizeof(header));
      header+= blob_length;
      *blob_lengths++= blob_length;
      break;
    }
    default:
      DBUG_ASSERT(0);
    }
  }
  /* Recompute the head-page length from the collected per-part lengths */
  row.head_length= (info->row_base_length +
                    share->base.fixed_not_null_fields_length +
                    row.field_lengths_length +
                    size_to_store_key_length(row.field_lengths_length) +
                    row.normal_length +
                    row.char_length + row.varchar_length);
  row.total_length= (row.head_length + row.blob_length);
  if (row.total_length < share->base.min_block_length)
    row.total_length= share->base.min_block_length;

  /*
    Row is now generated. Now we need to insert record on the original
    pages with original size on each page.
  */

  _ma_bitmap_flushable(info, 1);
  /* Change extent information to be usable by write_block_record() */
  blocks= &row.insert_blocks;
  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
    goto err;
  blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
                                                            &share->bitmap,
                                                            page);
  blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;

  /* Read head page and allocate data for rowid */
  if (get_rowpos_in_head_or_tail_page(info, blocks->block,
                                      info->buff,
                                      length_on_head_page,
                                      HEAD_PAGE, PAGECACHE_LOCK_WRITE,
                                      rownr, &row_pos))
    goto err;

  if (share->calc_checksum)
  {
    DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
  }
  /* Store same amount of data on head page as on original page */
  row_pos.length= (length_on_head_page -
                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
  set_if_bigger(row_pos.length, share->base.min_block_length);
  if (write_block_record(info, (uchar*) 0, record, &row,
                         blocks, blocks->block->org_bitmap_value != 0,
                         &row_pos, undo_lsn, 0))
    goto err;

  my_free(record);
  DBUG_RETURN(0);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  _ma_mark_file_crashed(share);
  if (info->non_flushable_state)
    _ma_bitmap_flushable(info, -1);
  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
  my_free(record);
  DBUG_RETURN(1);
}
7294 
7295 
7296 /**
7297   Execute undo of a row update
7298 
7299   @fn _ma_apply_undo_row_update()
7300 
7301   @return Operation status
7302     @retval 0      OK
7303     @retval 1      Error
7304 */
7305 
my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
                                  const uchar *header,
                                  size_t header_length
                                  __attribute__((unused)))
{
  MARIA_SHARE *share= info->s;
  MARIA_RECORD_POS record_pos;
  /* Cursors into the serialized UNDO record pointed to by 'header' */
  const uchar *field_length_data, *field_length_data_end, *extent_info;
  uchar *current_record, *orig_record;
  pgcache_page_no_t page;
  ha_checksum UNINIT_VAR(checksum_delta);
  uint rownr, field_length_header, extent_count, length_on_head_page;
  int error;
  DBUG_ENTER("_ma_apply_undo_row_update");

  /*
    Parse the fixed part of the UNDO header:
    page number + directory position identify the row to restore.
  */
  page=  page_korr(header);
  header+= PAGE_STORE_SIZE;
  rownr= dirpos_korr(header);
  header+= DIRPOS_STORE_SIZE;

  record_pos= ma_recordpos(page, rownr);
  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
                       (ulong) record_pos, (ulong) page, rownr));

  /* Checksum delta is only logged when the table maintains checksums */
  if (share->calc_checksum)
  {
    checksum_delta= ha_checksum_korr(header);
    header+= HA_CHECKSUM_STORE_SIZE;
  }
  /* How much of the original row was stored on the head page */
  length_on_head_page= uint2korr(header);
  set_if_bigger(length_on_head_page, share->base.min_block_length);
  header+= 2;
  /* Extent list describing where the rest of the original row lived */
  extent_count= pagerange_korr(header);
  header+= PAGERANGE_STORE_SIZE;
  extent_info= header;
  header+= extent_count * ROW_EXTENT_SIZE;

  /*
    Set header to point to old field values, generated by
    fill_update_undo_parts()
  */
  field_length_header= ma_get_length(&header);
  field_length_data= (uchar*) header;
  header+= field_length_header;
  field_length_data_end= header;
  /* 'header' now points at the stored old field data itself */

  /* Allocate buffer for current row & original row */
  if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
    DBUG_RETURN(1);
  orig_record= current_record+ share->base.reclength;

  /* Read current record */
  if (_ma_read_block_record(info, current_record, record_pos))
    goto err;

  /* 255 is a marker meaning the null bitmap changed and was logged */
  if (*field_length_data == 255)
  {
    /* Bitmap changed */
    field_length_data++;
    memcpy(orig_record, header, share->base.null_bytes);
    header+= share->base.null_bytes;
  }
  else
    memcpy(orig_record, current_record, share->base.null_bytes);
  bitmap_clear_all(&info->changed_fields);

  /*
    Walk the per-field (number, length) entries and rebuild each changed
    field's old value into orig_record.  Fields not listed here are later
    copied unchanged from current_record.
  */
  while (field_length_data < field_length_data_end)
  {
    uint field_nr= ma_get_length(&field_length_data), field_length;
    MARIA_COLUMNDEF *column= share->columndef + field_nr;
    uchar *orig_field_pos= orig_record + column->offset;

    bitmap_set_bit(&info->changed_fields, field_nr);
    if (field_nr >= share->base.fixed_not_null_fields)
    {
      /* Variable-size field: its stored length follows the field number */
      if (!(field_length= ma_get_length(&field_length_data)))
      {
        /* Null field or empty field */
        bfill(orig_field_pos, column->fill_length,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
        continue;
      }
    }
    else
      field_length= column->length;       /* Fixed-size: no length stored */

    switch (column->type) {
    case FIELD_CHECK:
    case FIELD_NORMAL:                          /* Fixed length field */
    case FIELD_ZERO:
    case FIELD_SKIP_PRESPACE:                   /* Not packed */
      memcpy(orig_field_pos, header, column->length);
      header+= column->length;
      break;
    case FIELD_SKIP_ZERO:                       /* Number */
    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
    {
      uint diff;
      /* Copy stored prefix, then pad the truncated tail */
      memcpy(orig_field_pos, header, field_length);
      if ((diff= (column->length - field_length)))
        bfill(orig_field_pos + column->length - diff, diff,
              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
      header+= field_length;
    }
    break;
    case FIELD_VARCHAR:
      if (column->length <= 256)
      {
        /* Total field length <= 256 means a 1-byte length prefix */
        *orig_field_pos++= (uchar) field_length;
      }
      else
      {
        int2store(orig_field_pos, field_length);
        orig_field_pos+= 2;
      }
      memcpy(orig_field_pos, header, field_length);
      header+= field_length;
      break;
    case FIELD_BLOB:
    {
      uint size_length= column->length - portable_sizeof_char_ptr;
      _ma_store_blob_length(orig_field_pos, size_length, field_length);
      /*
        Store a pointer to the blob data inside the log record buffer;
        the record is used before 'header' goes out of scope.
      */
      memcpy(orig_field_pos + size_length, &header, sizeof(header));
      header+= field_length;
      break;
    }
    default:
      DBUG_ASSERT(0);                           /* Unknown column type */
    }
  }
  /* Fields not present in the undo entry keep their current values */
  copy_not_changed_fields(info, &info->changed_fields,
                          orig_record, current_record);

  if (share->calc_checksum)
  {
    info->new_row.checksum= checksum_delta +
      (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
    /* verify that record's content is sane */
    DBUG_ASSERT(info->new_row.checksum ==
                (*share->calc_checksum)(info, current_record));
  }

  info->last_auto_increment= ~ (ulonglong) 0;
  /* Now records are up to date, execute the update to original values */
  if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
                                   extent_count, extent_info,
                                   current_record, orig_record, undo_lsn))
    goto err;

  error= 0;
end:
  my_free(current_record);                      /* Also frees orig_record */
  DBUG_RETURN(error);

err:
  DBUG_ASSERT(!maria_assert_if_crashed_table);
  error= 1;
  _ma_mark_file_crashed(share);
  goto end;
}
7466 
7467 
7468 /**
7469   Execute undo of a bulk insert which used repair
7470 
7471   @return Operation status
7472     @retval 0      OK
7473     @retval 1      Error
7474 */
7475 
_ma_apply_undo_bulk_insert(MARIA_HA * info,LSN undo_lsn)7476 my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
7477 {
7478   my_bool error;
7479   LSN lsn;
7480   DBUG_ENTER("_ma_apply_undo_bulk_insert");
7481   /*
7482     We delete all rows, re-enable indices as bulk insert had disabled
7483     non-unique ones.
7484   */
7485   error= (maria_delete_all_rows(info) ||
7486           maria_enable_indexes(info) ||
7487           /* we enabled indices so need '2' below */
7488           _ma_state_info_write(info->s,
7489                                MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
7490                                MA_STATE_INFO_WRITE_FULL_INFO |
7491                                MA_STATE_INFO_WRITE_LOCK) ||
7492           _ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
7493                         FALSE, 0, &lsn, NULL));
7494   DBUG_RETURN(error);
7495 }
7496 
7497 
7498 /**
7499   @brief Get the TRANSLOG_ADDRESS to flush up to
7500 
7501   @param page            Page's content
7502   @param page_no         Page's number (<offset>/<page length>)
7503   @param data_ptr        Callback data pointer (pointer to MARIA_SHARE)
7504 
7505   @note
7506   Usable for data (non-bitmap) and index pages
7507 
7508   @retval LSN to flush up to
7509 */
7510 
7511 TRANSLOG_ADDRESS
maria_page_get_lsn(uchar * page,pgcache_page_no_t page_no,uchar * data_ptr)7512 maria_page_get_lsn(uchar *page,
7513                    pgcache_page_no_t page_no __attribute__((unused)),
7514                    uchar* data_ptr __attribute__((unused)))
7515 {
7516 #ifndef DBUG_OFF
7517   const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
7518   DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
7519               share->now_transactional);
7520 #endif
7521   return lsn_korr(page);
7522 }
7523 
7524 
7525 /**
7526   @brief Enable reading of all rows, ignoring versioning
7527 
7528   @note
7529     This is mainly useful in single user applications, like maria_pack,
7530     where we want to be able to read all rows without having to read the
7531     transaction id from the control file
7532 */
7533 
void maria_ignore_trids(MARIA_HA *info)
{
  /* Nothing to do for non-transactional tables */
  if (!info->s->base.born_transactional)
    return;

  /* Make sure a transaction object exists to hang the setting on */
  if (info->trn == NULL)
    _ma_set_tmp_trn_for_table(info, &dummy_transaction_object);

  /*
    Setting min_read_from to the maximum TrID makes every row version
    visible, so reads ignore transaction ids entirely.
  */
  info->trn->min_read_from= ~(TrID) 0;
}
7544 
7545 
7546 #ifndef DBUG_OFF
7547 
7548 /* The following functions are useful to call from debugger */
7549 
void _ma_print_block_info(MARIA_SHARE *share, uchar *buff)
{
  /* Debug helper: dump a block page's header and directory to stdout */
  LSN lsn= lsn_korr(buff);
  uint page_type= (uint) buff[PAGE_TYPE_OFFSET];
  uint dir_entries= (uint) buff[DIR_COUNT_OFFSET];
  uint dir_free= (uint) buff[DIR_FREE_OFFSET];
  uint empty_space= (uint) uint2korr(buff + EMPTY_SPACE_OFFSET);
  ulong dir_start= (maria_block_size - PAGE_SUFFIX_SIZE -
                    dir_entries * DIR_ENTRY_SIZE);

  printf("LSN: " LSN_FMT "  type: %u  dir_entries: %u  dir_free: %u  empty_space: %u\n",
         LSN_IN_PARTS(lsn), page_type, dir_entries, dir_free, empty_space);
  printf("Start of directory: %lu\n", dir_start);
  _ma_print_directory(share, stdout, buff, maria_block_size);
}
7565 #endif
7566