1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15
16 /* Pack MARIA file */
17
18 #ifndef USE_MY_FUNC
19 #define USE_MY_FUNC /* We need at least my_malloc */
20 #endif
21
22 #include "maria_def.h"
23 #include "trnman_public.h"
24 #include "trnman.h"
25 #include <queues.h>
26 #include <my_tree.h>
27 #include "mysys_err.h"
28 #ifdef MSDOS
29 #include <io.h>
30 #endif
31 #ifndef __GNU_LIBRARY__
32 #define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
33 #endif
34 #include <my_getopt.h>
35 #include <my_handler_errors.h>
36
37 #if SIZEOF_LONG_LONG > 4
38 #define BITS_SAVED 64
39 #else
40 #define BITS_SAVED 32
41 #endif
42 #ifndef MAX_INTERNAL_TRID
43 #define MAX_INTERNAL_TRID 0xffffffffffffLL
44 #endif
45
46 #define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
47 #define HEAD_LENGTH 32
48 #define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
49
50 #define DATA_TMP_EXT ".TMD"
51 #define OLD_EXT ".OLD"
52 #define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
53
54 struct st_file_buffer {
55 File file;
56 uchar *buffer,*pos,*end;
57 my_off_t pos_in_file;
58 int bits;
59 ulonglong bitbucket;
60 };
61
62 struct st_huff_tree;
63 struct st_huff_element;
64
65 typedef struct st_huff_counts {
66 uint field_length,max_zero_fill;
67 uint pack_type;
68 uint max_end_space,max_pre_space,length_bits,min_space;
69 ulong max_length;
70 enum en_fieldtype field_type;
71 struct st_huff_tree *tree; /* Tree for field */
72 my_off_t counts[256];
73 my_off_t end_space[8];
74 my_off_t pre_space[8];
75 my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
76 TREE int_tree; /* Tree for detecting distinct column values. */
77 uchar *tree_buff; /* Column values, 'field_length' each. */
78 uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
79 } HUFF_COUNTS;
80
81 typedef struct st_huff_element HUFF_ELEMENT;
82
83 /*
84 WARNING: It is crucial for the optimizations in calc_packed_length()
85 that 'count' is the first element of 'HUFF_ELEMENT'.
86 */
87 struct st_huff_element {
88 my_off_t count;
89 union un_element {
90 struct st_nod {
91 HUFF_ELEMENT *left,*right;
92 } nod;
93 struct st_leaf {
94 HUFF_ELEMENT *null;
95 uint element_nr; /* Number of element */
96 } leaf;
97 } a;
98 };
99
100
101 typedef struct st_huff_tree {
102 HUFF_ELEMENT *root,*element_buffer;
103 HUFF_COUNTS *counts;
104 uint tree_number;
105 uint elements;
106 my_off_t bytes_packed;
107 uint tree_pack_length;
108 uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
109 ulonglong *code;
110 uchar *code_len;
111 } HUFF_TREE;
112
113
114 typedef struct st_isam_mrg {
115 MARIA_HA **file,**current,**end;
116 uint free_file;
117 uint count;
118 uint min_pack_length; /* Theese is used by packed data */
119 uint max_pack_length;
120 uint ref_length;
121 uint max_blob_length;
122 my_off_t records;
123 /* true if at least one source file has at least one disabled index */
124 my_bool src_file_has_indexes_disabled;
125 } PACK_MRG_INFO;
126
127
128 extern int main(int argc,char * *argv);
129 static void get_options(int *argc,char ***argv);
130 static MARIA_HA *open_maria_file(char *name,int mode);
131 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
132 static int compress(PACK_MRG_INFO *file,char *join_name);
133 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
134 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
135 uint trees,
136 HUFF_COUNTS *huff_counts,
137 uint fields);
138 static int compare_tree(void* cmp_arg __attribute__((unused)),
139 const uchar *s,const uchar *t);
140 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
141 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
142 my_off_t records);
143 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
144 uint max_space_length,my_off_t *space_counts,
145 my_off_t tot_space_count,
146 enum en_fieldtype field_type);
147 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
148 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
149 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
150 static int save_counts_in_queue(uchar *key,element_count count,
151 HUFF_TREE *tree);
152 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
153 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
154 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
155 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
156 HUFF_ELEMENT *element,uint size,
157 ulonglong code);
158 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
159 my_off_t tot_elements,my_off_t filelength);
160 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
161 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
162 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
163 HUFF_ELEMENT *element,
164 uint *offset);
165 static uint max_bit(uint value);
166 static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
167 static char *make_new_name(char *new_name,char *old_name);
168 static char *make_old_name(char *new_name,char *old_name);
169 static void init_file_buffer(File file,pbool read_buffer);
170 static int flush_buffer(ulong neaded_length);
171 static void end_file_buffer(void);
172 static void write_bits(ulonglong value, uint bits);
173 static void flush_bits(void);
174 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
175 my_off_t new_length, ha_checksum crc);
176 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
177 my_off_t new_length, ha_checksum crc);
178 static int mrg_close(PACK_MRG_INFO *mrg);
179 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
180 static void mrg_reset(PACK_MRG_INFO *mrg);
181 #if !defined(DBUG_OFF)
182 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
183 static int fakecmp(my_off_t **count1, my_off_t **count2);
184 #endif
185
186 /*
187 tree_buff_length is somewhat arbitrary. The bigger it is the better
188 the chance to win in terms of compression factor. On the other hand,
189 this table becomes part of the compressed file header. And its length
190 is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191 */
192 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193
194 static int error_on_write=0,test_only=0,verbose=0,silent=0,
195 write_loop=0,force_pack=0, isamchk_neaded=0;
196 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
197 static my_bool backup, opt_wait;
198 static my_bool opt_ignore_control_file, opt_require_control_file;
199 static char tmp_dir[FN_REFLEN]={0},*join_table;
200 static my_off_t intervall_length;
201 static ha_checksum glob_crc;
202 static struct st_file_buffer file_buffer;
203 static QUEUE queue;
204 static HUFF_COUNTS *global_count;
205 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
206 static const char *load_default_groups[]= { "ariapack",0 };
207
208 /*
209 Register handler error messages for usage with my_error()
210
211 NOTES
212 This is safe to call multiple times as my_error_register()
213 will ignore calls to register already registered error numbers.
214 */
215
get_handler_error_messages(int e)216 static const char **get_handler_error_messages(int e __attribute__((unused)))
217 {
218 return handler_error_messages;
219 }
220
221
222 /* The main program */
223
main(int argc,char ** argv)224 int main(int argc, char **argv)
225 {
226 int error,ok;
227 PACK_MRG_INFO merge;
228 char **default_argv;
229 my_bool no_control_file= 0;
230 MY_INIT(argv[0]);
231
232 maria_data_root= (char *)".";
233 load_defaults_or_exit("my", load_default_groups, &argc, &argv);
234 default_argv= argv;
235 get_options(&argc,&argv);
236 my_error_register(get_handler_error_messages, HA_ERR_FIRST,
237 HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
238
239 if (!opt_ignore_control_file &&
240 (no_control_file= ma_control_file_open(FALSE,
241 (opt_require_control_file ||
242 !silent))) &&
243 opt_require_control_file)
244 {
245 error= 1;
246 goto end;
247 }
248 maria_init();
249 if (no_control_file || force_pack)
250 {
251 /* Assume that all rows exists */
252 trnman_init(MAX_INTERNAL_TRID-16);
253 }
254
255 error=ok=isamchk_neaded=0;
256 if (join_table)
257 { /* Join files into one */
258 if (open_maria_files(&merge,argv,(uint) argc) ||
259 compress(&merge,join_table))
260 error=1;
261 }
262 else while (argc--)
263 {
264 MARIA_HA *isam_file;
265 if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
266 error=1;
267 else
268 {
269 merge.file= &isam_file;
270 merge.current=0;
271 merge.free_file=0;
272 merge.count=1;
273 if (compress(&merge,0))
274 error=1;
275 else
276 ok=1;
277 }
278 }
279 if (ok && isamchk_neaded && !silent)
280 puts("Remember to run aria_chk -rq on compressed tables");
281
282 end:
283 fflush(stdout);
284 fflush(stderr);
285 free_defaults(default_argv);
286 my_error_unregister(HA_ERR_FIRST,
287 HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
288 maria_end();
289 my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
290 exit(error ? 2 : 0);
291 #ifndef _lint
292 return 0; /* No compiler warning */
293 #endif
294 }
295
296 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
297
298 static struct my_option my_long_options[] =
299 {
300 #ifdef __NETWARE__
301 {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
302 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
303 #endif
304 {"backup", 'b', "Make a backup of the table as table_name.OLD.",
305 &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
306 {"character-sets-dir", OPT_CHARSETS_DIR_MP,
307 "Directory where character sets are.", (char**) &charsets_dir,
308 (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
309 {"datadir", 'h',
310 "Path for control file (and logs if --logdir not used).",
311 &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
312 0, 0, 0, 0, 0, 0},
313 {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
314 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
315 {"force", 'f',
316 "Force packing of table even if it gets bigger or if tempfile exists.",
317 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
318 { "ignore-control-file", 0,
319 "Ignore the control file",
320 (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG,
321 0, 0, 0, 0, 0, 0},
322 {"join", 'j',
323 "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
324 &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
325 0, 0, 0},
326 {"help", '?', "Display this help and exit.",
327 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
328 { "require-control-file", 0,
329 "Abort if cannot find control file",
330 (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
331 0, 0, 0, 0, 0, 0},
332 {"silent", 's', "Be more silent.",
333 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
334 {"tmpdir", 'T', "Use temporary directory to store temporary table.",
335 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
336 {"test", 't', "Don't pack table, only test packing it.",
337 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
338 {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
339 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
340 {"version", 'V', "Output version information and exit.",
341 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
342 {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
343 &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
344 { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
345 };
346
347
print_version(void)348 static void print_version(void)
349 {
350 printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
351 }
352
353
usage(void)354 static void usage(void)
355 {
356 print_version();
357 puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
358 puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
359 puts("and you are welcome to modify and redistribute it under the GPL license\n");
360
361 puts("Pack a Aria-table to take much less space.");
362 puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
363 puts("afterwards to update the keys.");
364 puts("You should give the .MAI file as the filename argument.");
365 puts("To unpack a packed table, run aria_chk -u on the table");
366
367 printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
368 my_print_help(my_long_options);
369 print_defaults("my", load_default_groups);
370 my_print_variables(my_long_options);
371 }
372
373
374 static my_bool
get_one_option(int optid,const struct my_option * opt,char * argument)375 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
376 char *argument)
377 {
378 uint length;
379
380 switch(optid) {
381 #ifdef __NETWARE__
382 case OPT_AUTO_CLOSE:
383 setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
384 break;
385 #endif
386 case 'f':
387 force_pack= 1;
388 tmpfile_createflag= O_RDWR | O_TRUNC;
389 break;
390 case 's':
391 write_loop= verbose= 0;
392 silent= 1;
393 break;
394 case 't':
395 test_only= 1;
396 /* Avoid to reset 'verbose' if it was already set > 1. */
397 if (! verbose)
398 verbose= 1;
399 break;
400 case 'T':
401 length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
402 if (length != dirname_length(tmp_dir))
403 {
404 tmp_dir[length]=FN_LIBCHAR;
405 tmp_dir[length+1]=0;
406 }
407 break;
408 case 'v':
409 verbose++; /* Allow for selecting the level of verbosity. */
410 silent= 0;
411 break;
412 case '#':
413 DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
414 break;
415 case 'V':
416 print_version();
417 exit(0);
418 case 'I':
419 case '?':
420 usage();
421 exit(0);
422 }
423 return 0;
424 }
425
426 /* reads options */
427 /* Initiates DEBUG - but no debugging here ! */
428
get_options(int * argc,char *** argv)429 static void get_options(int *argc,char ***argv)
430 {
431 int ho_error;
432
433 my_progname= argv[0][0];
434 if (isatty(fileno(stdout)))
435 write_loop=1;
436
437 if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
438 exit(ho_error);
439
440 if (!*argc)
441 {
442 usage();
443 exit(1);
444 }
445 if (join_table)
446 {
447 backup=0; /* Not needed */
448 tmp_dir[0]=0;
449 }
450 return;
451 }
452
453
print_error(int error,const char * filename)454 static void print_error(int error, const char *filename)
455 {
456 switch (error) {
457 case HA_ERR_CRASHED:
458 fprintf(stderr, "'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
459 break;
460 case HA_ERR_NOT_A_TABLE:
461 fprintf(stderr, "'%s' is not a Aria table",filename);
462 break;
463 case HA_ERR_CRASHED_ON_USAGE:
464 fprintf(stderr, "'%s' is marked as crashed",filename);
465 break;
466 case HA_ERR_CRASHED_ON_REPAIR:
467 fprintf(stderr, "'%s' is marked as crashed after last repair",filename);
468 break;
469 case HA_ERR_OLD_FILE:
470 fprintf(stderr, "'%s' has transactions newer than registered in control file. If this is ok, please re-run with --ignore-control-file", filename);
471 break;
472 case HA_ERR_NEW_FILE:
473 fprintf(stderr, "'%s' uses new features not supported by this version of the Aria library", filename);
474 break;
475 case HA_ERR_END_OF_FILE:
476 fprintf(stderr, "Couldn't read complete header from '%s'", filename);
477 break;
478 case EAGAIN:
479 fprintf(stderr, "'%s' is locked. Use -w to wait until unlocked",filename);
480 break;
481 case ENOENT:
482 fprintf(stderr, "File '%s' doesn't exist",filename);
483 break;
484 case EACCES:
485 fprintf(stderr, "You don't have permission to use '%s'", filename);
486 break;
487 default:
488 fprintf(stderr, "%d when opening Aria table '%s'", error, filename);
489 break;
490 }
491 fputc('\n',stderr);
492 }
493
494
open_maria_file(char * name,int mode)495 static MARIA_HA *open_maria_file(char *name,int mode)
496 {
497 MARIA_HA *isam_file;
498 MARIA_SHARE *share;
499 DBUG_ENTER("open_maria_file");
500
501 if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
502 (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
503 HA_OPEN_ABORT_IF_LOCKED))))
504 {
505 print_error(my_errno, name);
506 DBUG_RETURN(0);
507 }
508 share=isam_file->s;
509 if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
510 {
511 if (!force_pack)
512 {
513 fprintf(stderr, "%s is already compressed\n", name);
514 maria_close(isam_file);
515 DBUG_RETURN(0);
516 }
517 if (verbose)
518 puts("Recompressing already compressed table");
519 share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
520 }
521 if (! force_pack && share->state.state.records != 0 &&
522 (share->state.state.records <= 1 ||
523 share->state.state.data_file_length < 1024))
524 {
525 fprintf(stderr, "%s is too small to compress\n", name);
526 maria_close(isam_file);
527 DBUG_RETURN(0);
528 }
529 maria_lock_database(isam_file,F_WRLCK);
530 maria_ignore_trids(isam_file);
531 DBUG_RETURN(isam_file);
532 }
533
534
open_maria_files(PACK_MRG_INFO * mrg,char ** names,uint count)535 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
536 {
537 uint i,j;
538 mrg->count=0;
539 mrg->current=0;
540 mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
541 mrg->free_file=1;
542 mrg->src_file_has_indexes_disabled= 0;
543 for (i=0; i < count ; i++)
544 {
545 if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
546 goto error;
547
548 mrg->src_file_has_indexes_disabled|=
549 ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
550 mrg->file[i]->s->base.keys);
551 }
552 /* Check that files are identical */
553 for (j=0 ; j < count-1 ; j++)
554 {
555 MARIA_COLUMNDEF *m1,*m2,*end;
556 if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
557 mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
558 goto diff_file;
559 m1=mrg->file[j]->s->columndef;
560 end=m1+mrg->file[j]->s->base.fields;
561 m2=mrg->file[j+1]->s->columndef;
562 for ( ; m1 != end ; m1++,m2++)
563 {
564 if (m1->type != m2->type || m1->length != m2->length)
565 goto diff_file;
566 }
567 }
568 mrg->count=count;
569 return 0;
570
571 diff_file:
572 fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
573 my_progname, names[j], names[j+1]);
574 error:
575 while (i--)
576 maria_close(mrg->file[i]);
577 my_free(mrg->file);
578 return 1;
579 }
580
581
compress(PACK_MRG_INFO * mrg,char * result_table)582 static int compress(PACK_MRG_INFO *mrg,char *result_table)
583 {
584 int error;
585 File new_file,join_maria_file;
586 MARIA_HA *isam_file;
587 MARIA_SHARE *share;
588 char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
589 uint i,header_length,fields,trees,used_trees;
590 my_off_t old_length,new_length,tot_elements;
591 HUFF_COUNTS *huff_counts;
592 HUFF_TREE *huff_trees;
593 DBUG_ENTER("compress");
594
595 isam_file=mrg->file[0]; /* Take this as an example */
596 share=isam_file->s;
597 new_file=join_maria_file= -1;
598 trees=fields=0;
599 huff_trees=0;
600 huff_counts=0;
601 maria_block_size= isam_file->s->block_size;
602
603 /* Create temporary or join file */
604 if (backup)
605 fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
606 else
607 fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);
608
609 if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
610 maria_block_size, 0, MY_WME) == 0)
611 {
612 fprintf(stderr, "Can't initialize page cache\n");
613 goto err;
614 }
615
616 if (!test_only && result_table)
617 {
618 /* Make a new indexfile based on first file in list */
619 uint length;
620 uchar *buff;
621 strmov(org_name,result_table); /* Fix error messages */
622 fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
623 if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
624 < 0)
625 goto err;
626 length=(uint) share->base.keystart;
627 if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
628 goto err;
629 if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
630 my_write(join_maria_file,buff,length,
631 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
632 {
633 my_free(buff);
634 goto err;
635 }
636 my_free(buff);
637 fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
638 }
639 else if (!tmp_dir[0])
640 make_new_name(new_name,org_name);
641 else
642 fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
643 if (!test_only &&
644 (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
645 goto err;
646
647 /* Start calculating statistics */
648
649 mrg->records=0;
650 for (i=0 ; i < mrg->count ; i++)
651 mrg->records+=mrg->file[i]->s->state.state.records;
652
653 DBUG_PRINT("info", ("Compressing %s: (%lu records)",
654 result_table ? new_name : org_name,
655 (ulong) mrg->records));
656 if (write_loop || verbose)
657 {
658 printf("Compressing %s: (%lu records)\n",
659 result_table ? new_name : org_name, (ulong) mrg->records);
660 }
661 trees=fields=share->base.fields;
662 huff_counts=init_huff_count(isam_file,mrg->records);
663
664 /*
665 Read the whole data file(s) for statistics.
666 */
667 DBUG_PRINT("info", ("- Calculating statistics"));
668 if (write_loop || verbose)
669 printf("- Calculating statistics\n");
670 if (get_statistic(mrg,huff_counts))
671 goto err;
672
673 old_length=0;
674 for (i=0; i < mrg->count ; i++)
675 old_length+= (mrg->file[i]->s->state.state.data_file_length -
676 mrg->file[i]->s->state.state.empty);
677
678 /*
679 Create a global priority queue in preparation for making
680 temporary Huffman trees.
681 */
682 if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
683 goto err;
684
685 /*
686 Check each column if we should use pre-space-compress, end-space-
687 compress, empty-field-compress or zero-field-compress.
688 */
689 check_counts(huff_counts,fields,mrg->records);
690
691 /*
692 Build a Huffman tree for each column.
693 */
694 huff_trees=make_huff_trees(huff_counts,trees);
695
696 /*
697 If the packed lengths of combined columns is less then the sum of
698 the non-combined columns, then create common Huffman trees for them.
699 We do this only for uchar compressed columns, not for distinct values
700 compressed columns.
701 */
702 if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
703 goto err;
704
705 /*
706 Assign codes to all uchar or column values.
707 */
708 if (make_huff_decode_table(huff_trees,fields))
709 goto err;
710
711 /* Prepare a file buffer. */
712 init_file_buffer(new_file,0);
713
714 /*
715 Reserve space in the target file for the fixed compressed file header.
716 */
717 file_buffer.pos_in_file=HEAD_LENGTH;
718 if (! test_only)
719 my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
720
721 /*
722 Write field infos: field type, pack type, length bits, tree number.
723 */
724 write_field_info(huff_counts,fields,used_trees);
725
726 /*
727 Write decode trees.
728 */
729 if (!(tot_elements=write_huff_tree(huff_trees,trees)))
730 goto err;
731
732 /*
733 Calculate the total length of the compression info header.
734 This includes the fixed compressed file header, the column compression
735 type descriptions, and the decode trees.
736 */
737 header_length=(uint) file_buffer.pos_in_file+
738 (uint) (file_buffer.pos-file_buffer.buffer);
739
740 /*
741 Compress the source file into the target file.
742 */
743 DBUG_PRINT("info", ("- Compressing file"));
744 if (write_loop || verbose)
745 printf("- Compressing file\n");
746 error=compress_maria_file(mrg,huff_counts);
747 new_length=file_buffer.pos_in_file;
748 if (!error && !test_only)
749 {
750 uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
751 bzero(buff,sizeof(buff));
752 error=my_write(file_buffer.file,buff,sizeof(buff),
753 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
754 }
755
756 /*
757 Write the fixed compressed file header.
758 */
759 if (!error)
760 error=write_header(mrg,header_length,used_trees,tot_elements,
761 new_length);
762
763 /* Flush the file buffer. */
764 end_file_buffer();
765
766 /* Display statistics. */
767 DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
768 "Mean total length: %6ld",
769 mrg->min_pack_length, mrg->max_pack_length,
770 (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
771 if (verbose && mrg->records)
772 printf("Min record length: %6d Max length: %6d "
773 "Mean total length: %6ld\n", mrg->min_pack_length,
774 mrg->max_pack_length, (ulong) (new_length/mrg->records));
775
776 /* Close source and target file. */
777 if (!test_only)
778 {
779 error|=my_close(new_file,MYF(MY_WME));
780 if (!result_table)
781 {
782 (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
783 FLUSH_RELEASE);
784 error|=my_close(isam_file->dfile.file, MYF(MY_WME));
785 isam_file->dfile.file= -1; /* Tell maria_close file is closed */
786 isam_file->s->bitmap.file.file= -1;
787 }
788 }
789
790 /* Cleanup. */
791 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
792 if (! test_only && ! error)
793 {
794 if (result_table)
795 {
796 error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
797 }
798 else
799 {
800 if (backup)
801 {
802 if (my_rename(org_name,make_old_name(temp_name,
803 isam_file->s->open_file_name.str),
804 MYF(MY_WME)))
805 error=1;
806 else
807 {
808 if (tmp_dir[0])
809 error=my_copy(new_name,org_name,MYF(MY_WME));
810 else
811 error=my_rename(new_name,org_name,MYF(MY_WME));
812 if (!error)
813 {
814 my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
815 if (tmp_dir[0])
816 my_delete(new_name,MYF(MY_WME));
817 }
818 }
819 }
820 else
821 {
822 if (tmp_dir[0])
823 {
824 error=my_copy(new_name,org_name,
825 MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
826 if (!error)
827 my_delete(new_name,MYF(MY_WME));
828 }
829 else
830 error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
831 }
832 if (! error)
833 error=save_state(isam_file,mrg,new_length,glob_crc);
834 }
835 }
836 error|=mrg_close(mrg);
837 if (join_maria_file >= 0)
838 error|=my_close(join_maria_file,MYF(MY_WME));
839 if (error)
840 {
841 fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
842 my_delete(new_name,MYF(MY_WME));
843 DBUG_RETURN(-1);
844 }
845 if (write_loop || verbose)
846 {
847 if (old_length)
848 printf("%.4g%% \n",
849 (((longlong) (old_length - new_length)) * 100.0 /
850 (longlong) old_length));
851 else
852 puts("Empty file saved in compressed format");
853 }
854 DBUG_RETURN(0);
855
856 err:
857 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
858 if (new_file >= 0)
859 my_close(new_file,MYF(0));
860 if (join_maria_file >= 0)
861 my_close(join_maria_file,MYF(0));
862 mrg_close(mrg);
863 end_pagecache(maria_pagecache, 1);
864 fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
865 DBUG_RETURN(-1);
866 }
867
868 /* Init a huff_count-struct for each field and init it */
869
init_huff_count(MARIA_HA * info,my_off_t records)870 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
871 {
872 reg2 uint i;
873 reg1 HUFF_COUNTS *count;
874 if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
875 sizeof(HUFF_COUNTS),
876 MYF(MY_ZEROFILL | MY_WME))))
877 {
878 for (i=0 ; i < info->s->base.fields ; i++)
879 {
880 enum en_fieldtype type;
881 uint col_nr = info->s->columndef[i].column_nr;
882 count[col_nr].field_length=info->s->columndef[i].length;
883 type= count[col_nr].field_type=
884 (enum en_fieldtype) info->s->columndef[i].type;
885 if (type == FIELD_INTERVALL ||
886 type == FIELD_CONSTANT ||
887 type == FIELD_ZERO)
888 type = FIELD_NORMAL;
889 if (count[col_nr].field_length <= 8 &&
890 (type == FIELD_NORMAL ||
891 type == FIELD_SKIP_ZERO))
892 count[col_nr].max_zero_fill= count[col_nr].field_length;
893 /*
894 For every column initialize a tree, which is used to detect distinct
895 column values. 'int_tree' works together with 'tree_buff' and
896 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
897 This is accomplished by '-1' as the element size.
898 */
899 init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
900 NULL, MYF(0));
901 if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
902 count[col_nr].tree_pos=count[col_nr].tree_buff =
903 my_malloc(count[col_nr].field_length > 1 ? tree_buff_length : 2,
904 MYF(MY_WME));
905 }
906 }
907 return count;
908 }
909
910
911 /* Free memory used by counts and trees */
912
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)913 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
914 HUFF_COUNTS *huff_counts,
915 uint fields)
916 {
917 register uint i;
918
919 if (huff_trees)
920 {
921 for (i=0 ; i < trees ; i++)
922 {
923 if (huff_trees[i].element_buffer)
924 my_free(huff_trees[i].element_buffer);
925 if (huff_trees[i].code)
926 my_free(huff_trees[i].code);
927 }
928 my_free(huff_trees);
929 }
930 if (huff_counts)
931 {
932 for (i=0 ; i < fields ; i++)
933 {
934 if (huff_counts[i].tree_buff)
935 {
936 my_free(huff_counts[i].tree_buff);
937 delete_tree(&huff_counts[i].int_tree, 0);
938 }
939 }
940 my_free(huff_counts);
941 }
942 delete_queue(&queue); /* This is safe to free */
943 return;
944 }
945
946 /* Read through old file and gather some statistics */
947
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)948 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
949 {
950 int error;
951 uint length, null_bytes;
952 ulong reclength,max_blob_length;
953 uchar *record,*pos,*next_pos,*end_pos,*start_pos;
954 ha_rows record_count;
955 HUFF_COUNTS *count,*end_count;
956 TREE_ELEMENT *element;
957 ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
958 DBUG_ENTER("get_statistic");
959
960 reclength= mrg->file[0]->s->base.reclength;
961 null_bytes= mrg->file[0]->s->base.null_bytes;
962 record=(uchar*) my_safe_alloca(reclength);
963 end_count=huff_counts+mrg->file[0]->s->base.fields;
964 record_count=0; glob_crc=0;
965 max_blob_length=0;
966
967 /* Check how to calculate checksum */
968 if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
969 calc_checksum= _ma_static_checksum;
970 else
971 calc_checksum= _ma_checksum;
972
973 mrg_reset(mrg);
974 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
975 {
976 ulong tot_blob_length=0;
977 if (! error)
978 {
979 /* glob_crc is a checksum over all bytes of all records. */
980 glob_crc+= (*calc_checksum)(mrg->file[0],record);
981
982 /* Count the incidence of values separately for every column. */
983 for (pos=record + null_bytes, count=huff_counts ;
984 count < end_count ;
985 count++,
986 pos=next_pos)
987 {
988 next_pos=end_pos=(start_pos=pos)+count->field_length;
989
990 /*
991 Put the whole column value in a tree if there is room for it.
992 'int_tree' is used to quickly check for duplicate values.
993 'tree_buff' collects as many distinct column values as
994 possible. If the field length is > 1, it is tree_buff_length,
995 else 2 bytes. Each value is 'field_length' bytes big. If there
996 are more distinct column values than fit into the buffer, we
997 give up with this tree. BLOBs and VARCHARs do not have a
998 tree_buff as it can only be used with fixed length columns.
999 For the special case of field length == 1, we handle only the
1000 case that there is only one distinct value in the table(s).
1001 Otherwise, we can have a maximum of 256 distinct values. This
1002 is then handled by the normal Huffman tree build.
1003
1004 Another limit for collecting distinct column values is the
1005 number of values itself. Since we would need to build a
1006 Huffman tree for the values, we are limited by the 'IS_OFFSET'
1007 constant. This constant expresses a bit which is used to
1008 determine if a tree element holds a final value or an offset
1009 to a child element. Hence, all values and offsets need to be
1010 smaller than 'IS_OFFSET'. A tree element is implemented with
1011 two integer values, one for the left branch and one for the
1012 right branch. For the extreme case that the first element
1013 points to the last element, the number of integers in the tree
1014 must be less or equal to IS_OFFSET. So the number of elements
1015 must be less or equal to IS_OFFSET / 2.
1016
1017 WARNING: At first, we insert a pointer into the record buffer
1018 as the key for the tree. If we got a new distinct value, which
1019 is really inserted into the tree, instead of being counted
1020 only, we will copy the column value from the record buffer to
1021 'tree_buff' and adjust the key pointer of the tree accordingly.
1022 */
1023 if (count->tree_buff)
1024 {
1025 global_count=count;
1026 if (!(element=tree_insert(&count->int_tree,pos, 0,
1027 count->int_tree.custom_arg)) ||
1028 (element->count == 1 &&
1029 (count->tree_buff + tree_buff_length <
1030 count->tree_pos + count->field_length)) ||
1031 (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
1032 (count->field_length == 1 &&
1033 count->int_tree.elements_in_tree > 1))
1034 {
1035 delete_tree(&count->int_tree, 0);
1036 my_free(count->tree_buff);
1037 count->tree_buff=0;
1038 }
1039 else
1040 {
1041 /*
1042 If tree_insert() succeeds, it either creates a new element
1043 or increments the counter of an existing element.
1044 */
1045 if (element->count == 1)
1046 {
1047 /* Copy the new column value into 'tree_buff'. */
1048 memcpy(count->tree_pos,pos,(size_t) count->field_length);
1049 /* Adjust the key pointer in the tree. */
1050 tree_set_pointer(element,count->tree_pos);
1051 /* Point behind the last column value so far. */
1052 count->tree_pos+=count->field_length;
1053 }
1054 }
1055 }
1056
1057 /* Save character counters and space-counts and zero-field-counts */
1058 if (count->field_type == FIELD_NORMAL ||
1059 count->field_type == FIELD_SKIP_ENDSPACE)
1060 {
1061 /* Ignore trailing space. */
1062 for ( ; end_pos > pos ; end_pos--)
1063 if (end_pos[-1] != ' ')
1064 break;
1065 /* Empty fields are just counted. Go to the next record. */
1066 if (end_pos == pos)
1067 {
1068 count->empty_fields++;
1069 count->max_zero_fill=0;
1070 continue;
1071 }
1072 /*
1073 Count the total of all trailing spaces and the number of
1074 short trailing spaces. Remember the longest trailing space.
1075 */
1076 length= (uint) (next_pos-end_pos);
1077 count->tot_end_space+=length;
1078 if (length < 8)
1079 count->end_space[length]++;
1080 if (count->max_end_space < length)
1081 count->max_end_space = length;
1082 }
1083
1084 if (count->field_type == FIELD_NORMAL ||
1085 count->field_type == FIELD_SKIP_PRESPACE)
1086 {
1087 /* Ignore leading space. */
1088 for (pos=start_pos; pos < end_pos ; pos++)
1089 if (pos[0] != ' ')
1090 break;
1091 /* Empty fields are just counted. Go to the next record. */
1092 if (end_pos == pos)
1093 {
1094 count->empty_fields++;
1095 count->max_zero_fill=0;
1096 continue;
1097 }
1098 /*
1099 Count the total of all leading spaces and the number of
1100 short leading spaces. Remember the longest leading space.
1101 */
1102 length= (uint) (pos-start_pos);
1103 count->tot_pre_space+=length;
1104 if (length < 8)
1105 count->pre_space[length]++;
1106 if (count->max_pre_space < length)
1107 count->max_pre_space = length;
1108 }
1109
1110 /* Calculate pos, end_pos, and max_length for variable length fields. */
1111 if (count->field_type == FIELD_BLOB)
1112 {
1113 uint field_length=count->field_length -portable_sizeof_char_ptr;
1114 ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1115 memcpy(&pos, start_pos+field_length,sizeof(char*));
1116 end_pos=pos+blob_length;
1117 tot_blob_length+=blob_length;
1118 set_if_bigger(count->max_length,blob_length);
1119 }
1120 else if (count->field_type == FIELD_VARCHAR)
1121 {
1122 uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1123 length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1124 uint2korr(start_pos));
1125 pos= start_pos+pack_length;
1126 end_pos= pos+length;
1127 set_if_bigger(count->max_length,length);
1128 }
1129
1130 /* Evaluate 'max_zero_fill' for short fields. */
1131 if (count->field_length <= 8 &&
1132 (count->field_type == FIELD_NORMAL ||
1133 count->field_type == FIELD_SKIP_ZERO))
1134 {
1135 uint i;
1136 /* Zero fields are just counted. Go to the next record. */
1137 if (!memcmp(start_pos, zero_string, count->field_length))
1138 {
1139 count->zero_fields++;
1140 continue;
1141 }
1142 /*
1143 max_zero_fill starts with field_length. It is decreased every
1144 time a shorter "zero trailer" is found. It is set to zero when
1145 an empty field is found (see above). This suggests that the
1146 variable should be called 'min_zero_fill'.
1147 */
1148 for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1149 i++) ;
1150 if (i < count->max_zero_fill)
1151 count->max_zero_fill=i;
1152 }
1153
1154 /* Ignore zero fields and check fields. */
1155 if (count->field_type == FIELD_ZERO ||
1156 count->field_type == FIELD_CHECK)
1157 continue;
1158
1159 /*
1160 Count the incidence of every uchar value in the
1161 significant field value.
1162 */
1163 for ( ; pos < end_pos ; pos++)
1164 count->counts[(uchar) *pos]++;
1165
1166 /* Step to next field. */
1167 }
1168
1169 if (tot_blob_length > max_blob_length)
1170 max_blob_length=tot_blob_length;
1171 record_count++;
1172 if (write_loop && record_count % WRITE_COUNT == 0)
1173 {
1174 printf("%lu\r", (ulong) record_count);
1175 fflush(stdout);
1176 }
1177 }
1178 else if (error != HA_ERR_RECORD_DELETED)
1179 {
1180 fprintf(stderr, "Got error %d while reading rows\n", error);
1181 break;
1182 }
1183
1184 /* Step to next record. */
1185 }
1186 if (write_loop)
1187 {
1188 printf(" \r");
1189 fflush(stdout);
1190 }
1191
1192 /*
1193 If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1194 codes.
1195 */
1196 DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1197
1198 DBUG_PRINT("info", ("Found the following number of incidents "
1199 "of the uchar codes:"));
1200 if (verbose >= 2)
1201 printf("Found the following number of incidents "
1202 "of the uchar codes:\n");
1203 for (count= huff_counts ; count < end_count; count++)
1204 {
1205 uint idx;
1206 my_off_t total_count;
1207 char llbuf[32];
1208
1209 DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1210 if (verbose >= 2)
1211 printf("column: %3u\n", (uint) (count - huff_counts + 1));
1212 if (count->tree_buff)
1213 {
1214 DBUG_PRINT("info", ("number of distinct values: %u",
1215 (uint) ((count->tree_pos - count->tree_buff) /
1216 count->field_length)));
1217 if (verbose >= 2)
1218 printf("number of distinct values: %u\n",
1219 (uint) ((count->tree_pos - count->tree_buff) /
1220 count->field_length));
1221 }
1222 total_count= 0;
1223 for (idx= 0; idx < 256; idx++)
1224 {
1225 if (count->counts[idx])
1226 {
1227 total_count+= count->counts[idx];
1228 DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1229 llstr((longlong) count->counts[idx], llbuf)));
1230 if (verbose >= 2)
1231 printf("counts[0x%02x]: %12s\n", idx,
1232 llstr((longlong) count->counts[idx], llbuf));
1233 }
1234 }
1235 DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1236 llbuf)));
1237 if ((verbose >= 2) && total_count)
1238 {
1239 printf("total: %12s\n",
1240 llstr((longlong) total_count, llbuf));
1241 }
1242 }
1243
1244 mrg->records=record_count;
1245 mrg->max_blob_length=max_blob_length;
1246 my_safe_afree(record, reclength);
1247 DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1248 }
1249
compare_huff_elements(void * not_used,uchar * a,uchar * b)1250 static int compare_huff_elements(void *not_used __attribute__((unused)),
1251 uchar *a, uchar *b)
1252 {
1253 return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1254 (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1255 }
1256
1257 /* Check each tree if we should use pre-space-compress, end-space-
1258 compress, empty-field-compress or zero-field-compress */
1259
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1260 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1261 my_off_t records)
1262 {
1263 uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1264 my_off_t old_length,new_length,length;
1265 DBUG_ENTER("check_counts");
1266
1267 bzero((uchar*) field_count,sizeof(field_count));
1268 space_fields=fill_zero_fields=0;
1269
1270 for (; trees-- ; huff_counts++)
1271 {
1272 if (huff_counts->field_type == FIELD_BLOB)
1273 {
1274 huff_counts->length_bits=max_bit(huff_counts->max_length);
1275 goto found_pack;
1276 }
1277 else if (huff_counts->field_type == FIELD_VARCHAR)
1278 {
1279 huff_counts->length_bits=max_bit(huff_counts->max_length);
1280 goto found_pack;
1281 }
1282 else if (huff_counts->field_type == FIELD_CHECK)
1283 {
1284 huff_counts->bytes_packed=0;
1285 huff_counts->counts[0]=0;
1286 goto found_pack;
1287 }
1288
1289 huff_counts->field_type=FIELD_NORMAL;
1290 huff_counts->pack_type=0;
1291
1292 /* Check for zero-filled records (in this column), or zero records. */
1293 if (huff_counts->zero_fields || ! records)
1294 {
1295 my_off_t old_space_count;
1296 /*
1297 If there are only zero filled records (in this column),
1298 or no records at all, we are done.
1299 */
1300 if (huff_counts->zero_fields == records)
1301 {
1302 huff_counts->field_type= FIELD_ZERO;
1303 huff_counts->bytes_packed=0;
1304 huff_counts->counts[0]=0;
1305 goto found_pack;
1306 }
1307 /* Remeber the number of significant spaces. */
1308 old_space_count=huff_counts->counts[' '];
1309 /* Add all leading and trailing spaces. */
1310 huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1311 huff_counts->tot_pre_space +
1312 huff_counts->empty_fields *
1313 huff_counts->field_length);
1314 /* Check, what the compressed length of this would be. */
1315 old_length=calc_packed_length(huff_counts,0)+records/8;
1316 /* Get the number of zero bytes. */
1317 length=huff_counts->zero_fields*huff_counts->field_length;
1318 /* Add it to the counts. */
1319 huff_counts->counts[0]+=length;
1320 /* Check, what the compressed length of this would be. */
1321 new_length=calc_packed_length(huff_counts,0);
1322 /* If the compression without the zeroes would be shorter, we are done. */
1323 if (old_length < new_length && huff_counts->field_length > 1)
1324 {
1325 huff_counts->field_type=FIELD_SKIP_ZERO;
1326 huff_counts->counts[0]-=length;
1327 huff_counts->bytes_packed=old_length- records/8;
1328 goto found_pack;
1329 }
1330 /* Remove the insignificant spaces, but keep the zeroes. */
1331 huff_counts->counts[' ']=old_space_count;
1332 }
1333 /* Check, what the compressed length of this column would be. */
1334 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1335
1336 /*
1337 If there are enough empty records (in this column),
1338 treating them specially may pay off.
1339 */
1340 if (huff_counts->empty_fields)
1341 {
1342 if (huff_counts->field_length > 2 &&
1343 huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1344 (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1345 huff_counts->max_end_space))) <
1346 records * max_bit(huff_counts->field_length))
1347 {
1348 huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1349 }
1350 else
1351 {
1352 length=huff_counts->empty_fields*huff_counts->field_length;
1353 if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1354 {
1355 huff_counts->tot_end_space+=length;
1356 huff_counts->max_end_space=huff_counts->field_length;
1357 if (huff_counts->field_length < 8)
1358 huff_counts->end_space[huff_counts->field_length]+=
1359 huff_counts->empty_fields;
1360 }
1361 if (huff_counts->tot_pre_space)
1362 {
1363 huff_counts->tot_pre_space+=length;
1364 huff_counts->max_pre_space=huff_counts->field_length;
1365 if (huff_counts->field_length < 8)
1366 huff_counts->pre_space[huff_counts->field_length]+=
1367 huff_counts->empty_fields;
1368 }
1369 }
1370 }
1371
1372 /*
1373 If there are enough trailing spaces (in this column),
1374 treating them specially may pay off.
1375 */
1376 if (huff_counts->tot_end_space)
1377 {
1378 huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1379 if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1380 huff_counts->end_space,
1381 huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1382 goto found_pack;
1383 huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1384 }
1385
1386 /*
1387 If there are enough leading spaces (in this column),
1388 treating them specially may pay off.
1389 */
1390 if (huff_counts->tot_pre_space)
1391 {
1392 if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1393 huff_counts->pre_space,
1394 huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1395 goto found_pack;
1396 }
1397
1398 found_pack: /* Found field-packing */
1399
1400 /* Test if we can use zero-fill */
1401
1402 if (huff_counts->max_zero_fill &&
1403 (huff_counts->field_type == FIELD_NORMAL ||
1404 huff_counts->field_type == FIELD_SKIP_ZERO))
1405 {
1406 huff_counts->counts[0]-=huff_counts->max_zero_fill*
1407 (huff_counts->field_type == FIELD_SKIP_ZERO ?
1408 records - huff_counts->zero_fields : records);
1409 huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1410 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1411 }
1412
1413 /* Test if intervall-field is better */
1414
1415 if (huff_counts->tree_buff)
1416 {
1417 HUFF_TREE tree;
1418
1419 DBUG_EXECUTE_IF("forceintervall",
1420 huff_counts->bytes_packed= ~ (my_off_t) 0;);
1421 tree.element_buffer=0;
1422 if (!make_huff_tree(&tree,huff_counts) &&
1423 tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1424 {
1425 if (tree.elements == 1)
1426 huff_counts->field_type=FIELD_CONSTANT;
1427 else
1428 huff_counts->field_type=FIELD_INTERVALL;
1429 huff_counts->pack_type=0;
1430 }
1431 else
1432 {
1433 my_free(huff_counts->tree_buff);
1434 delete_tree(&huff_counts->int_tree, 0);
1435 huff_counts->tree_buff=0;
1436 }
1437 if (tree.element_buffer)
1438 my_free(tree.element_buffer);
1439 }
1440 if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1441 space_fields++;
1442 if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1443 fill_zero_fields++;
1444 field_count[huff_counts->field_type]++;
1445 }
1446 DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1447 "empty-zero: %3d empty-fill: %3d",
1448 field_count[FIELD_NORMAL],space_fields,
1449 field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1450 DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1451 "intervall-fields: %3d zero: %3d",
1452 field_count[FIELD_SKIP_PRESPACE],
1453 field_count[FIELD_SKIP_ENDSPACE],
1454 field_count[FIELD_INTERVALL],
1455 field_count[FIELD_ZERO]));
1456 if (verbose)
1457 printf("\nnormal: %3d empty-space: %3d "
1458 "empty-zero: %3d empty-fill: %3d\n"
1459 "pre-space: %3d end-space: %3d "
1460 "intervall-fields: %3d zero: %3d\n",
1461 field_count[FIELD_NORMAL],space_fields,
1462 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1463 field_count[FIELD_SKIP_PRESPACE],
1464 field_count[FIELD_SKIP_ENDSPACE],
1465 field_count[FIELD_INTERVALL],
1466 field_count[FIELD_ZERO]);
1467 DBUG_VOID_RETURN;
1468 }
1469
1470
1471 /* Test if we can use space-compression and empty-field-compression */
1472
1473 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1474 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1475 uint max_space_length, my_off_t *space_counts,
1476 my_off_t tot_space_count, enum en_fieldtype field_type)
1477 {
1478 int min_pos;
1479 uint length_bits,i;
1480 my_off_t space_count,min_space_count,min_pack,new_length,skip;
1481
1482 length_bits=max_bit(max_space_length);
1483
1484 /* Default no end_space-packing */
1485 space_count=huff_counts->counts[(uint) ' '];
1486 min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1487 min_pack=calc_packed_length(huff_counts,0);
1488 min_pos= -2;
1489 huff_counts->counts[(uint) ' ']=space_count;
1490
1491 /* Test with allways space-count */
1492 new_length=huff_counts->bytes_packed+length_bits*records/8;
1493 if (new_length+1 < min_pack)
1494 {
1495 min_pos= -1;
1496 min_pack=new_length;
1497 min_space_count=space_count;
1498 }
1499 /* Test with length-flag */
1500 for (skip=0L, i=0 ; i < 8 ; i++)
1501 {
1502 if (space_counts[i])
1503 {
1504 if (i)
1505 huff_counts->counts[(uint) ' ']+=space_counts[i];
1506 skip+=huff_counts->pre_space[i];
1507 new_length=calc_packed_length(huff_counts,0)+
1508 (records+(records-skip)*(1+length_bits))/8;
1509 if (new_length < min_pack)
1510 {
1511 min_pos=(int) i;
1512 min_pack=new_length;
1513 min_space_count=huff_counts->counts[(uint) ' '];
1514 }
1515 }
1516 }
1517
1518 huff_counts->counts[(uint) ' ']=min_space_count;
1519 huff_counts->bytes_packed=min_pack;
1520 switch (min_pos) {
1521 case -2:
1522 return(0); /* No space-compress */
1523 case -1: /* Always space-count */
1524 huff_counts->field_type=field_type;
1525 huff_counts->min_space=0;
1526 huff_counts->length_bits=max_bit(max_space_length);
1527 break;
1528 default:
1529 huff_counts->field_type=field_type;
1530 huff_counts->min_space=(uint) min_pos;
1531 huff_counts->pack_type|=PACK_TYPE_SELECTED;
1532 huff_counts->length_bits=max_bit(max_space_length);
1533 break;
1534 }
1535 return(1); /* Using space-compress */
1536 }
1537
1538
1539 /* Make a huff_tree of each huff_count */
1540
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1541 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1542 {
1543 uint tree;
1544 HUFF_TREE *huff_tree;
1545 DBUG_ENTER("make_huff_trees");
1546
1547 if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1548 MYF(MY_WME | MY_ZEROFILL))))
1549 DBUG_RETURN(0);
1550
1551 for (tree=0 ; tree < trees ; tree++)
1552 {
1553 if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1554 {
1555 while (tree--)
1556 my_free(huff_tree[tree].element_buffer);
1557 my_free(huff_tree);
1558 DBUG_RETURN(0);
1559 }
1560 }
1561 DBUG_RETURN(huff_tree);
1562 }
1563
1564 /*
1565 Build a Huffman tree.
1566
1567 SYNOPSIS
1568 make_huff_tree()
1569 huff_tree The Huffman tree.
1570 huff_counts The counts.
1571
1572 DESCRIPTION
1573 Build a Huffman tree according to huff_counts->counts or
1574 huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1575 tree_buff_length of distinct column values. In that case, whole
1576 values can be Huffman encoded instead of single bytes.
1577
1578 RETURN
1579 0 OK
1580 != 0 Error
1581 */
1582
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1583 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1584 {
1585 uint i,found,bits_packed,first,last;
1586 my_off_t bytes_packed;
1587 HUFF_ELEMENT *a,*b,*new_huff_el;
1588
1589 first=last=0;
1590 if (huff_counts->tree_buff)
1591 {
1592 /* Calculate the number of distinct values in tree_buff. */
1593 found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1594 huff_counts->field_length;
1595 first=0; last=found-1;
1596 }
1597 else
1598 {
1599 /* Count the number of uchar codes found in the column. */
1600 for (i=found=0 ; i < 256 ; i++)
1601 {
1602 if (huff_counts->counts[i])
1603 {
1604 if (! found++)
1605 first=i;
1606 last=i;
1607 }
1608 }
1609 if (found < 2)
1610 found=2;
1611 }
1612
1613 /* When using 'tree_buff' we can have more that 256 values. */
1614 if (queue.max_elements < found)
1615 {
1616 delete_queue(&queue);
1617 if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
1618 return -1;
1619 }
1620
1621 /* Allocate or reallocate an element buffer for the Huffman tree. */
1622 if (!huff_tree->element_buffer)
1623 {
1624 if (!(huff_tree->element_buffer=
1625 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1626 return 1;
1627 }
1628 else
1629 {
1630 HUFF_ELEMENT *temp;
1631 if (!(temp=
1632 (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1633 found*2*sizeof(HUFF_ELEMENT),
1634 MYF(MY_WME))))
1635 return 1;
1636 huff_tree->element_buffer=temp;
1637 }
1638
1639 huff_counts->tree=huff_tree;
1640 huff_tree->counts=huff_counts;
1641 huff_tree->min_chr=first;
1642 huff_tree->max_chr=last;
1643 huff_tree->char_bits=max_bit(last-first);
1644 huff_tree->offset_bits=max_bit(found-1)+1;
1645
1646 if (huff_counts->tree_buff)
1647 {
1648 huff_tree->elements=0;
1649 huff_tree->tree_pack_length=(1+15+16+5+5+
1650 (huff_tree->char_bits+1)*found+
1651 (huff_tree->offset_bits+1)*
1652 (found-2)+7)/8 +
1653 (uint) (huff_tree->counts->tree_pos-
1654 huff_tree->counts->tree_buff);
1655 /*
1656 Put a HUFF_ELEMENT into the queue for every distinct column value.
1657
1658 tree_walk() calls save_counts_in_queue() for every element in
1659 'int_tree'. This takes elements from the target trees element
1660 buffer and places references to them into the buffer of the
1661 priority queue. We insert in column value order, but the order is
1662 in fact irrelevant here. We will establish the correct order
1663 later.
1664 */
1665 tree_walk(&huff_counts->int_tree,
1666 (int (*)(void*, element_count,void*)) save_counts_in_queue,
1667 (uchar*) huff_tree, left_root_right);
1668 }
1669 else
1670 {
1671 huff_tree->elements=found;
1672 huff_tree->tree_pack_length=(9+9+5+5+
1673 (huff_tree->char_bits+1)*found+
1674 (huff_tree->offset_bits+1)*
1675 (found-2)+7)/8;
1676 /*
1677 Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
1678
1679 The elements are taken from the target trees element buffer.
1680 Instead of using queue_insert(), we just place references to the
1681 elements into the buffer of the priority queue. We insert in byte
1682 value order, but the order is in fact irrelevant here. We will
1683 establish the correct order later.
1684 */
1685 for (i=first, found=0 ; i <= last ; i++)
1686 {
1687 if (huff_counts->counts[i])
1688 {
1689 new_huff_el=huff_tree->element_buffer+(found++);
1690 new_huff_el->count=huff_counts->counts[i];
1691 new_huff_el->a.leaf.null=0;
1692 new_huff_el->a.leaf.element_nr=i;
1693 queue.root[found]=(uchar*) new_huff_el;
1694 }
1695 }
1696 /*
1697 If there is only a single uchar value in this field in all records,
1698 add a second element with zero incidence. This is required to enter
1699 the loop, which builds the Huffman tree.
1700 */
1701 while (found < 2)
1702 {
1703 new_huff_el=huff_tree->element_buffer+(found++);
1704 new_huff_el->count=0;
1705 new_huff_el->a.leaf.null=0;
1706 if (last)
1707 new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1708 else
1709 new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1710 queue.root[found]=(uchar*) new_huff_el;
1711 }
1712 }
1713
1714 /* Make a queue from the queue buffer. */
1715 queue.elements=found;
1716
1717 /*
1718 Make a priority queue from the queue. Construct its index so that we
1719 have a partially ordered tree.
1720 */
1721 queue_fix(&queue);
1722
1723 /* The Huffman algorithm. */
1724 bytes_packed=0; bits_packed=0;
1725 for (i=1 ; i < found ; i++)
1726 {
1727 /*
1728 Pop the top element from the queue (the one with the least incidence).
1729 Popping from a priority queue includes a re-ordering of the queue,
1730 to get the next least incidence element to the top.
1731 */
1732 a=(HUFF_ELEMENT*) queue_remove_top(&queue);
1733 /* Copy the next least incidence element */
1734 b=(HUFF_ELEMENT*) queue_top(&queue);
1735 /* Get a new element from the element buffer. */
1736 new_huff_el=huff_tree->element_buffer+found+i;
1737 /* The new element gets the sum of the two least incidence elements. */
1738 new_huff_el->count=a->count+b->count;
1739 /*
1740 The Huffman algorithm assigns another bit to the code for a byte
1741 every time that bytes incidence is combined (directly or indirectly)
1742 to a new element as one of the two least incidence elements.
1743 This means that one more bit per incidence of that uchar is required
1744 in the resulting file. So we add the new combined incidence as the
1745 number of bits by which the result grows.
1746 */
1747 bits_packed+=(uint) (new_huff_el->count & 7);
1748 bytes_packed+=new_huff_el->count/8;
1749 /* The new element points to its children, lesser in left. */
1750 new_huff_el->a.nod.left=a;
1751 new_huff_el->a.nod.right=b;
1752 /*
1753 Replace the copied top element by the new element and re-order the
1754 queue.
1755 */
1756 queue_top(&queue)= (uchar*) new_huff_el;
1757 queue_replace_top(&queue);
1758 }
1759 huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1760 huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1761 return 0;
1762 }
1763
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1764 static int compare_tree(void* cmp_arg __attribute__((unused)),
1765 register const uchar *s, register const uchar *t)
1766 {
1767 uint length;
1768 for (length=global_count->field_length; length-- ;)
1769 if (*s++ != *t++)
1770 return (int) s[-1] - (int) t[-1];
1771 return 0;
1772 }
1773
1774 /*
1775 Organize distinct column values and their incidences into a priority queue.
1776
1777 SYNOPSIS
1778 save_counts_in_queue()
1779 key The column value.
1780 count The incidence of this value.
1781 tree The Huffman tree to be built later.
1782
1783 DESCRIPTION
1784 We use the element buffer of the targeted tree. The distinct column
1785 values are organized in a priority queue first. The Huffman
1786 algorithm will later organize the elements into a Huffman tree. For
1787 the time being, we just place references to the elements into the
1788 queue buffer. The buffer will later be organized into a priority
1789 queue.
1790
1791 RETURN
1792 0
1793 */
1794
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1795 static int save_counts_in_queue(uchar *key, element_count count,
1796 HUFF_TREE *tree)
1797 {
1798 HUFF_ELEMENT *new_huff_el;
1799
1800 new_huff_el=tree->element_buffer+(tree->elements++);
1801 new_huff_el->count=count;
1802 new_huff_el->a.leaf.null=0;
1803 new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1804 tree->counts->field_length;
1805 queue.root[tree->elements]=(uchar*) new_huff_el;
1806 return 0;
1807 }
1808
1809
1810 /*
1811 Calculate length of file if given counts should be used.
1812
1813 SYNOPSIS
1814 calc_packed_length()
1815 huff_counts The counts for a column of the table(s).
1816 add_tree_lenght If the decode tree length should be added.
1817
1818 DESCRIPTION
1819 We need to follow the Huffman algorithm until we know, how many bits
1820 are required for each uchar code. But we do not need the resulting
1821 Huffman tree. Hence, we can leave out some steps which are essential
1822 in make_huff_tree().
1823
1824 RETURN
1825 Number of bytes required to compress this table column.
1826 */
1827
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1828 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1829 uint add_tree_lenght)
1830 {
1831 uint i,found,bits_packed,first,last;
1832 my_off_t bytes_packed;
1833 HUFF_ELEMENT element_buffer[256];
1834 DBUG_ENTER("calc_packed_length");
1835
1836 /*
1837 WARNING: We use a small hack for efficiency: Instead of placing
1838 references to HUFF_ELEMENTs into the queue, we just insert
1839 references to the counts of the uchar codes which appeared in this
1840 table column. During the Huffman algorithm they are successively
1841 replaced by references to HUFF_ELEMENTs. This works, because
1842 HUFF_ELEMENTs have the incidence count at their beginning.
1843 Regardless, wether the queue array contains references to counts of
1844 type my_off_t or references to HUFF_ELEMENTs which have the count of
1845 type my_off_t at their beginning, it always points to a count of the
1846 same type.
1847
1848 Instead of using queue_insert(), we just copy the references into
1849 the buffer of the priority queue. We insert in uchar value order, but
1850 the order is in fact irrelevant here. We will establish the correct
1851 order later.
1852 */
1853 first=last=0;
1854 for (i=found=0 ; i < 256 ; i++)
1855 {
1856 if (huff_counts->counts[i])
1857 {
1858 if (! found++)
1859 first=i;
1860 last=i;
1861 /* We start with root[1], which is the queues top element. */
1862 queue.root[found]=(uchar*) &huff_counts->counts[i];
1863 }
1864 }
1865 if (!found)
1866 DBUG_RETURN(0); /* Empty tree */
1867 /*
1868 If there is only a single uchar value in this field in all records,
1869 add a second element with zero incidence. This is required to enter
1870 the loop, which follows the Huffman algorithm.
1871 */
1872 if (found < 2)
1873 queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1874
1875 /* Make a queue from the queue buffer. */
1876 queue.elements=found;
1877
1878 bytes_packed=0; bits_packed=0;
1879 /* Add the length of the coding table, which would become part of the file. */
1880 if (add_tree_lenght)
1881 bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1882 (max_bit(found-1)+1+1)*(found-2) +7)/8;
1883
1884 /*
1885 Make a priority queue from the queue. Construct its index so that we
1886 have a partially ordered tree.
1887 */
1888 queue_fix(&queue);
1889
1890 /* The Huffman algorithm. */
1891 for (i=0 ; i < found-1 ; i++)
1892 {
1893 my_off_t *a;
1894 my_off_t *b;
1895 HUFF_ELEMENT *new_huff_el;
1896
1897 /*
1898 Pop the top element from the queue (the one with the least
1899 incidence). Popping from a priority queue includes a re-ordering
1900 of the queue, to get the next least incidence element to the top.
1901 */
1902 a= (my_off_t*) queue_remove_top(&queue);
1903 /* Copy the next least incidence element. */
1904 b= (my_off_t*) queue_top(&queue);
1905 /* Create a new element in a local (automatic) buffer. */
1906 new_huff_el= element_buffer + i;
1907 /* The new element gets the sum of the two least incidence elements. */
1908 new_huff_el->count= *a + *b;
1909 /*
1910 The Huffman algorithm assigns another bit to the code for a byte
1911 every time that bytes incidence is combined (directly or indirectly)
1912 to a new element as one of the two least incidence elements.
1913 This means that one more bit per incidence of that uchar is required
1914 in the resulting file. So we add the new combined incidence as the
1915 number of bits by which the result grows.
1916 */
1917 bits_packed+=(uint) (new_huff_el->count & 7);
1918 bytes_packed+=new_huff_el->count/8;
1919 /*
1920 Replace the copied top element by the new element and re-order the
1921 queue. This successively replaces the references to counts by
1922 references to HUFF_ELEMENTs.
1923 */
1924 queue_top(&queue)= (uchar*) new_huff_el;
1925 queue_replace_top(&queue);
1926 }
1927 DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1928 }
1929
1930
1931 /* Remove trees that don't give any compression */
1932
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1933 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1934 {
1935 uint k,tree_number;
1936 HUFF_COUNTS count,*i,*j,*last_count;
1937
1938 last_count=huff_counts+trees;
1939 for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1940 {
1941 if (!i->tree->tree_number)
1942 {
1943 i->tree->tree_number= ++tree_number;
1944 if (i->tree_buff)
1945 continue; /* Don't join intervall */
1946 for (j=i+1 ; j < last_count ; j++)
1947 {
1948 if (! j->tree->tree_number && ! j->tree_buff)
1949 {
1950 for (k=0 ; k < 256 ; k++)
1951 count.counts[k]=i->counts[k]+j->counts[k];
1952 if (calc_packed_length(&count,1) <=
1953 i->tree->bytes_packed + j->tree->bytes_packed+
1954 i->tree->tree_pack_length+j->tree->tree_pack_length+
1955 ALLOWED_JOIN_DIFF)
1956 {
1957 memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1958 my_free(j->tree->element_buffer);
1959 j->tree->element_buffer=0;
1960 j->tree=i->tree;
1961 bmove((uchar*) i->counts,(uchar*) count.counts,
1962 sizeof(count.counts[0])*256);
1963 if (make_huff_tree(i->tree,i))
1964 return (uint) -1;
1965 }
1966 }
1967 }
1968 }
1969 }
1970 DBUG_PRINT("info", ("Original trees: %d After join: %d",
1971 trees, tree_number));
1972 if (verbose)
1973 printf("Original trees: %d After join: %d\n", trees, tree_number);
1974 return tree_number; /* Return trees left */
1975 }
1976
1977
1978 /*
1979 Fill in huff_tree encode tables.
1980
1981 SYNOPSIS
1982 make_huff_decode_table()
1983 huff_tree An array of HUFF_TREE which are to be encoded.
1984 trees The number of HUFF_TREE in the array.
1985
1986 RETURN
1987 0 success
1988 != 0 error
1989 */
1990
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1991 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1992 {
1993 uint elements;
1994 for ( ; trees-- ; huff_tree++)
1995 {
1996 if (huff_tree->tree_number > 0)
1997 {
1998 elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1999 if (!(huff_tree->code =
2000 (ulonglong*) my_malloc(elements*
2001 (sizeof(ulonglong) + sizeof(uchar)),
2002 MYF(MY_WME | MY_ZEROFILL))))
2003 return 1;
2004 huff_tree->code_len=(uchar*) (huff_tree->code+elements);
2005 make_traverse_code_tree(huff_tree, huff_tree->root,
2006 8 * sizeof(ulonglong), 0);
2007 }
2008 }
2009 return 0;
2010 }
2011
2012
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)2013 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
2014 HUFF_ELEMENT *element,
2015 uint size, ulonglong code)
2016 {
2017 uint chr;
2018 if (!element->a.leaf.null)
2019 {
2020 chr=element->a.leaf.element_nr;
2021 huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
2022 huff_tree->code[chr]= (code >> size);
2023 if (huff_tree->height < 8 * sizeof(ulonglong) - size)
2024 huff_tree->height= 8 * sizeof(ulonglong) - size;
2025 }
2026 else
2027 {
2028 size--;
2029 make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
2030 make_traverse_code_tree(huff_tree, element->a.nod.right, size,
2031 code + (((ulonglong) 1) << size));
2032 }
2033 return;
2034 }
2035
2036
2037 /*
2038 Convert a value into binary digits.
2039
2040 SYNOPSIS
2041 bindigits()
2042 value The value.
2043 length The number of low order bits to convert.
2044
2045 NOTE
2046 The result string is in static storage. It is reused on every call.
2047 So you cannot use it twice in one expression.
2048
2049 RETURN
2050 A pointer to a static NUL-terminated string.
2051 */
2052
bindigits(ulonglong value,uint bits)2053 static char *bindigits(ulonglong value, uint bits)
2054 {
2055 static char digits[72];
2056 char *ptr= digits;
2057 uint idx= bits;
2058
2059 DBUG_ASSERT(idx < sizeof(digits));
2060 while (idx)
2061 *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
2062 *ptr= '\0';
2063 return digits;
2064 }
2065
2066
2067 /*
2068 Convert a value into hexadecimal digits.
2069
2070 SYNOPSIS
2071 hexdigits()
2072 value The value.
2073
2074 NOTE
2075 The result string is in static storage. It is reused on every call.
2076 So you cannot use it twice in one expression.
2077
2078 RETURN
2079 A pointer to a static NUL-terminated string.
2080 */
2081
hexdigits(ulonglong value)2082 static char *hexdigits(ulonglong value)
2083 {
2084 static char digits[20];
2085 char *ptr= digits;
2086 uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2087
2088 DBUG_ASSERT(idx < sizeof(digits));
2089 while (idx)
2090 {
2091 if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2092 *(ptr - 1)+= 'a' - '9' - 1;
2093 }
2094 *ptr= '\0';
2095 return digits;
2096 }
2097
2098
2099 /* Write header to new packed data file */
2100
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2101 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2102 my_off_t tot_elements,my_off_t filelength)
2103 {
2104 uchar *buff= (uchar*) file_buffer.pos;
2105
2106 bzero(buff,HEAD_LENGTH);
2107 memcpy(buff,maria_pack_file_magic,4);
2108 int4store(buff+4,head_length);
2109 int4store(buff+8, mrg->min_pack_length);
2110 int4store(buff+12,mrg->max_pack_length);
2111 int4store(buff+16,tot_elements);
2112 int4store(buff+20,intervall_length);
2113 int2store(buff+24,trees);
2114 buff[26]=(char) mrg->ref_length;
2115 /* Save record pointer length */
2116 buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
2117 if (test_only)
2118 return 0;
2119 my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2120 return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2121 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2122 }
2123
2124 /* Write fieldinfo to new packed file */
2125
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2126 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2127 {
2128 reg1 uint i;
2129 uint huff_tree_bits;
2130 huff_tree_bits=max_bit(trees ? trees-1 : 0);
2131
2132 DBUG_PRINT("info", (" "));
2133 DBUG_PRINT("info", ("column types:"));
2134 DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2135 DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2136 DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2137 DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2138 DBUG_PRINT("info", ("FIELD_BLOB 4"));
2139 DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2140 DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2141 DBUG_PRINT("info", ("FIELD_ZERO 7"));
2142 DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2143 DBUG_PRINT("info", ("FIELD_CHECK 9"));
2144 DBUG_PRINT("info", (" "));
2145 DBUG_PRINT("info", ("pack type as a set of flags:"));
2146 DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2147 DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2148 DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2149 DBUG_PRINT("info", (" "));
2150 if (verbose >= 2)
2151 {
2152 printf("\n");
2153 printf("column types:\n");
2154 printf("FIELD_NORMAL 0\n");
2155 printf("FIELD_SKIP_ENDSPACE 1\n");
2156 printf("FIELD_SKIP_PRESPACE 2\n");
2157 printf("FIELD_SKIP_ZERO 3\n");
2158 printf("FIELD_BLOB 4\n");
2159 printf("FIELD_CONSTANT 5\n");
2160 printf("FIELD_INTERVALL 6\n");
2161 printf("FIELD_ZERO 7\n");
2162 printf("FIELD_VARCHAR 8\n");
2163 printf("FIELD_CHECK 9\n");
2164 printf("\n");
2165 printf("pack type as a set of flags:\n");
2166 printf("PACK_TYPE_SELECTED 1\n");
2167 printf("PACK_TYPE_SPACE_FIELDS 2\n");
2168 printf("PACK_TYPE_ZERO_FILL 4\n");
2169 printf("\n");
2170 }
2171 for (i=0 ; i++ < fields ; counts++)
2172 {
2173 write_bits((ulonglong) (int) counts->field_type, 5);
2174 write_bits(counts->pack_type,6);
2175 if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2176 write_bits(counts->max_zero_fill,5);
2177 else
2178 write_bits(counts->length_bits,5);
2179 write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2180 DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2181 "lbits: %2u tree: %2u length: %4u",
2182 i , counts->field_type, counts->pack_type,
2183 counts->max_zero_fill, counts->length_bits,
2184 counts->tree->tree_number, counts->field_length));
2185 if (verbose >= 2)
2186 printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2187 "tree: %2u length: %4u\n", i , counts->field_type,
2188 counts->pack_type, counts->max_zero_fill, counts->length_bits,
2189 counts->tree->tree_number, counts->field_length);
2190 }
2191 flush_bits();
2192 return;
2193 }
2194
2195 /* Write all huff_trees to new datafile. Return tot count of
2196 elements in all trees
2197 Returns 0 on error */
2198
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2199 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2200 {
2201 uint i,int_length;
2202 uint tree_no;
2203 uint codes;
2204 uint errors= 0;
2205 uint *packed_tree,*offset,length;
2206 my_off_t elements;
2207
2208 /* Find the highest number of elements in the trees. */
2209 for (i=length=0 ; i < trees ; i++)
2210 if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2211 length=huff_tree[i].elements;
2212 /*
2213 Allocate a buffer for packing a decode tree. Two numbers per element
2214 (left child and right child).
2215 */
2216 if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2217 {
2218 my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2219 return 0;
2220 }
2221
2222 DBUG_PRINT("info", (" "));
2223 if (verbose >= 2)
2224 printf("\n");
2225 tree_no= 0;
2226 intervall_length=0;
2227 for (elements=0; trees-- ; huff_tree++)
2228 {
2229 /* Skip columns that have been joined with other columns. */
2230 if (huff_tree->tree_number == 0)
2231 continue; /* Deleted tree */
2232 tree_no++;
2233 DBUG_PRINT("info", (" "));
2234 if (verbose >= 3)
2235 printf("\n");
2236 /* Count the total number of elements (byte codes or column values). */
2237 elements+=huff_tree->elements;
2238 huff_tree->max_offset=2;
2239 /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2240 if (huff_tree->elements <= 1)
2241 offset=packed_tree;
2242 else
2243 offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2244
2245 /* This should be the same as 'length' above. */
2246 huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2247
2248 /*
2249 Since we check this during collecting the distinct column values,
2250 this should never happen.
2251 */
2252 if (huff_tree->max_offset >= IS_OFFSET)
2253 { /* This should be impossible */
2254 fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2255 huff_tree->max_offset);
2256 my_afree(packed_tree);
2257 return 0;
2258 }
2259
2260 DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
2261 "char_bits: %u\n",
2262 (ulong) (file_buffer.pos - file_buffer.buffer),
2263 huff_tree->elements, (ulong) (offset - packed_tree),
2264 huff_tree->char_bits));
2265 if (!huff_tree->counts->tree_buff)
2266 {
2267 /* We do a uchar compression on this column. Mark with bit 0. */
2268 write_bits(0,1);
2269 write_bits(huff_tree->min_chr,8);
2270 write_bits(huff_tree->elements,9);
2271 write_bits(huff_tree->char_bits,5);
2272 write_bits(huff_tree->offset_bits,5);
2273 int_length=0;
2274 }
2275 else
2276 {
2277 int_length=(uint) (huff_tree->counts->tree_pos -
2278 huff_tree->counts->tree_buff);
2279 /* We have distinct column values for this column. Mark with bit 1. */
2280 write_bits(1,1);
2281 write_bits(huff_tree->elements,15);
2282 write_bits(int_length,16);
2283 write_bits(huff_tree->char_bits,5);
2284 write_bits(huff_tree->offset_bits,5);
2285 intervall_length+=int_length;
2286 }
2287 DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2288 "offset_bits: %2u %s: %5u codelen: %2u",
2289 tree_no, huff_tree->elements, huff_tree->char_bits,
2290 huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2291 "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2292 int_length : huff_tree->min_chr, huff_tree->height));
2293 if (verbose >= 2)
2294 printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2295 "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2296 huff_tree->char_bits, huff_tree->offset_bits,
2297 huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2298 huff_tree->counts->tree_buff ? int_length :
2299 huff_tree->min_chr, huff_tree->height);
2300
2301 /* Check that the code tree length matches the element count. */
2302 length=(uint) (offset-packed_tree);
2303 if (length != huff_tree->elements*2-2)
2304 {
2305 fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2306 length, huff_tree->elements * 2 - 2);
2307 errors++;
2308 break;
2309 }
2310
2311 for (i=0 ; i < length ; i++)
2312 {
2313 if (packed_tree[i] & IS_OFFSET)
2314 write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2315 huff_tree->offset_bits+1);
2316 else
2317 write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2318 DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2319 i, (packed_tree[i] & IS_OFFSET) ?
2320 " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2321 packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2322 if (verbose >= 3)
2323 printf("tree[0x%04x]: %s0x%04x\n",
2324 i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2325 (packed_tree[i] & IS_OFFSET) ?
2326 packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2327 }
2328 flush_bits();
2329
2330 /*
2331 Display coding tables and check their correctness.
2332 */
2333 codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2334 for (i= 0; i < codes; i++)
2335 {
2336 ulonglong code;
2337 uint bits;
2338 uint len;
2339 uint idx;
2340
2341 if (! (len= huff_tree->code_len[i]))
2342 continue;
2343 DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2344 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2345 bindigits(huff_tree->code[i],
2346 huff_tree->code_len[i])));
2347 if (verbose >= 3)
2348 printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2349 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2350 bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2351
2352 /* Check that the encode table decodes correctly. */
2353 code= 0;
2354 bits= 0;
2355 idx= 0;
2356 DBUG_EXECUTE_IF("forcechkerr1", len--;);
2357 DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2358 DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2359 for (;;)
2360 {
2361 if (! len)
2362 {
2363 fflush(stdout);
2364 fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2365 hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2366 errors++;
2367 break;
2368 }
2369 code<<= 1;
2370 code|= (huff_tree->code[i] >> (--len)) & 1;
2371 bits++;
2372 if (bits > 8 * sizeof(code))
2373 {
2374 fflush(stdout);
2375 fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2376 bits, (uint) (8 * sizeof(code)));
2377 errors++;
2378 break;
2379 }
2380 idx+= (uint) code & 1;
2381 if (idx >= length)
2382 {
2383 fflush(stdout);
2384 fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
2385 errors++;
2386 break;
2387 }
2388 if (packed_tree[idx] & IS_OFFSET)
2389 idx+= packed_tree[idx] & ~IS_OFFSET;
2390 else
2391 break; /* Hit a leaf. This contains the result value. */
2392 }
2393 if (errors)
2394 break;
2395
2396 DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2397 if (packed_tree[idx] != i)
2398 {
2399 fflush(stdout);
2400 fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2401 packed_tree[idx], i);
2402 errors++;
2403 break;
2404 }
2405 } /*end for (codes)*/
2406 if (errors)
2407 break;
2408
2409 /* Write column values in case of distinct column value compression. */
2410 if (huff_tree->counts->tree_buff)
2411 {
2412 for (i=0 ; i < int_length ; i++)
2413 {
2414 write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2415 DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2416 i, (uchar) huff_tree->counts->tree_buff[i]));
2417 if (verbose >= 3)
2418 printf("column_values[0x%04x]: 0x%02x\n",
2419 i, (uchar) huff_tree->counts->tree_buff[i]);
2420 }
2421 }
2422 flush_bits();
2423 }
2424 DBUG_PRINT("info", (" "));
2425 if (verbose >= 2)
2426 printf("\n");
2427 my_afree(packed_tree);
2428 if (errors)
2429 {
2430 fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2431 return 0;
2432 }
2433 return elements;
2434 }
2435
2436
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2437 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2438 uint *offset)
2439 {
2440 uint *prev_offset;
2441
2442 prev_offset= offset;
2443 /*
2444 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2445 then there is no left child and, hence no right child either. This
2446 is a property of a binary tree. An element is either a node with two
2447 childs, or a leaf without childs.
2448
2449 The current element is always a node with two childs. Go left first.
2450 */
2451 if (!element->a.nod.left->a.leaf.null)
2452 {
2453 /* Store the uchar code or the index of the column value. */
2454 prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2455 offset+=2;
2456 }
2457 else
2458 {
2459 /*
2460 Recursively traverse the tree to the left. Mark it as an offset to
2461 another tree node (in contrast to a uchar code or column value index).
2462 */
2463 prev_offset[0]= IS_OFFSET+2;
2464 offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2465 }
2466
2467 /* Now, check the right child. */
2468 if (!element->a.nod.right->a.leaf.null)
2469 {
2470 /* Store the uchar code or the index of the column value. */
2471 prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2472 return offset;
2473 }
2474 else
2475 {
2476 /*
2477 Recursively traverse the tree to the right. Mark it as an offset to
2478 another tree node (in contrast to a uchar code or column value index).
2479 */
2480 uint temp=(uint) (offset-prev_offset-1);
2481 prev_offset[1]= IS_OFFSET+ temp;
2482 if (huff_tree->max_offset < temp)
2483 huff_tree->max_offset = temp;
2484 return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2485 }
2486 }
2487
2488 /* Get number of bits neaded to represent value */
2489
max_bit(register uint value)2490 static uint max_bit(register uint value)
2491 {
2492 reg2 uint power=1;
2493
2494 while ((value>>=1))
2495 power++;
2496 return (power);
2497 }
2498
2499
compress_maria_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2500 static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2501 {
2502 int error;
2503 uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
2504 uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
2505 my_off_t record_count;
2506 char llbuf[32];
2507 ulong length,pack_length;
2508 uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2509 HUFF_COUNTS *count,*end_count;
2510 HUFF_TREE *tree;
2511 MARIA_HA *isam_file=mrg->file[0];
2512 uint pack_version= (uint) isam_file->s->pack.version;
2513 DBUG_ENTER("compress_maria_file");
2514
2515 /* Allocate a buffer for the records (excluding blobs). */
2516 if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
2517 return -1;
2518
2519 end_count=huff_counts+isam_file->s->base.fields;
2520 min_record_length= (uint) ~0;
2521 max_record_length=0;
2522 null_bytes= isam_file->s->base.null_bytes;
2523
2524 /*
2525 Calculate the maximum number of bits required to pack the records.
2526 Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2527 The tree height determines the maximum number of bits per value.
2528 Some fields skip leading or trailing spaces or zeroes. The skipped
2529 number of bytes is encoded by 'length_bits' bits.
2530 Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2531 and varchar get a leading 0 bit.
2532 */
2533 max_calc_length= null_bytes;
2534 for (i= 0 ; i < isam_file->s->base.fields ; i++)
2535 {
2536 if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2537 huff_counts[i].max_zero_fill=0;
2538 if (huff_counts[i].field_type == FIELD_CONSTANT ||
2539 huff_counts[i].field_type == FIELD_ZERO ||
2540 huff_counts[i].field_type == FIELD_CHECK)
2541 continue;
2542 if (huff_counts[i].field_type == FIELD_INTERVALL)
2543 max_calc_length+=huff_counts[i].tree->height;
2544 else if (huff_counts[i].field_type == FIELD_BLOB ||
2545 huff_counts[i].field_type == FIELD_VARCHAR)
2546 max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2547 else
2548 max_calc_length+=
2549 (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2550 huff_counts[i].tree->height+huff_counts[i].length_bits;
2551 }
2552 max_calc_length= (max_calc_length + 7) / 8;
2553 pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
2554 record_count=0;
2555 /* 'max_blob_length' is the max length of all blobs of a record. */
2556 pack_blob_length= isam_file->s->base.blobs ?
2557 _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2558 max_pack_length=pack_ref_length+pack_blob_length;
2559
2560 DBUG_PRINT("fields", ("==="));
2561 mrg_reset(mrg);
2562 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2563 {
2564 ulong tot_blob_length=0;
2565 if (! error)
2566 {
2567 if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
2568 null_bytes))
2569 break;
2570 record_pos= file_buffer.pos;
2571 file_buffer.pos+= max_pack_length;
2572 if (null_bytes)
2573 {
2574 /* Copy null bits 'as is' */
2575 memcpy(file_buffer.pos, record, null_bytes);
2576 file_buffer.pos+= null_bytes;
2577 }
2578 for (start_pos=record+null_bytes, count= huff_counts;
2579 count < end_count ;
2580 count++)
2581 {
2582 end_pos=start_pos+(field_length=count->field_length);
2583 tree=count->tree;
2584
2585 DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2586 "lbits: %2u tree: %2u length: %4u",
2587 (ulong) (count - huff_counts + 1),
2588 count->field_type,
2589 count->pack_type, count->max_zero_fill,
2590 count->length_bits, count->tree->tree_number,
2591 count->field_length));
2592
2593 /* Check if the column contains spaces only. */
2594 if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2595 {
2596 for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2597 if (pos == end_pos)
2598 {
2599 DBUG_PRINT("fields",
2600 ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2601 DBUG_PRINT("fields", ("---"));
2602 write_bits(1,1);
2603 start_pos=end_pos;
2604 continue;
2605 }
2606 DBUG_PRINT("fields",
2607 ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2608 write_bits(0,1);
2609 }
2610 end_pos-=count->max_zero_fill;
2611 field_length-=count->max_zero_fill;
2612
2613 switch (count->field_type) {
2614 case FIELD_SKIP_ZERO:
2615 if (!memcmp(start_pos, zero_string, field_length))
2616 {
2617 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2618 write_bits(1,1);
2619 start_pos=end_pos;
2620 break;
2621 }
2622 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2623 write_bits(0,1);
2624 /* Fall through */
2625 case FIELD_NORMAL:
2626 DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2627 (ulong) (end_pos - start_pos)));
2628 for ( ; start_pos < end_pos ; start_pos++)
2629 {
2630 DBUG_PRINT("fields",
2631 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2632 (uchar) *start_pos,
2633 hexdigits(tree->code[(uchar) *start_pos]),
2634 (uint) tree->code_len[(uchar) *start_pos],
2635 bindigits(tree->code[(uchar) *start_pos],
2636 (uint) tree->code_len[(uchar) *start_pos])));
2637 write_bits(tree->code[(uchar) *start_pos],
2638 (uint) tree->code_len[(uchar) *start_pos]);
2639 }
2640 break;
2641 case FIELD_SKIP_ENDSPACE:
2642 for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2643 length= (ulong) (end_pos - pos);
2644 if (count->pack_type & PACK_TYPE_SELECTED)
2645 {
2646 if (length > count->min_space)
2647 {
2648 DBUG_PRINT("fields",
2649 ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2650 DBUG_PRINT("fields",
2651 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2652 length, field_length, count->length_bits));
2653 write_bits(1,1);
2654 write_bits(length,count->length_bits);
2655 }
2656 else
2657 {
2658 DBUG_PRINT("fields",
2659 ("FIELD_SKIP_ENDSPACE not more than min_space, "
2660 "bits: 1"));
2661 write_bits(0,1);
2662 pos=end_pos;
2663 }
2664 }
2665 else
2666 {
2667 DBUG_PRINT("fields",
2668 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2669 length, field_length, count->length_bits));
2670 write_bits(length,count->length_bits);
2671 }
2672 /* Encode all significant bytes. */
2673 DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2674 (ulong) (pos - start_pos)));
2675 for ( ; start_pos < pos ; start_pos++)
2676 {
2677 DBUG_PRINT("fields",
2678 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2679 (uchar) *start_pos,
2680 hexdigits(tree->code[(uchar) *start_pos]),
2681 (uint) tree->code_len[(uchar) *start_pos],
2682 bindigits(tree->code[(uchar) *start_pos],
2683 (uint) tree->code_len[(uchar) *start_pos])));
2684 write_bits(tree->code[(uchar) *start_pos],
2685 (uint) tree->code_len[(uchar) *start_pos]);
2686 }
2687 start_pos=end_pos;
2688 break;
2689 case FIELD_SKIP_PRESPACE:
2690 for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2691 length= (ulong) (pos - start_pos);
2692 if (count->pack_type & PACK_TYPE_SELECTED)
2693 {
2694 if (length > count->min_space)
2695 {
2696 DBUG_PRINT("fields",
2697 ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2698 DBUG_PRINT("fields",
2699 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2700 length, field_length, count->length_bits));
2701 write_bits(1,1);
2702 write_bits(length,count->length_bits);
2703 }
2704 else
2705 {
2706 DBUG_PRINT("fields",
2707 ("FIELD_SKIP_PRESPACE not more than min_space, "
2708 "bits: 1"));
2709 pos=start_pos;
2710 write_bits(0,1);
2711 }
2712 }
2713 else
2714 {
2715 DBUG_PRINT("fields",
2716 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2717 length, field_length, count->length_bits));
2718 write_bits(length,count->length_bits);
2719 }
2720 /* Encode all significant bytes. */
2721 DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2722 (ulong) (end_pos - start_pos)));
2723 for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2724 {
2725 DBUG_PRINT("fields",
2726 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2727 (uchar) *start_pos,
2728 hexdigits(tree->code[(uchar) *start_pos]),
2729 (uint) tree->code_len[(uchar) *start_pos],
2730 bindigits(tree->code[(uchar) *start_pos],
2731 (uint) tree->code_len[(uchar) *start_pos])));
2732 write_bits(tree->code[(uchar) *start_pos],
2733 (uint) tree->code_len[(uchar) *start_pos]);
2734 }
2735 break;
2736 case FIELD_CONSTANT:
2737 case FIELD_ZERO:
2738 case FIELD_CHECK:
2739 DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2740 start_pos=end_pos;
2741 break;
2742 case FIELD_INTERVALL:
2743 global_count=count;
2744 pos=(uchar*) tree_search(&count->int_tree, start_pos,
2745 count->int_tree.custom_arg);
2746 intervall=(uint) (pos - count->tree_buff)/field_length;
2747 DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2748 DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2749 intervall, hexdigits(tree->code[intervall]),
2750 (uint) tree->code_len[intervall]));
2751 write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2752 start_pos=end_pos;
2753 break;
2754 case FIELD_BLOB:
2755 {
2756 ulong blob_length= _ma_calc_blob_length(field_length-
2757 portable_sizeof_char_ptr,
2758 start_pos);
2759 /* Empty blobs are encoded with a single 1 bit. */
2760 if (!blob_length)
2761 {
2762 DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2763 write_bits(1,1);
2764 }
2765 else
2766 {
2767 uchar *blob,*blob_end;
2768 DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2769 write_bits(0,1);
2770 /* Write the blob length. */
2771 DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2772 blob_length, count->length_bits));
2773 write_bits(blob_length,count->length_bits);
2774 memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
2775 blob_end=blob+blob_length;
2776 /* Encode the blob bytes. */
2777 for ( ; blob < blob_end ; blob++)
2778 {
2779 DBUG_PRINT("fields",
2780 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2781 (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2782 (uint) tree->code_len[(uchar) *blob],
2783 bindigits(tree->code[(uchar) *start_pos],
2784 (uint)tree->code_len[(uchar) *start_pos])));
2785 write_bits(tree->code[(uchar) *blob],
2786 (uint) tree->code_len[(uchar) *blob]);
2787 }
2788 tot_blob_length+=blob_length;
2789 }
2790 start_pos= end_pos;
2791 break;
2792 }
2793 case FIELD_VARCHAR:
2794 {
2795 uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2796 ulong col_length= (var_pack_length == 1 ?
2797 (uint) *(uchar*) start_pos :
2798 uint2korr(start_pos));
2799 /* Empty varchar are encoded with a single 1 bit. */
2800 if (!col_length)
2801 {
2802 DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2803 write_bits(1,1); /* Empty varchar */
2804 }
2805 else
2806 {
2807 uchar *end= start_pos + var_pack_length + col_length;
2808 DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2809 write_bits(0,1);
2810 /* Write the varchar length. */
2811 DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2812 col_length, count->length_bits));
2813 write_bits(col_length,count->length_bits);
2814 /* Encode the varchar bytes. */
2815 for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2816 {
2817 DBUG_PRINT("fields",
2818 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2819 (uchar) *start_pos,
2820 hexdigits(tree->code[(uchar) *start_pos]),
2821 (uint) tree->code_len[(uchar) *start_pos],
2822 bindigits(tree->code[(uchar) *start_pos],
2823 (uint)tree->code_len[(uchar) *start_pos])));
2824 write_bits(tree->code[(uchar) *start_pos],
2825 (uint) tree->code_len[(uchar) *start_pos]);
2826 }
2827 }
2828 start_pos= end_pos;
2829 break;
2830 }
2831 case FIELD_LAST:
2832 case FIELD_enum_val_count:
2833 abort(); /* Impossible */
2834 }
2835 start_pos+=count->max_zero_fill;
2836 DBUG_PRINT("fields", ("---"));
2837 }
2838 flush_bits();
2839 length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
2840 pack_length= _ma_save_pack_length(pack_version, record_pos, length);
2841 if (pack_blob_length)
2842 pack_length+= _ma_save_pack_length(pack_version,
2843 record_pos + pack_length,
2844 tot_blob_length);
2845 DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2846 "length-bytes: %lu", (ulong) record_count, length,
2847 tot_blob_length, pack_length));
2848 DBUG_PRINT("fields", ("==="));
2849
2850 /* Correct file buffer if the header was smaller */
2851 if (pack_length != max_pack_length)
2852 {
2853 bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2854 file_buffer.pos-= (max_pack_length-pack_length);
2855 }
2856 if (length < (ulong) min_record_length)
2857 min_record_length=(uint) length;
2858 if (length > (ulong) max_record_length)
2859 max_record_length=(uint) length;
2860 record_count++;
2861 if (write_loop && record_count % WRITE_COUNT == 0)
2862 {
2863 printf("%lu\r", (ulong) record_count);
2864 fflush(stdout);
2865 }
2866 }
2867 else if (error != HA_ERR_RECORD_DELETED)
2868 break;
2869 }
2870 if (error == HA_ERR_END_OF_FILE)
2871 error=0;
2872 else
2873 {
2874 fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
2875 }
2876 if (verbose >= 2)
2877 printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2878
2879 my_safe_afree(record, isam_file->s->base.reclength);
2880 mrg->ref_length=max_pack_length;
2881 mrg->min_pack_length=max_record_length ? min_record_length : 0;
2882 mrg->max_pack_length=max_record_length;
2883 DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2884 }
2885
2886
make_new_name(char * new_name,char * old_name)2887 static char *make_new_name(char *new_name, char *old_name)
2888 {
2889 return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2890 }
2891
make_old_name(char * new_name,char * old_name)2892 static char *make_old_name(char *new_name, char *old_name)
2893 {
2894 return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2895 }
2896
2897 /* rutines for bit writing buffer */
2898
init_file_buffer(File file,pbool read_buffer)2899 static void init_file_buffer(File file, pbool read_buffer)
2900 {
2901 file_buffer.file=file;
2902 file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2903 MYF(MY_WME));
2904 file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2905 file_buffer.pos_in_file=0;
2906 error_on_write=0;
2907 if (read_buffer)
2908 {
2909
2910 file_buffer.pos=file_buffer.end;
2911 file_buffer.bits=0;
2912 }
2913 else
2914 {
2915 file_buffer.pos=file_buffer.buffer;
2916 file_buffer.bits=BITS_SAVED;
2917 }
2918 file_buffer.bitbucket= 0;
2919 }
2920
2921
flush_buffer(ulong neaded_length)2922 static int flush_buffer(ulong neaded_length)
2923 {
2924 ulong length;
2925
2926 /*
2927 file_buffer.end is 8 bytes lower than the real end of the buffer.
2928 This is done so that the end-of-buffer condition does not need to be
2929 checked for every uchar (see write_bits()). Consequently,
2930 file_buffer.pos can become greater than file_buffer.end. The
2931 algorithms in the other functions ensure that there will never be
2932 more than 8 bytes written to the buffer without an end-of-buffer
2933 check. So the buffer cannot be overrun. But we need to check for the
2934 near-to-buffer-end condition to avoid a negative result, which is
2935 casted to unsigned and thus becomes giant.
2936 */
2937 if ((file_buffer.pos < file_buffer.end) &&
2938 ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2939 return 0;
2940 length=(ulong) (file_buffer.pos-file_buffer.buffer);
2941 file_buffer.pos=file_buffer.buffer;
2942 file_buffer.pos_in_file+=length;
2943 if (test_only)
2944 return 0;
2945 if (error_on_write|| my_write(file_buffer.file,
2946 (const uchar*) file_buffer.buffer,
2947 length,
2948 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2949 {
2950 error_on_write=1;
2951 return 1;
2952 }
2953
2954 if (neaded_length != ~(ulong) 0 &&
2955 (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2956 {
2957 uchar *tmp;
2958 neaded_length+=256; /* some margin */
2959 tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
2960 if (!tmp)
2961 return 1;
2962 file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
2963 file_buffer.buffer= tmp;
2964 file_buffer.end= (tmp+neaded_length-8);
2965 }
2966 return 0;
2967 }
2968
2969
end_file_buffer(void)2970 static void end_file_buffer(void)
2971 {
2972 my_free(file_buffer.buffer);
2973 }
2974
2975 /* output `bits` low bits of `value' */
2976
write_bits(register ulonglong value,register uint bits)2977 static void write_bits(register ulonglong value, register uint bits)
2978 {
2979 DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2980 (bits == 8 * sizeof(value)));
2981
2982 if ((file_buffer.bits-= (int) bits) >= 0)
2983 {
2984 file_buffer.bitbucket|= value << file_buffer.bits;
2985 }
2986 else
2987 {
2988 reg3 ulonglong bit_buffer;
2989 bits= (uint) -file_buffer.bits;
2990 bit_buffer= (file_buffer.bitbucket |
2991 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2992 #if BITS_SAVED == 64
2993 *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2994 *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2995 *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2996 *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2997 #endif
2998 *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2999 *file_buffer.pos++= (uchar) (bit_buffer >> 16);
3000 *file_buffer.pos++= (uchar) (bit_buffer >> 8);
3001 *file_buffer.pos++= (uchar) (bit_buffer);
3002
3003 if (bits != 8 * sizeof(value))
3004 value&= (((ulonglong) 1) << bits) - 1;
3005 if (file_buffer.pos >= file_buffer.end)
3006 flush_buffer(~ (ulong) 0);
3007 file_buffer.bits=(int) (BITS_SAVED - bits);
3008 file_buffer.bitbucket= value << (BITS_SAVED - bits);
3009 }
3010 return;
3011 }
3012
3013 /* Flush bits in bit_buffer to buffer */
3014
flush_bits(void)3015 static void flush_bits(void)
3016 {
3017 int bits;
3018 ulonglong bit_buffer;
3019
3020 bits= file_buffer.bits & ~7;
3021 bit_buffer= file_buffer.bitbucket >> bits;
3022 bits= BITS_SAVED - bits;
3023 while (bits > 0)
3024 {
3025 bits-= 8;
3026 *file_buffer.pos++= (uchar) (bit_buffer >> bits);
3027 }
3028 if (file_buffer.pos >= file_buffer.end)
3029 flush_buffer(~ (ulong) 0);
3030 file_buffer.bits= BITS_SAVED;
3031 file_buffer.bitbucket= 0;
3032 }
3033
3034
3035 /****************************************************************************
3036 ** functions to handle the joined files
3037 ****************************************************************************/
3038
save_state(MARIA_HA * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3039 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
3040 my_off_t new_length,
3041 ha_checksum crc)
3042 {
3043 MARIA_SHARE *share=isam_file->s;
3044 uint options=mi_uint2korr(share->state.header.options);
3045 uint key;
3046 DBUG_ENTER("save_state");
3047
3048 options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
3049 mi_int2store(share->state.header.options,options);
3050 /* Save the original file type of we have to undo the packing later */
3051 share->state.header.org_data_file_type= share->state.header.data_file_type;
3052 share->state.header.data_file_type= COMPRESSED_RECORD;
3053
3054 share->state.state.data_file_length=new_length;
3055 share->state.state.del=0;
3056 share->state.state.empty=0;
3057 share->state.dellink= HA_OFFSET_ERROR;
3058 share->state.split=(ha_rows) mrg->records;
3059 share->state.version=(ulong) time((time_t*) 0);
3060 if (share->base.born_transactional)
3061 share->state.create_rename_lsn= share->state.is_of_horizon=
3062 share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
3063 if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
3064 {
3065 /*
3066 Some indexes are disabled, cannot use current key_file_length value
3067 as an estimate of upper bound of index file size. Use packed data file
3068 size instead.
3069 */
3070 share->state.state.key_file_length= new_length;
3071 }
3072 /*
3073 If there are no disabled indexes, keep key_file_length value from
3074 original file so "aria_chk -rq" can use this value (this is necessary
3075 because index size cannot be easily calculated for fulltext keys)
3076 */
3077 maria_clear_all_keys_active(share->state.key_map);
3078 for (key=0 ; key < share->base.keys ; key++)
3079 share->state.key_root[key]= HA_OFFSET_ERROR;
3080 share->state.key_del= HA_OFFSET_ERROR;
3081 share->state.state.checksum= crc; /* Save crc in file */
3082 share->changed=1; /* Force write of header */
3083 share->state.open_count=0;
3084 share->global_changed=0;
3085 my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
3086 if (share->base.keys)
3087 isamchk_neaded=1;
3088 DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
3089 &share->state,
3090 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3091 MA_STATE_INFO_WRITE_FULL_INFO));
3092 }
3093
3094
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3095 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3096 ha_checksum crc)
3097 {
3098 MARIA_STATE_INFO state;
3099 MARIA_HA *isam_file=mrg->file[0];
3100 uint options;
3101 DBUG_ENTER("save_state_mrg");
3102
3103 state= isam_file->s->state;
3104 options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3105 HA_OPTION_READ_ONLY_DATA);
3106 mi_int2store(state.header.options,options);
3107 /* Save the original file type of we have to undo the packing later */
3108 state.header.org_data_file_type= state.header.data_file_type;
3109 state.header.data_file_type= COMPRESSED_RECORD;
3110
3111 state.state.data_file_length=new_length;
3112 state.state.del=0;
3113 state.state.empty=0;
3114 state.state.records=state.split=(ha_rows) mrg->records;
3115 state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
3116 LSN_NEEDS_NEW_STATE_LSNS;
3117
3118 /* See comment above in save_state about key_file_length handling. */
3119 if (mrg->src_file_has_indexes_disabled)
3120 {
3121 isam_file->s->state.state.key_file_length=
3122 MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3123 }
3124 state.dellink= HA_OFFSET_ERROR;
3125 state.version=(ulong) time((time_t*) 0);
3126 maria_clear_all_keys_active(state.key_map);
3127 state.state.checksum=crc;
3128 if (isam_file->s->base.keys)
3129 isamchk_neaded=1;
3130 state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3131 DBUG_RETURN (_ma_state_info_write_sub(file, &state,
3132 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3133 MA_STATE_INFO_WRITE_FULL_INFO));
3134 }
3135
3136
3137 /* reset for mrg_rrnd */
3138
mrg_reset(PACK_MRG_INFO * mrg)3139 static void mrg_reset(PACK_MRG_INFO *mrg)
3140 {
3141 if (mrg->current)
3142 {
3143 maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3144 mrg->current=0;
3145 }
3146 }
3147
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3148 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3149 {
3150 int error;
3151 MARIA_HA *isam_info;
3152
3153 if (!info->current)
3154 {
3155 isam_info= *(info->current=info->file);
3156 info->end=info->current+info->count;
3157 maria_reset(isam_info);
3158 maria_extra(isam_info, HA_EXTRA_CACHE, 0);
3159 if ((error= maria_scan_init(isam_info)))
3160 return(error);
3161 }
3162 else
3163 isam_info= *info->current;
3164
3165 for (;;)
3166 {
3167 if (!(error= maria_scan(isam_info, buf)) ||
3168 error != HA_ERR_END_OF_FILE)
3169 return (error);
3170 maria_scan_end(isam_info);
3171 maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3172 if (info->current+1 == info->end)
3173 return(HA_ERR_END_OF_FILE);
3174 info->current++;
3175 isam_info= *info->current;
3176 maria_reset(isam_info);
3177 maria_extra(isam_info,HA_EXTRA_CACHE, 0);
3178 if ((error= maria_scan_init(isam_info)))
3179 return(error);
3180 }
3181 }
3182
3183
mrg_close(PACK_MRG_INFO * mrg)3184 static int mrg_close(PACK_MRG_INFO *mrg)
3185 {
3186 uint i;
3187 int error=0;
3188 DBUG_ENTER("mrg_close");
3189
3190 for (i=0 ; i < mrg->count ; i++)
3191 error|=maria_close(mrg->file[i]);
3192 if (mrg->free_file)
3193 my_free(mrg->file);
3194 DBUG_RETURN(error);
3195 }
3196
3197
3198 #if !defined(DBUG_OFF)
3199 /*
3200 Fake the counts to get big Huffman codes.
3201
3202 SYNOPSIS
3203 fakebigcodes()
3204 huff_counts A pointer to the counts array.
3205 end_count A pointer past the counts array.
3206
3207 DESCRIPTION
3208
3209 Huffman coding works by removing the two least frequent values from
3210 the list of values and add a new value with the sum of their
3211 incidences in a loop until only one value is left. Every time a
3212 value is reused for a new value, it gets one more bit for its
3213 encoding. Hence, the least frequent values get the longest codes.
3214
3215 To get a maximum code length for a value, two of the values must
3216 have an incidence of 1. As their sum is 2, the next infrequent value
3217 must have at least an incidence of 2, then 4, 8, 16 and so on. This
3218 means that one needs 2**n bytes (values) for a code length of n
3219 bits. However, using more distinct values forces the use of longer
3220 codes, or reaching the code length with less total bytes (values).
3221
3222 To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3223 I assign counts of 1 to the two most frequent values, a count of 2
3224 for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3225 the remaining values get 1. That way every possible uchar has an
3226 assigned code, though not all codes are used if not all uchar values
3227 are present in the column.
3228
3229 This strategy would work with distinct column values too, but
3230 requires that at least 64(32) values are present. To make things
3231 easier here, I cancel all distinct column values and force byte
3232 compression for all columns.
3233
3234 RETURN
3235 void
3236 */
3237
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3238 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3239 {
3240 HUFF_COUNTS *count;
3241 my_off_t *cur_count_p;
3242 my_off_t *end_count_p;
3243 my_off_t **cur_sort_p;
3244 my_off_t **end_sort_p;
3245 my_off_t *sort_counts[256];
3246 my_off_t total;
3247 DBUG_ENTER("fakebigcodes");
3248
3249 for (count= huff_counts; count < end_count; count++)
3250 {
3251 /*
3252 Remove distinct column values.
3253 */
3254 if (huff_counts->tree_buff)
3255 {
3256 my_free(huff_counts->tree_buff);
3257 delete_tree(&huff_counts->int_tree, 0);
3258 huff_counts->tree_buff= NULL;
3259 DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3260 }
3261
3262 /*
3263 Sort counts by decreasing incidence.
3264 */
3265 cur_count_p= count->counts;
3266 end_count_p= cur_count_p + 256;
3267 cur_sort_p= sort_counts;
3268 while (cur_count_p < end_count_p)
3269 *(cur_sort_p++)= cur_count_p++;
3270 (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3271
3272 /*
3273 Assign faked counts.
3274 */
3275 cur_sort_p= sort_counts;
3276 #if SIZEOF_LONG_LONG > 4
3277 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3278 #else
3279 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3280 #endif
3281 /* Most frequent value gets a faked count of 1. */
3282 **(cur_sort_p++)= 1;
3283 total= 1;
3284 while (cur_sort_p < end_sort_p)
3285 {
3286 **(cur_sort_p++)= total;
3287 total<<= 1;
3288 }
3289 /* Set the last value. */
3290 **(cur_sort_p++)= --total;
3291 /*
3292 Set the remaining counts.
3293 */
3294 end_sort_p= sort_counts + 256;
3295 while (cur_sort_p < end_sort_p)
3296 **(cur_sort_p++)= 1;
3297 }
3298 DBUG_VOID_RETURN;
3299 }
3300
3301
3302 /*
3303 Compare two counts for reverse sorting.
3304
3305 SYNOPSIS
3306 fakecmp()
3307 count1 One count.
3308 count2 Another count.
3309
3310 RETURN
3311 1 count1 < count2
3312 0 count1 == count2
3313 -1 count1 > count2
3314 */
3315
fakecmp(my_off_t ** count1,my_off_t ** count2)3316 static int fakecmp(my_off_t **count1, my_off_t **count2)
3317 {
3318 return ((**count1 < **count2) ? 1 :
3319 (**count1 > **count2) ? -1 : 0);
3320 }
3321 #endif
3322
3323 #include "ma_check_standalone.h"
3324
3325