1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15
16 /* Pack MARIA file */
17
18 #ifndef USE_MY_FUNC
19 #define USE_MY_FUNC /* We need at least my_malloc */
20 #endif
21
22 #include "maria_def.h"
23 #include "trnman_public.h"
24 #include "trnman.h"
25 #include <queues.h>
26 #include <my_tree.h>
27 #include "mysys_err.h"
28 #ifdef MSDOS
29 #include <io.h>
30 #endif
31 #ifndef __GNU_LIBRARY__
32 #define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
33 #endif
34 #include <my_getopt.h>
35 #include <my_handler_errors.h>
36
37 #if SIZEOF_LONG_LONG > 4
38 #define BITS_SAVED 64
39 #else
40 #define BITS_SAVED 32
41 #endif
42 #ifndef MAX_INTERNAL_TRID
43 #define MAX_INTERNAL_TRID 0xffffffffffffLL
44 #endif
45
46 #define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
47 #define HEAD_LENGTH 32
48 #define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
49
50 #define DATA_TMP_EXT ".TMD"
51 #define OLD_EXT ".OLD"
52 #define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
53
/*
  Buffered bit-output state for the compressed data file.
  Used by init_file_buffer()/write_bits()/flush_bits()/end_file_buffer().
*/
struct st_file_buffer {
  File file;                       /* Destination file descriptor */
  uchar *buffer,*pos,*end;         /* Buffer start, current write pos, end */
  my_off_t pos_in_file;            /* File offset that buffer[0] maps to */
  int bits;                        /* Bit-accumulator state; semantics defined by write_bits() */
  ulonglong bitbucket;             /* Pending bits not yet flushed to 'buffer' */
};
61
62 struct st_huff_tree;
63 struct st_huff_element;
64
/*
  Per-column statistics gathered while scanning the source file(s).
  One HUFF_COUNTS per column; used to decide the pack type and to
  build the column's Huffman tree.
*/
typedef struct st_huff_counts {
  uint field_length,max_zero_fill;
  uint pack_type;                  /* Combination of PACK_TYPE_* style flags */
  uint max_end_space,max_pre_space,length_bits,min_space;
  ulong max_length;
  enum en_fieldtype field_type;
  struct st_huff_tree *tree;		/* Tree for field */
  my_off_t counts[256];            /* Occurrence count for each byte value */
  my_off_t end_space[8];           /* Histogram of trailing-space run lengths */
  my_off_t pre_space[8];           /* Histogram of leading-space run lengths */
  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
  TREE int_tree;        /* Tree for detecting distinct column values. */
  uchar *tree_buff;     /* Column values, 'field_length' each. */
  uchar *tree_pos;      /* Points to end of column values in 'tree_buff'. */
} HUFF_COUNTS;
80
81 typedef struct st_huff_element HUFF_ELEMENT;
82
83 /*
84 WARNING: It is crucial for the optimizations in calc_packed_length()
85 that 'count' is the first element of 'HUFF_ELEMENT'.
86 */
/*
  WARNING: It is crucial for the optimizations in calc_packed_length()
  that 'count' is the first element of 'HUFF_ELEMENT'.
*/
struct st_huff_element {
  my_off_t count;                  /* Occurrence count / combined weight */
  union un_element {
    /* Internal node: two children */
    struct st_nod {
      HUFF_ELEMENT *left,*right;
    } nod;
    /* Leaf: a byte value or distinct column value */
    struct st_leaf {
      HUFF_ELEMENT *null;          /* NULL marks this element as a leaf */
      uint element_nr;		/* Number of element */
    } leaf;
  } a;
};
99
100
/*
  One Huffman tree. A tree may be shared by several columns after
  join_same_trees() has merged similar trees.
*/
typedef struct st_huff_tree {
  HUFF_ELEMENT *root,*element_buffer;  /* Tree root and backing element array */
  HUFF_COUNTS *counts;             /* Statistics this tree was built from */
  uint tree_number;                /* Number of the tree in the compressed header */
  uint elements;                   /* Number of leaf elements */
  my_off_t bytes_packed;           /* Predicted packed size using this tree */
  uint tree_pack_length;
  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
  ulonglong *code;                 /* Huffman code per element */
  uchar *code_len;                 /* Code length in bits per element */
} HUFF_TREE;
112
113
/*
  Describes the set of source tables being compressed (one table, or
  several tables being joined into one with --join).
*/
typedef struct st_isam_mrg {
  MARIA_HA **file,**current,**end;  /* All source handlers and scan cursor */
  uint free_file;                  /* Non-zero if 'file' array was malloc'ed here */
  uint count;                      /* Number of source files */
  uint	min_pack_length;		/* These are used by packed data */
  uint	max_pack_length;
  uint	ref_length;
  uint  max_blob_length;
  my_off_t records;                /* Total records over all source files */
  /* true if at least one source file has at least one disabled index */
  my_bool src_file_has_indexes_disabled;
} PACK_MRG_INFO;
126
127
/* Function prototypes, roughly in order of use during compression */

extern int main(int argc,char * *argv);
static void get_options(int *argc,char ***argv);
static MARIA_HA *open_maria_file(char *name,int mode);
static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
static int compress(PACK_MRG_INFO *file,char *join_name);
static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
					   uint trees,
					   HUFF_COUNTS *huff_counts,
					   uint fields);
static int compare_tree(void* cmp_arg __attribute__((unused)),
			const uchar *s,const uchar *t);
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
			 my_off_t records);
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
			       uint max_space_length,my_off_t *space_counts,
			       my_off_t tot_space_count,
			       enum en_fieldtype field_type);
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
static int save_counts_in_queue(uchar *key,element_count count,
				HUFF_TREE *tree);
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
				    HUFF_ELEMENT *element,uint size,
				    ulonglong code);
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
			my_off_t tot_elements,my_off_t filelength);
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
				   HUFF_ELEMENT *element,
				   uint *offset);
static uint max_bit(uint value);
static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
static char *make_new_name(char *new_name,char *old_name);
static char *make_old_name(char *new_name,char *old_name);
static void init_file_buffer(File file,pbool read_buffer);
static int flush_buffer(ulong neaded_length);
static void end_file_buffer(void);
static void write_bits(ulonglong value, uint bits);
static void flush_bits(void);
static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
		      my_off_t new_length, ha_checksum crc);
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
			  my_off_t new_length, ha_checksum crc);
static int mrg_close(PACK_MRG_INFO *mrg);
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
static void mrg_reset(PACK_MRG_INFO *mrg);
#if !defined(DBUG_OFF)
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
static int fakecmp(my_off_t **count1, my_off_t **count2);
#endif
185
/*
  tree_buff_length is somewhat arbitrary. The bigger it is the better
  the chance to win in terms of compression factor. On the other hand,
  this table becomes part of the compressed file header. And its length
  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
*/
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;

/* Option flags, set from the command line options below */
static int error_on_write=0,test_only=0,verbose=0,silent=0,
	   write_loop=0,force_pack=0, isamchk_neaded=0;
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
static my_bool backup, opt_wait;
static my_bool opt_ignore_control_file, opt_require_control_file;
static char tmp_dir[FN_REFLEN]={0},*join_table;
static my_off_t intervall_length;
static ha_checksum glob_crc;            /* Checksum over all source records */
static struct st_file_buffer file_buffer; /* Bit-output buffer for data file */
static QUEUE queue;                     /* Priority queue for Huffman builds */
static HUFF_COUNTS *global_count;       /* Column stats for tree callbacks */
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
static const char *load_default_groups[]= { "ariapack",0 };
static char **default_argv;             /* Saved for free_defaults() at exit */
208
/*
  Register handler error messages for usage with my_error()

  NOTES
    This is safe to call multiple times as my_error_register()
    will ignore calls to register already registered error numbers.
*/

/* Callback for my_error_register(): return the handler message table. */
static const char **get_handler_error_messages(int e __attribute__((unused)))
{
  return handler_error_messages;
}
221
222
223 /* The main program */
224
/*
  Program entry point.

  Parses options, opens the Aria control file (unless ignored), then
  either joins all given tables into one compressed table (--join) or
  compresses each given table in place.

  Exit status: 0 on success, 2 on any failure (see exit() below);
  option errors exit earlier through my_exit().
*/
int main(int argc, char **argv)
{
  int error,ok;
  PACK_MRG_INFO merge;
  my_bool no_control_file= 0;
  MY_INIT(argv[0]);

  maria_data_root= ".";
  load_defaults_or_exit("my", load_default_groups, &argc, &argv);
  default_argv= argv;
  get_options(&argc,&argv);
  my_error_register(get_handler_error_messages, HA_ERR_FIRST,
                    HA_ERR_FIRST+ array_elements(handler_error_messages)-1);

  /* Abort only if the control file is both missing and required */
  if (!opt_ignore_control_file &&
      (no_control_file= ma_control_file_open(FALSE,
                                             (opt_require_control_file ||
                                              !silent), FALSE)) &&
      opt_require_control_file)
  {
    error= 1;
    goto end;
  }
  maria_init();
  if (no_control_file || force_pack)
  {
    /* Assume that all rows exists */
    trnman_init(MAX_INTERNAL_TRID-16);
  }

  error=ok=isamchk_neaded=0;
  if (join_table)
  {						/* Join files into one */
    if (open_maria_files(&merge,argv,(uint) argc) ||
	compress(&merge,join_table))
      error=1;
  }
  else while (argc--)
  {
    /* Compress each remaining argument as a single table */
    MARIA_HA *isam_file;
    if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
      error=1;
    else
    {
      merge.file= &isam_file;
      merge.current=0;
      merge.free_file=0;
      merge.count=1;
      if (compress(&merge,0))
	error=1;
      else
	ok=1;
    }
  }
  if (ok && isamchk_neaded && !silent)
    puts("Remember to run aria_chk -rq on compressed tables");

end:
  fflush(stdout);
  fflush(stderr);
  free_defaults(default_argv);
  my_error_unregister(HA_ERR_FIRST,
                      HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
  maria_end();
  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
  exit(error ? 2 : 0);
#ifndef _lint
  return 0;					/* No compiler warning */
#endif
}
295
/*
  Free global resources and terminate with the given exit code.
  Used for early exits (option errors, --help, --version).
*/
static void my_exit(int error)
{
  free_defaults(default_argv);
  maria_end();
  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
  exit(error);
}
303
/* Ids for long-only options (must not collide with single-char ids) */
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};

/* Command line options; handled by get_one_option() below */
static struct my_option my_long_options[] =
{
#ifdef __NETWARE__
  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
#endif
  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
   &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
   "Directory where character sets are.", (char**) &charsets_dir,
   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"datadir", 'h',
   "Path for control file (and logs if --logdir not used).",
   (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
   0, 0, 0, 0, 0, 0},
  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
  {"force", 'f',
   "Force packing of table even if it gets bigger or if tempfile exists.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  { "ignore-control-file", 0,
    "Ignore the control file",
    (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG,
    0, 0, 0, 0, 0, 0},
  {"join", 'j',
   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
   &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
   0, 0, 0},
  {"help", '?', "Display this help and exit.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  { "require-control-file", 0,
    "Abort if cannot find control file",
    (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
    0, 0, 0, 0, 0, 0},
  {"silent", 's', "Be more silent.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"test", 't', "Don't pack table, only test packing it.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"version", 'V', "Output version information and exit.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
   &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};
354
355
print_version(void)356 static void print_version(void)
357 {
358 printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
359 }
360
361
/* Print the --help text: version, license note, usage and all options. */
static void usage(void)
{
  print_version();
  puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
  puts("and you are welcome to modify and redistribute it under the GPL license\n");

  puts("Pack a Aria-table to take much less space.");
  puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
  puts("afterwards to update the keys.");
  puts("You should give the .MAI file as the filename argument.");
  puts("To unpack a packed table, run aria_chk -u on the table");

  printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
  my_print_help(my_long_options);
  print_defaults("my", load_default_groups);
  my_print_variables(my_long_options);
}
380
381
/*
  handle_options() callback: apply the side effect of one parsed option.
  Options with a variable pointer in my_long_options are stored by
  my_getopt itself; only options needing extra logic are handled here.
  Always returns 0 (success); --version/--help exit via my_exit().
*/
static my_bool
get_one_option(const struct my_option *opt,
	       const char *argument,
	       const char *filename __attribute__((unused)))
{
  uint length;

  switch(opt->id) {
#ifdef __NETWARE__
  case OPT_AUTO_CLOSE:
    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
    break;
#endif
  case 'f':
    force_pack= 1;
    /* Allow reuse of an existing temp file: drop O_EXCL */
    tmpfile_createflag= O_RDWR | O_TRUNC;
    break;
  case 's':
    write_loop= verbose= 0;
    silent= 1;
    break;
  case 't':
    test_only= 1;
    /* Avoid to reset 'verbose' if it was already set > 1. */
    if (! verbose)
      verbose= 1;
    break;
  case 'T':
    /* Ensure tmp_dir ends with a directory separator */
    length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
    if (length != dirname_length(tmp_dir))
    {
      tmp_dir[length]=FN_LIBCHAR;
      tmp_dir[length+1]=0;
    }
    break;
  case 'v':
    verbose++; /* Allow for selecting the level of verbosity. */
    silent= 0;
    break;
  case '#':
    DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
    break;
  case 'V':
    print_version();
    my_exit(0);
    break;
  case 'I':
  case '?':
    usage();
    my_exit(0);
  }
  return 0;
}
435
436 /* reads options */
437 /* Initiates DEBUG - but no debugging here ! */
438
/* reads options */
/* Initiates DEBUG - but no debugging here ! */

/*
  Parse command line options; on option errors or missing file
  arguments print usage and exit via my_exit(). On return *argc/*argv
  describe the remaining table-name arguments.
*/
static void get_options(int *argc,char ***argv)
{
  int ho_error;

  my_progname= argv[0][0];
  /* Show progress by default only when stdout is a terminal */
  if (isatty(fileno(stdout)))
    write_loop=1;

  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
    my_exit(ho_error);

  if (!*argc)
  {
    usage();
    my_exit(1);
  }
  if (join_table)
  {
    backup=0;					/* Not needed */
    tmp_dir[0]=0;
  }
  return;
}
462
463
print_error(int error,const char * filename)464 static void print_error(int error, const char *filename)
465 {
466 switch (error) {
467 case HA_ERR_CRASHED:
468 fprintf(stderr, "'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
469 break;
470 case HA_ERR_NOT_A_TABLE:
471 fprintf(stderr, "'%s' is not a Aria table",filename);
472 break;
473 case HA_ERR_CRASHED_ON_USAGE:
474 fprintf(stderr, "'%s' is marked as crashed",filename);
475 break;
476 case HA_ERR_CRASHED_ON_REPAIR:
477 fprintf(stderr, "'%s' is marked as crashed after last repair",filename);
478 break;
479 case HA_ERR_OLD_FILE:
480 fprintf(stderr, "'%s' has transactions newer than registered in control file. If this is ok, please re-run with --ignore-control-file", filename);
481 break;
482 case HA_ERR_NEW_FILE:
483 fprintf(stderr, "'%s' uses new features not supported by this version of the Aria library", filename);
484 break;
485 case HA_ERR_END_OF_FILE:
486 fprintf(stderr, "Couldn't read complete header from '%s'", filename);
487 break;
488 case EAGAIN:
489 fprintf(stderr, "'%s' is locked. Use -w to wait until unlocked",filename);
490 break;
491 case ENOENT:
492 fprintf(stderr, "File '%s' doesn't exist",filename);
493 break;
494 case EACCES:
495 fprintf(stderr, "You don't have permission to use '%s'", filename);
496 break;
497 default:
498 fprintf(stderr, "%d when opening Aria table '%s'", error, filename);
499 break;
500 }
501 fputc('\n',stderr);
502 }
503
504
/*
  Open one Aria table for packing and write-lock it.

  Refuses (returns NULL after printing an error) when:
  - the table cannot be opened;
  - it is already compressed (unless --force or --join);
  - it is too small to be worth compressing (unless --force).

  On success returns the handler, write-locked and with transaction
  ids ignored.
*/
static MARIA_HA *open_maria_file(char *name,int mode)
{
  MARIA_HA *isam_file;
  MARIA_SHARE *share;
  DBUG_ENTER("open_maria_file");

  if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
                             (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
                              HA_OPEN_ABORT_IF_LOCKED), 0)))
  {
    print_error(my_errno, name);
    DBUG_RETURN(0);
  }
  share=isam_file->s;
  if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
  {
    if (!force_pack)
    {
      fprintf(stderr, "%s is already compressed\n", name);
      maria_close(isam_file);
      DBUG_RETURN(0);
    }
    if (verbose)
      puts("Recompressing already compressed table");
    share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
  }
  if (! force_pack && share->state.state.records != 0 &&
      (share->state.state.records <= 1 ||
       share->state.state.data_file_length < 1024))
  {
    fprintf(stderr, "%s is too small to compress\n", name);
    maria_close(isam_file);
    DBUG_RETURN(0);
  }
  maria_lock_database(isam_file,F_WRLCK);
  maria_ignore_trids(isam_file);
  DBUG_RETURN(isam_file);
}
543
544
/*
  Open all source tables for a --join operation and verify that they
  have identical row layouts (same reclength, field count and per-column
  type/length).

  Returns 0 on success with mrg filled in; 1 on error, after closing
  every table opened so far and freeing the handler array.
*/
static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
{
  uint i,j;
  mrg->count=0;
  mrg->current=0;
  mrg->file=(MARIA_HA**) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(MARIA_HA*)*count,MYF(MY_FAE));
  mrg->free_file=1;
  mrg->src_file_has_indexes_disabled= 0;
  for (i=0; i < count ; i++)
  {
    if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
      goto error;

    mrg->src_file_has_indexes_disabled|=
      ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
                                 mrg->file[i]->s->base.keys);
  }
  /* Check that files are identical */
  for (j=0 ; j < count-1 ; j++)
  {
    MARIA_COLUMNDEF *m1,*m2,*end;
    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
        mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
      goto diff_file;
    m1=mrg->file[j]->s->columndef;
    end=m1+mrg->file[j]->s->base.fields;
    m2=mrg->file[j+1]->s->columndef;
    for ( ; m1 != end ; m1++,m2++)
    {
      if (m1->type != m2->type || m1->length != m2->length)
	goto diff_file;
    }
  }
  mrg->count=count;
  return 0;

 diff_file:
  fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
          my_progname, names[j], names[j+1]);
 error:
  /* 'i' is the number of successfully opened files; close them all */
  while (i--)
    maria_close(mrg->file[i]);
  my_free(mrg->file);
  return 1;
}
590
591
/*
  Compress the data file(s) of a table, or join and compress several
  tables into a new table named 'result_table' (NULL for in-place).

  Pipeline: gather per-column statistics, build and possibly merge
  Huffman trees, write the compressed header and trees, compress all
  records, then replace the original data file (honouring --backup,
  --tmpdir and --test).

  Returns 0 on success, -1 on error (a message has been printed on
  stderr).
*/
static int compress(PACK_MRG_INFO *mrg,char *result_table)
{
  int error;
  File new_file,join_maria_file;
  MARIA_HA *isam_file;
  MARIA_SHARE *share;
  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
  uint i,header_length,fields,trees,used_trees;
  my_off_t old_length,new_length,tot_elements;
  HUFF_COUNTS *huff_counts;
  HUFF_TREE *huff_trees;
  DBUG_ENTER("compress");

  isam_file=mrg->file[0];			/* Take this as an example */
  share=isam_file->s;
  new_file=join_maria_file= -1;
  trees=fields=0;
  huff_trees=0;
  huff_counts=0;
  maria_block_size= isam_file->s->block_size;

  /* Create temporary or join file */
  if (backup)
    fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
  else
    fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);

  if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
                     maria_block_size, 0, MY_WME) == 0)
  {
    fprintf(stderr, "Can't initialize page cache\n");
    goto err;
  }

  if (!test_only && result_table)
  {
    /* Make a new indexfile based on first file in list */
    uint length;
    uchar *buff;
    strmov(org_name,result_table);	/* Fix error messages */
    fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
    if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
	< 0)
      goto err;
    length=(uint) share->base.keystart;
    if (!(buff= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(MY_WME))))
      goto err;
    if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
	my_write(join_maria_file,buff,length,
		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
    {
      my_free(buff);
      goto err;
    }
    my_free(buff);
    fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
  }
  else if (!tmp_dir[0])
    make_new_name(new_name,org_name);
  else
    fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
  if (!test_only &&
      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
    goto err;

  /* Start calculating statistics */

  mrg->records=0;
  for (i=0 ; i < mrg->count ; i++)
    mrg->records+=mrg->file[i]->s->state.state.records;

  DBUG_PRINT("info", ("Compressing %s: (%lu records)",
                      result_table ? new_name : org_name,
                      (ulong) mrg->records));
  if (write_loop || verbose)
  {
    printf("Compressing %s: (%lu records)\n",
           result_table ? new_name : org_name, (ulong) mrg->records);
  }
  trees=fields=share->base.fields;
  huff_counts=init_huff_count(isam_file,mrg->records);

  /*
    Read the whole data file(s) for statistics.
  */
  DBUG_PRINT("info", ("- Calculating statistics"));
  if (write_loop || verbose)
    printf("- Calculating statistics\n");
  if (get_statistic(mrg,huff_counts))
    goto err;

  old_length=0;
  for (i=0; i < mrg->count ; i++)
    old_length+= (mrg->file[i]->s->state.state.data_file_length -
                  mrg->file[i]->s->state.state.empty);

  /*
    Create a global priority queue in preparation for making
    temporary Huffman trees.
  */
  if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
    goto err;

  /*
    Check each column if we should use pre-space-compress, end-space-
    compress, empty-field-compress or zero-field-compress.
  */
  check_counts(huff_counts,fields,mrg->records);

  /*
    Build a Huffman tree for each column.
  */
  huff_trees=make_huff_trees(huff_counts,trees);

  /*
    If the packed lengths of combined columns is less then the sum of
    the non-combined columns, then create common Huffman trees for them.
    We do this only for uchar compressed columns, not for distinct values
    compressed columns.
  */
  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
    goto err;

  /*
    Assign codes to all uchar or column values.
  */
  if (make_huff_decode_table(huff_trees,fields))
    goto err;

  /* Prepare a file buffer. */
  init_file_buffer(new_file,0);

  /*
    Reserve space in the target file for the fixed compressed file header.
  */
  file_buffer.pos_in_file=HEAD_LENGTH;
  if (! test_only)
    my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));

  /*
    Write field infos: field type, pack type, length bits, tree number.
  */
  write_field_info(huff_counts,fields,used_trees);

  /*
    Write decode trees.
  */
  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
    goto err;

  /*
    Calculate the total length of the compression info header.
    This includes the fixed compressed file header, the column compression
    type descriptions, and the decode trees.
  */
  header_length=(uint) file_buffer.pos_in_file+
    (uint) (file_buffer.pos-file_buffer.buffer);

  /*
    Compress the source file into the target file.
  */
  DBUG_PRINT("info", ("- Compressing file"));
  if (write_loop || verbose)
    printf("- Compressing file\n");
  error=compress_maria_file(mrg,huff_counts);
  new_length=file_buffer.pos_in_file;
  if (!error && !test_only)
  {
    uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
    bzero(buff,sizeof(buff));
    error=my_write(file_buffer.file,buff,sizeof(buff),
		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
  }

  /*
    Write the fixed compressed file header.
  */
  if (!error)
    error=write_header(mrg,header_length,used_trees,tot_elements,
		       new_length);

  /* Flush the file buffer. */
  end_file_buffer();

  /* Display statistics. */
  DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
                      "Mean total length: %6ld",
                      mrg->min_pack_length, mrg->max_pack_length,
                      (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
  if (verbose && mrg->records)
    printf("Min record length: %6d Max length: %6d "
           "Mean total length: %6ld\n", mrg->min_pack_length,
           mrg->max_pack_length, (ulong) (new_length/mrg->records));

  /* Close source and target file. */
  if (!test_only)
  {
    error|=my_close(new_file,MYF(MY_WME));
    if (!result_table)
    {
      (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
                                    FLUSH_RELEASE);
      error|=my_close(isam_file->dfile.file, MYF(MY_WME));
      isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
      isam_file->s->bitmap.file.file= -1;
    }
  }

  /* Cleanup. */
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
  if (! test_only && ! error)
  {
    if (result_table)
    {
      error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
    }
    else
    {
      if (backup)
      {
	/* Keep the original data file as <name>.OLD */
	if (my_rename(org_name,make_old_name(temp_name,
                                             isam_file->s->open_file_name.str),
		      MYF(MY_WME)))
	  error=1;
	else
	{
	  if (tmp_dir[0])
	    error=my_copy(new_name,org_name,MYF(MY_WME));
	  else
	    error=my_rename(new_name,org_name,MYF(MY_WME));
	  if (!error)
	  {
	    my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
	    if (tmp_dir[0])
	      my_delete(new_name,MYF(MY_WME));
	  }
	}
      }
      else
      {
	if (tmp_dir[0])
	{
	  error=my_copy(new_name,org_name,
			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
	  if (!error)
	    my_delete(new_name,MYF(MY_WME));
	}
	else
	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
      }
      if (! error)
	error=save_state(isam_file,mrg,new_length,glob_crc);
    }
  }
  error|=mrg_close(mrg);
  if (join_maria_file >= 0)
    error|=my_close(join_maria_file,MYF(MY_WME));
  if (error)
  {
    fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
    my_delete(new_name,MYF(MY_WME));
    DBUG_RETURN(-1);
  }
  if (write_loop || verbose)
  {
    if (old_length)
      printf("%.4g%% \n",
             (((longlong) (old_length - new_length)) * 100.0 /
	      (longlong) old_length));
    else
      puts("Empty file saved in compressed format");
  }
  DBUG_RETURN(0);

 err:
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
  if (new_file >= 0)
    my_close(new_file,MYF(0));
  if (join_maria_file >= 0)
    my_close(join_maria_file,MYF(0));
  mrg_close(mrg);
  end_pagecache(maria_pagecache, 1);
  fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
  DBUG_RETURN(-1);
}
877
878 /* Init a huff_count-struct for each field and init it */
879
/*
  Allocate and initialize one HUFF_COUNTS per column.

  The array is indexed by the column number from columndef[].column_nr.
  Returns the zero-filled array, or NULL on allocation failure.
*/
static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
{
  reg2 uint i;
  reg1 HUFF_COUNTS *count;
  if ((count = (HUFF_COUNTS*) my_malloc(PSI_NOT_INSTRUMENTED,
                                        info->s->base.fields*sizeof(HUFF_COUNTS), MYF(MY_ZEROFILL | MY_WME))))
  {
    for (i=0 ; i < info->s->base.fields ; i++)
    {
      enum en_fieldtype type;
      uint col_nr = info->s->columndef[i].column_nr;
      count[col_nr].field_length=info->s->columndef[i].length;
      type= count[col_nr].field_type=
	(enum en_fieldtype) info->s->columndef[i].type;
      /* These types are handled like plain fields for statistics */
      if (type == FIELD_INTERVALL ||
	  type == FIELD_CONSTANT ||
	  type == FIELD_ZERO)
	type = FIELD_NORMAL;
      if (count[col_nr].field_length <= 8 &&
	  (type == FIELD_NORMAL ||
	   type == FIELD_SKIP_ZERO))
	count[col_nr].max_zero_fill= count[col_nr].field_length;
      /*
        For every column initialize a tree, which is used to detect distinct
        column values. 'int_tree' works together with 'tree_buff' and
        'tree_pos'. Its keys are implemented by pointers into 'tree_buff'.
        This is accomplished by '-1' as the element size.
      */
      init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
		NULL, MYF(0));
      if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
	count[col_nr].tree_pos=count[col_nr].tree_buff =
	  my_malloc(PSI_NOT_INSTRUMENTED,
                    count[col_nr].field_length > 1 ? tree_buff_length : 2,
                    MYF(MY_WME));
    }
  }
  return count;
}
919
920
921 /* Free memory used by counts and trees */
922
/*
  Release all memory held by the Huffman trees, the per-column
  statistics and the global priority queue. NULL tree/count arrays are
  tolerated (e.g. when called from the error path of compress()).
*/
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
					   HUFF_COUNTS *huff_counts,
					   uint fields)
{
  if (huff_trees)
  {
    HUFF_TREE *tree, *tree_end= huff_trees + trees;
    for (tree= huff_trees ; tree < tree_end ; tree++)
    {
      if (tree->element_buffer)
	my_free(tree->element_buffer);
      if (tree->code)
	my_free(tree->code);
    }
    my_free(huff_trees);
  }
  if (huff_counts)
  {
    HUFF_COUNTS *count, *count_end= huff_counts + fields;
    for (count= huff_counts ; count < count_end ; count++)
    {
      if (count->tree_buff)
      {
	my_free(count->tree_buff);
	delete_tree(&count->int_tree, 0);
      }
    }
    my_free(huff_counts);
  }
  delete_queue(&queue);		/* This is safe to free */
}
955
956 /* Read through old file and gather some statistics */
957
/*
  Scan all records of the (merged) source table(s) and gather, per column,
  the statistics needed to choose a packing method and build Huffman trees:
  per-byte incidence counts, distinct-value collection (for interval
  packing), leading/trailing space counts, zero-fill lengths and maximum
  blob/varchar lengths.  Also accumulates glob_crc over all records.

  SYNOPSIS
    get_statistic()
    mrg          Merge info covering all source files; mrg->records and
                 mrg->max_blob_length are filled in on return.
    huff_counts  Array of one HUFF_COUNTS per column; all counters are
                 updated in place.

  RETURN
    0   Scan ended normally at HA_ERR_END_OF_FILE
    1   Scan aborted on a read error
*/

static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
{
  int error;
  uint length, null_bytes;
  ulong reclength,max_blob_length;
  uchar *record,*pos,*next_pos,*end_pos,*start_pos;
  ha_rows record_count;
  HUFF_COUNTS *count,*end_count;
  TREE_ELEMENT *element;
  ha_checksum (*calc_checksum)(MARIA_HA *, const uchar *);
  DBUG_ENTER("get_statistic");

  reclength= mrg->file[0]->s->base.reclength;
  null_bytes= mrg->file[0]->s->base.null_bytes;
  record=(uchar*) my_safe_alloca(reclength);
  end_count=huff_counts+mrg->file[0]->s->base.fields;
  record_count=0; glob_crc=0;
  max_blob_length=0;

  /* Check how to calculate checksum */
  if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
    calc_checksum= _ma_static_checksum;
  else
    calc_checksum= _ma_checksum;

  mrg_reset(mrg);
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
  {
    ulong tot_blob_length=0;
    if (! error)
    {
      /* glob_crc is a checksum over all bytes of all records. */
      glob_crc+= (*calc_checksum)(mrg->file[0],record);

      /* Count the incidence of values separately for every column. */
      for (pos=record + null_bytes, count=huff_counts ;
	   count < end_count ;
	   count++,
	   pos=next_pos)
      {
	next_pos=end_pos=(start_pos=pos)+count->field_length;

	/*
	  Put the whole column value in a tree if there is room for it.
	  'int_tree' is used to quickly check for duplicate values.
	  'tree_buff' collects as many distinct column values as
	  possible. If the field length is > 1, it is tree_buff_length,
	  else 2 bytes. Each value is 'field_length' bytes big. If there
	  are more distinct column values than fit into the buffer, we
	  give up with this tree. BLOBs and VARCHARs do not have a
	  tree_buff as it can only be used with fixed length columns.
	  For the special case of field length == 1, we handle only the
	  case that there is only one distinct value in the table(s).
	  Otherwise, we can have a maximum of 256 distinct values. This
	  is then handled by the normal Huffman tree build.

	  Another limit for collecting distinct column values is the
	  number of values itself. Since we would need to build a
	  Huffman tree for the values, we are limited by the 'IS_OFFSET'
	  constant. This constant expresses a bit which is used to
	  determine if a tree element holds a final value or an offset
	  to a child element. Hence, all values and offsets need to be
	  smaller than 'IS_OFFSET'. A tree element is implemented with
	  two integer values, one for the left branch and one for the
	  right branch. For the extreme case that the first element
	  points to the last element, the number of integers in the tree
	  must be less or equal to IS_OFFSET. So the number of elements
	  must be less or equal to IS_OFFSET / 2.

	  WARNING: At first, we insert a pointer into the record buffer
	  as the key for the tree. If we got a new distinct value, which
	  is really inserted into the tree, instead of being counted
	  only, we will copy the column value from the record buffer to
	  'tree_buff' and adjust the key pointer of the tree accordingly.
	*/
	if (count->tree_buff)
	{
	  /* compare_tree() reads the field length through this global. */
	  global_count=count;
	  if (!(element=tree_insert(&count->int_tree,pos, 0,
				    count->int_tree.custom_arg)) ||
	      (element->count == 1 &&
	       (count->tree_buff + tree_buff_length <
		count->tree_pos + count->field_length)) ||
	      (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
	      (count->field_length == 1 &&
	       count->int_tree.elements_in_tree > 1))
	  {
	    /* Too many distinct values: give up on interval packing. */
	    delete_tree(&count->int_tree, 0);
	    my_free(count->tree_buff);
	    count->tree_buff=0;
	  }
	  else
	  {
	    /*
	      If tree_insert() succeeds, it either creates a new element
	      or increments the counter of an existing element.
	    */
	    if (element->count == 1)
	    {
	      /* Copy the new column value into 'tree_buff'. */
	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
	      /* Adjust the key pointer in the tree. */
	      tree_set_pointer(element,count->tree_pos);
	      /* Point behind the last column value so far. */
	      count->tree_pos+=count->field_length;
	    }
	  }
	}

	/* Save character counters and space-counts and zero-field-counts */
	if (count->field_type == FIELD_NORMAL ||
	    count->field_type == FIELD_SKIP_ENDSPACE)
	{
	  /* Ignore trailing space. */
	  for ( ; end_pos > pos ; end_pos--)
	    if (end_pos[-1] != ' ')
	      break;
	  /* Empty fields are just counted. Go to the next record. */
	  if (end_pos == pos)
	  {
	    count->empty_fields++;
	    count->max_zero_fill=0;
	    continue;
	  }
	  /*
	    Count the total of all trailing spaces and the number of
	    short trailing spaces. Remember the longest trailing space.
	  */
	  length= (uint) (next_pos-end_pos);
	  count->tot_end_space+=length;
	  if (length < 8)
	    count->end_space[length]++;
	  if (count->max_end_space < length)
	    count->max_end_space = length;
	}

	if (count->field_type == FIELD_NORMAL ||
	    count->field_type == FIELD_SKIP_PRESPACE)
	{
	  /* Ignore leading space. */
	  for (pos=start_pos; pos < end_pos ; pos++)
	    if (pos[0] != ' ')
	      break;
	  /* Empty fields are just counted. Go to the next record. */
	  if (end_pos == pos)
	  {
	    count->empty_fields++;
	    count->max_zero_fill=0;
	    continue;
	  }
	  /*
	    Count the total of all leading spaces and the number of
	    short leading spaces. Remember the longest leading space.
	  */
	  length= (uint) (pos-start_pos);
	  count->tot_pre_space+=length;
	  if (length < 8)
	    count->pre_space[length]++;
	  if (count->max_pre_space < length)
	    count->max_pre_space = length;
	}

	/* Calculate pos, end_pos, and max_length for variable length fields. */
	if (count->field_type == FIELD_BLOB)
	{
	  uint field_length=count->field_length -portable_sizeof_char_ptr;
	  ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
	  /* The blob data itself lives behind the pointer in the record. */
	  memcpy(&pos, start_pos+field_length,sizeof(char*));
	  end_pos=pos+blob_length;
	  tot_blob_length+=blob_length;
	  set_if_bigger(count->max_length,blob_length);
	}
	else if (count->field_type == FIELD_VARCHAR)
	{
	  uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
		   uint2korr(start_pos));
	  pos= start_pos+pack_length;
	  end_pos= pos+length;
	  set_if_bigger(count->max_length,length);
	}

	/* Evaluate 'max_zero_fill' for short fields. */
	if (count->field_length <= 8 &&
	    (count->field_type == FIELD_NORMAL ||
	     count->field_type == FIELD_SKIP_ZERO))
	{
	  uint i;
	  /* Zero fields are just counted. Go to the next record. */
	  if (!memcmp(start_pos, zero_string, count->field_length))
	  {
	    count->zero_fields++;
	    continue;
	  }
	  /*
	    max_zero_fill starts with field_length. It is decreased every
	    time a shorter "zero trailer" is found. It is set to zero when
	    an empty field is found (see above). This suggests that the
	    variable should be called 'min_zero_fill'.
	  */
	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
	       i++) ;
	  if (i < count->max_zero_fill)
	    count->max_zero_fill=i;
	}

	/* Ignore zero fields and check fields. */
	if (count->field_type == FIELD_ZERO ||
	    count->field_type == FIELD_CHECK)
	  continue;

	/*
	  Count the incidence of every uchar value in the
	  significant field value.
	*/
	for ( ; pos < end_pos ; pos++)
	  count->counts[(uchar) *pos]++;

	/* Step to next field. */
      }

      if (tot_blob_length > max_blob_length)
	max_blob_length=tot_blob_length;
      record_count++;
      /* Progress indicator, printed every WRITE_COUNT records. */
      if (write_loop && record_count % WRITE_COUNT == 0)
      {
	printf("%lu\r", (ulong) record_count);
	fflush(stdout);
      }
    }
    else if (error != HA_ERR_RECORD_DELETED)
    {
      fprintf(stderr, "Got error %d while reading rows\n", error);
      break;
    }

    /* Step to next record. */
  }
  if (write_loop)
  {
    printf("          \r");
    fflush(stdout);
  }

  /*
    If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
    codes.
  */
  DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););

  DBUG_PRINT("info", ("Found the following number of incidents "
                      "of the uchar codes:"));
  if (verbose >= 2)
    printf("Found the following number of incidents "
           "of the uchar codes:\n");
  for (count= huff_counts ; count < end_count; count++)
  {
    uint idx;
    my_off_t total_count;
    char llbuf[32];

    DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
    if (verbose >= 2)
      printf("column: %3u\n", (uint) (count - huff_counts + 1));
    if (count->tree_buff)
    {
      DBUG_PRINT("info", ("number of distinct values: %u",
                          (uint) ((count->tree_pos - count->tree_buff) /
                                  count->field_length)));
      if (verbose >= 2)
        printf("number of distinct values: %u\n",
               (uint) ((count->tree_pos - count->tree_buff) /
                       count->field_length));
    }
    total_count= 0;
    for (idx= 0; idx < 256; idx++)
    {
      if (count->counts[idx])
      {
        total_count+= count->counts[idx];
        DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
                            llstr((longlong) count->counts[idx], llbuf)));
        if (verbose >= 2)
          printf("counts[0x%02x]: %12s\n", idx,
                 llstr((longlong) count->counts[idx], llbuf));
      }
    }
    DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
                                                    llbuf)));
    if ((verbose >= 2) && total_count)
    {
      printf("total:        %12s\n",
             llstr((longlong) total_count, llbuf));
    }
  }

  mrg->records=record_count;
  mrg->max_blob_length=max_blob_length;
  my_safe_afree(record, reclength);
  DBUG_RETURN(error != HA_ERR_END_OF_FILE);
}
1259
/*
  Three-way comparison of two incidence counts (my_off_t), used as the
  ordering function of the priority queue.  Returns -1, 0 or 1.
*/
static int compare_huff_elements(void *not_used __attribute__((unused)),
				 uchar *a, uchar *b)
{
  my_off_t left= *((my_off_t*) a);
  my_off_t right= *((my_off_t*) b);

  if (left < right)
    return -1;
  return (left == right) ? 0 : 1;
}
1266
1267 /* Check each tree if we should use pre-space-compress, end-space-
1268 compress, empty-field-compress or zero-field-compress */
1269
/*
  For every column, decide the packing strategy (field_type and pack_type)
  that yields the smallest compressed size, based on the statistics that
  get_statistic() collected.  May mutate the counts arrays to reflect the
  chosen strategy and frees tree_buff when interval packing does not pay.
*/
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
			 my_off_t records)
{
  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
  my_off_t old_length,new_length,length;
  DBUG_ENTER("check_counts");

  bzero((uchar*) field_count,sizeof(field_count));
  space_fields=fill_zero_fields=0;

  for (; trees-- ; huff_counts++)
  {
    /* Variable length and check fields keep their type as-is. */
    if (huff_counts->field_type == FIELD_BLOB)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_VARCHAR)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_CHECK)
    {
      huff_counts->bytes_packed=0;
      huff_counts->counts[0]=0;
      goto found_pack;
    }

    huff_counts->field_type=FIELD_NORMAL;
    huff_counts->pack_type=0;

    /* Check for zero-filled records (in this column), or zero records. */
    if (huff_counts->zero_fields || ! records)
    {
      my_off_t old_space_count;
      /*
	If there are only zero filled records (in this column),
	or no records at all, we are done.
      */
      if (huff_counts->zero_fields == records)
      {
	huff_counts->field_type= FIELD_ZERO;
	huff_counts->bytes_packed=0;
	huff_counts->counts[0]=0;
	goto found_pack;
      }
      /* Remember the number of significant spaces. */
      old_space_count=huff_counts->counts[' '];
      /* Add all leading and trailing spaces. */
      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
                                  huff_counts->tot_pre_space +
                                  huff_counts->empty_fields *
                                  huff_counts->field_length);
      /* Check, what the compressed length of this would be. */
      old_length=calc_packed_length(huff_counts,0)+records/8;
      /* Get the number of zero bytes. */
      length=huff_counts->zero_fields*huff_counts->field_length;
      /* Add it to the counts. */
      huff_counts->counts[0]+=length;
      /* Check, what the compressed length of this would be. */
      new_length=calc_packed_length(huff_counts,0);
      /* If the compression without the zeroes would be shorter, we are done. */
      if (old_length < new_length && huff_counts->field_length > 1)
      {
	huff_counts->field_type=FIELD_SKIP_ZERO;
	huff_counts->counts[0]-=length;
	huff_counts->bytes_packed=old_length- records/8;
	goto found_pack;
      }
      /* Remove the insignificant spaces, but keep the zeroes. */
      huff_counts->counts[' ']=old_space_count;
    }
    /* Check, what the compressed length of this column would be. */
    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);

    /*
      If there are enough empty records (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->empty_fields)
    {
      if (huff_counts->field_length > 2 &&
	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
                            huff_counts->max_end_space))) <
	  records * max_bit(huff_counts->field_length))
      {
	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
      }
      else
      {
	/* Treat empty fields as fields full of spaces instead. */
	length=huff_counts->empty_fields*huff_counts->field_length;
	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
	{
	  huff_counts->tot_end_space+=length;
	  huff_counts->max_end_space=huff_counts->field_length;
	  if (huff_counts->field_length < 8)
	    huff_counts->end_space[huff_counts->field_length]+=
	      huff_counts->empty_fields;
	}
	if (huff_counts->tot_pre_space)
	{
	  huff_counts->tot_pre_space+=length;
	  huff_counts->max_pre_space=huff_counts->field_length;
	  if (huff_counts->field_length < 8)
	    huff_counts->pre_space[huff_counts->field_length]+=
	      huff_counts->empty_fields;
	}
      }
    }

    /*
      If there are enough trailing spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_end_space)
    {
      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
			      huff_counts->end_space,
			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
	goto found_pack;
      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
    }

    /*
      If there are enough leading spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_pre_space)
    {
      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
			      huff_counts->pre_space,
			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
	goto found_pack;
    }

  found_pack:			/* Found field-packing */

    /* Test if we can use zero-fill */

    if (huff_counts->max_zero_fill &&
	(huff_counts->field_type == FIELD_NORMAL ||
	 huff_counts->field_type == FIELD_SKIP_ZERO))
    {
      /* Remove the zero-fill bytes from the counts; they are stripped. */
      huff_counts->counts[0]-=huff_counts->max_zero_fill*
	(huff_counts->field_type == FIELD_SKIP_ZERO ?
	 records - huff_counts->zero_fields : records);
      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
    }

    /* Test if intervall-field is better */

    if (huff_counts->tree_buff)
    {
      HUFF_TREE tree;

      DBUG_EXECUTE_IF("forceintervall",
                      huff_counts->bytes_packed= ~ (my_off_t) 0;);
      tree.element_buffer=0;
      if (!make_huff_tree(&tree,huff_counts) &&
	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
      {
	if (tree.elements == 1)
	  huff_counts->field_type=FIELD_CONSTANT;
	else
	  huff_counts->field_type=FIELD_INTERVALL;
	huff_counts->pack_type=0;
      }
      else
      {
	/* Interval packing does not pay; fall back to byte-level Huffman. */
	my_free(huff_counts->tree_buff);
	delete_tree(&huff_counts->int_tree, 0);
	huff_counts->tree_buff=0;
      }
      if (tree.element_buffer)
	my_free(tree.element_buffer);
    }
    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
      space_fields++;
    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
      fill_zero_fields++;
    field_count[huff_counts->field_type]++;
  }
  DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
                      "empty-zero:       %3d  empty-fill: %3d",
                      field_count[FIELD_NORMAL],space_fields,
                      field_count[FIELD_SKIP_ZERO],fill_zero_fields));
  DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
                      "intervall-fields: %3d  zero:       %3d",
                      field_count[FIELD_SKIP_PRESPACE],
                      field_count[FIELD_SKIP_ENDSPACE],
                      field_count[FIELD_INTERVALL],
                      field_count[FIELD_ZERO]));
  if (verbose)
    printf("\nnormal:    %3d  empty-space:     %3d  "
           "empty-zero:       %3d  empty-fill: %3d\n"
           "pre-space: %3d  end-space:       %3d  "
           "intervall-fields: %3d  zero:       %3d\n",
           field_count[FIELD_NORMAL],space_fields,
           field_count[FIELD_SKIP_ZERO],fill_zero_fields,
           field_count[FIELD_SKIP_PRESPACE],
           field_count[FIELD_SKIP_ENDSPACE],
           field_count[FIELD_INTERVALL],
           field_count[FIELD_ZERO]);
  DBUG_VOID_RETURN;
}
1479
1480
1481 /* Test if we can use space-compression and empty-field-compression */
1482
static int
test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
		    uint max_space_length, my_off_t *space_counts,
		    my_off_t tot_space_count, enum en_fieldtype field_type)
{
  int min_pos;
  uint length_bits,i;
  my_off_t space_count,min_space_count,min_pack,new_length,skip;

  length_bits=max_bit(max_space_length);

  /*
    Baseline: no space-packing at all.  Temporarily fold the stripped
    spaces back into the ' ' count to measure the unpacked cost.
  */
  space_count=huff_counts->counts[(uint) ' '];
  min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
  min_pack=calc_packed_length(huff_counts,0);
  min_pos= -2;
  huff_counts->counts[(uint) ' ']=space_count;

  /* Test with always space-count */
  new_length=huff_counts->bytes_packed+length_bits*records/8;
  if (new_length+1 < min_pack)
  {
    min_pos= -1;
    min_pack=new_length;
    min_space_count=space_count;
  }
  /* Test with length-flag: short runs (< i) are stored as plain spaces. */
  for (skip=0L, i=0 ; i < 8 ; i++)
  {
    if (space_counts[i])
    {
      if (i)
	huff_counts->counts[(uint) ' ']+=space_counts[i];
      /*
	NOTE(review): 'skip' always accumulates pre_space[i] even when this
	function is called for end-space (space_counts == end_space).  This
	matches the historical upstream code — confirm before changing.
      */
      skip+=huff_counts->pre_space[i];
      new_length=calc_packed_length(huff_counts,0)+
	(records+(records-skip)*(1+length_bits))/8;
      if (new_length < min_pack)
      {
	min_pos=(int) i;
	min_pack=new_length;
	min_space_count=huff_counts->counts[(uint) ' '];
      }
    }
  }

  /* Commit the winning variant. */
  huff_counts->counts[(uint) ' ']=min_space_count;
  huff_counts->bytes_packed=min_pack;
  switch (min_pos) {
  case -2:
    return(0);				/* No space-compress */
  case -1:				/* Always space-count */
    huff_counts->field_type=field_type;
    huff_counts->min_space=0;
    huff_counts->length_bits=max_bit(max_space_length);
    break;
  default:
    huff_counts->field_type=field_type;
    huff_counts->min_space=(uint) min_pos;
    huff_counts->pack_type|=PACK_TYPE_SELECTED;
    huff_counts->length_bits=max_bit(max_space_length);
    break;
  }
  return(1);				/* Using space-compress */
}
1547
1548
1549 /* Make a huff_tree of each huff_count */
1550
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1551 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1552 {
1553 uint tree;
1554 HUFF_TREE *huff_tree;
1555 DBUG_ENTER("make_huff_trees");
1556
1557 if (!(huff_tree=(HUFF_TREE*) my_malloc(PSI_NOT_INSTRUMENTED,
1558 trees*sizeof(HUFF_TREE), MYF(MY_WME | MY_ZEROFILL))))
1559 DBUG_RETURN(0);
1560
1561 for (tree=0 ; tree < trees ; tree++)
1562 {
1563 if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1564 {
1565 while (tree--)
1566 my_free(huff_tree[tree].element_buffer);
1567 my_free(huff_tree);
1568 DBUG_RETURN(0);
1569 }
1570 }
1571 DBUG_RETURN(huff_tree);
1572 }
1573
1574 /*
1575 Build a Huffman tree.
1576
1577 SYNOPSIS
1578 make_huff_tree()
1579 huff_tree The Huffman tree.
1580 huff_counts The counts.
1581
1582 DESCRIPTION
1583 Build a Huffman tree according to huff_counts->counts or
1584 huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1585 tree_buff_length of distinct column values. In that case, whole
1586 values can be Huffman encoded instead of single bytes.
1587
1588 RETURN
1589 0 OK
1590 != 0 Error
1591 */
1592
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
{
  uint i,found,bits_packed,first,last;
  my_off_t bytes_packed;
  HUFF_ELEMENT *a,*b,*new_huff_el;

  first=last=0;
  if (huff_counts->tree_buff)
  {
    /* Calculate the number of distinct values in tree_buff. */
    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
      huff_counts->field_length;
    first=0; last=found-1;
  }
  else
  {
    /* Count the number of uchar codes found in the column. */
    for (i=found=0 ; i < 256 ; i++)
    {
      if (huff_counts->counts[i])
      {
	if (! found++)
	  first=i;
	last=i;
      }
    }
    /* The Huffman loop below needs at least two elements to combine. */
    if (found < 2)
      found=2;
  }

  /* When using 'tree_buff' we can have more than 256 values. */
  if (queue.max_elements < found)
  {
    delete_queue(&queue);
    if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
      return -1;
  }

  /* Allocate or reallocate an element buffer for the Huffman tree. */
  if (!huff_tree->element_buffer)
  {
    if (!(huff_tree->element_buffer=
	  (HUFF_ELEMENT*) my_malloc(PSI_NOT_INSTRUMENTED,
                                    found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
      return 1;
  }
  else
  {
    HUFF_ELEMENT *temp;
    if (!(temp= (HUFF_ELEMENT*) my_realloc(PSI_NOT_INSTRUMENTED,
                                           (uchar*) huff_tree->element_buffer,
                                           found*2*sizeof(HUFF_ELEMENT),
                                           MYF(MY_WME))))
      return 1;
    huff_tree->element_buffer=temp;
  }

  huff_counts->tree=huff_tree;
  huff_tree->counts=huff_counts;
  huff_tree->min_chr=first;
  huff_tree->max_chr=last;
  huff_tree->char_bits=max_bit(last-first);
  huff_tree->offset_bits=max_bit(found-1)+1;

  if (huff_counts->tree_buff)
  {
    huff_tree->elements=0;
    /*
      Length of the on-disk description of this tree plus the stored
      distinct column values themselves.
    */
    huff_tree->tree_pack_length=(1+15+16+5+5+
				 (huff_tree->char_bits+1)*found+
				 (huff_tree->offset_bits+1)*
				 (found-2)+7)/8 +
				   (uint) (huff_tree->counts->tree_pos-
					   huff_tree->counts->tree_buff);
    /*
      Put a HUFF_ELEMENT into the queue for every distinct column value.

      tree_walk() calls save_counts_in_queue() for every element in
      'int_tree'. This takes elements from the target trees element
      buffer and places references to them into the buffer of the
      priority queue. We insert in column value order, but the order is
      in fact irrelevant here. We will establish the correct order
      later.
    */
    tree_walk(&huff_counts->int_tree,
	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
	      (uchar*) huff_tree, left_root_right);
  }
  else
  {
    huff_tree->elements=found;
    huff_tree->tree_pack_length=(9+9+5+5+
				 (huff_tree->char_bits+1)*found+
				 (huff_tree->offset_bits+1)*
				 (found-2)+7)/8;
    /*
      Put a HUFF_ELEMENT into the queue for every uchar code found in the column.

      The elements are taken from the target trees element buffer.
      Instead of using queue_insert(), we just place references to the
      elements into the buffer of the priority queue. We insert in byte
      value order, but the order is in fact irrelevant here. We will
      establish the correct order later.
    */
    for (i=first, found=0 ; i <= last ; i++)
    {
      if (huff_counts->counts[i])
      {
	new_huff_el=huff_tree->element_buffer+(found++);
	new_huff_el->count=huff_counts->counts[i];
	new_huff_el->a.leaf.null=0;
	new_huff_el->a.leaf.element_nr=i;
	/* queue.root is 1-based: root[1] is the top element. */
	queue.root[found]=(uchar*) new_huff_el;
      }
    }
    /*
      If there is only a single uchar value in this field in all records,
      add a second element with zero incidence. This is required to enter
      the loop, which builds the Huffman tree.
    */
    while (found < 2)
    {
      new_huff_el=huff_tree->element_buffer+(found++);
      new_huff_el->count=0;
      new_huff_el->a.leaf.null=0;
      if (last)
	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
      else
	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
      queue.root[found]=(uchar*) new_huff_el;
    }
  }

  /* Make a queue from the queue buffer. */
  queue.elements=found;

  /*
    Make a priority queue from the queue. Construct its index so that we
    have a partially ordered tree.
  */
  queue_fix(&queue);

  /* The Huffman algorithm. */
  bytes_packed=0; bits_packed=0;
  for (i=1 ; i < found ; i++)
  {
    /*
      Pop the top element from the queue (the one with the least incidence).
      Popping from a priority queue includes a re-ordering of the queue,
      to get the next least incidence element to the top.
    */
    a=(HUFF_ELEMENT*) queue_remove_top(&queue);
    /* Copy the next least incidence element */
    b=(HUFF_ELEMENT*) queue_top(&queue);
    /* Get a new element from the element buffer. */
    new_huff_el=huff_tree->element_buffer+found+i;
    /* The new element gets the sum of the two least incidence elements. */
    new_huff_el->count=a->count+b->count;
    /*
      The Huffman algorithm assigns another bit to the code for a byte
      every time that bytes incidence is combined (directly or indirectly)
      to a new element as one of the two least incidence elements.
      This means that one more bit per incidence of that uchar is required
      in the resulting file. So we add the new combined incidence as the
      number of bits by which the result grows.
    */
    bits_packed+=(uint) (new_huff_el->count & 7);
    bytes_packed+=new_huff_el->count/8;
    /* The new element points to its children, lesser in left. */
    new_huff_el->a.nod.left=a;
    new_huff_el->a.nod.right=b;
    /*
      Replace the copied top element by the new element and re-order the
      queue.
    */
    queue_top(&queue)= (uchar*) new_huff_el;
    queue_replace_top(&queue);
  }
  /* The last remaining element is the root of the Huffman tree. */
  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
  return 0;
}
1772
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1773 static int compare_tree(void* cmp_arg __attribute__((unused)),
1774 register const uchar *s, register const uchar *t)
1775 {
1776 uint length;
1777 for (length=global_count->field_length; length-- ;)
1778 if (*s++ != *t++)
1779 return (int) s[-1] - (int) t[-1];
1780 return 0;
1781 }
1782
1783 /*
1784 Organize distinct column values and their incidences into a priority queue.
1785
1786 SYNOPSIS
1787 save_counts_in_queue()
1788 key The column value.
1789 count The incidence of this value.
1790 tree The Huffman tree to be built later.
1791
1792 DESCRIPTION
1793 We use the element buffer of the targeted tree. The distinct column
1794 values are organized in a priority queue first. The Huffman
1795 algorithm will later organize the elements into a Huffman tree. For
1796 the time being, we just place references to the elements into the
1797 queue buffer. The buffer will later be organized into a priority
1798 queue.
1799
1800 RETURN
1801 0
1802 */
1803
static int save_counts_in_queue(uchar *key, element_count count,
				HUFF_TREE *tree)
{
  HUFF_ELEMENT *elem;
  uint value_nr;

  /* Index of this distinct value within the tree_buff value array. */
  value_nr= (uint) (key - tree->counts->tree_buff) /
    tree->counts->field_length;

  /* Take the next free slot from the tree's element buffer. */
  elem= tree->element_buffer + tree->elements;
  tree->elements++;
  elem->count= count;
  elem->a.leaf.null= 0;
  elem->a.leaf.element_nr= value_nr;
  /* queue.root is 1-based, so the new element index equals tree->elements. */
  queue.root[tree->elements]= (uchar*) elem;
  return 0;
}
1817
1818
1819 /*
1820 Calculate length of file if given counts should be used.
1821
1822 SYNOPSIS
1823 calc_packed_length()
1824 huff_counts The counts for a column of the table(s).
1825 add_tree_lenght If the decode tree length should be added.
1826
1827 DESCRIPTION
1828 We need to follow the Huffman algorithm until we know, how many bits
1829 are required for each uchar code. But we do not need the resulting
1830 Huffman tree. Hence, we can leave out some steps which are essential
1831 in make_huff_tree().
1832
1833 RETURN
1834 Number of bytes required to compress this table column.
1835 */
1836
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
				   uint add_tree_lenght)
{
  uint i,found,bits_packed,first,last;
  my_off_t bytes_packed;
  HUFF_ELEMENT element_buffer[256];
  DBUG_ENTER("calc_packed_length");

  /*
    WARNING: We use a small hack for efficiency: Instead of placing
    references to HUFF_ELEMENTs into the queue, we just insert
    references to the counts of the uchar codes which appeared in this
    table column. During the Huffman algorithm they are successively
    replaced by references to HUFF_ELEMENTs. This works, because
    HUFF_ELEMENTs have the incidence count at their beginning.
    Regardless of whether the queue array contains references to counts
    of type my_off_t or references to HUFF_ELEMENTs which have the count
    of type my_off_t at their beginning, it always points to a count of
    the same type.

    Instead of using queue_insert(), we just copy the references into
    the buffer of the priority queue. We insert in uchar value order, but
    the order is in fact irrelevant here. We will establish the correct
    order later.
  */
  first=last=0;
  for (i=found=0 ; i < 256 ; i++)
  {
    if (huff_counts->counts[i])
    {
      if (! found++)
	first=i;
      last=i;
      /* We start with root[1], which is the queues top element. */
      queue.root[found]=(uchar*) &huff_counts->counts[i];
    }
  }
  if (!found)
    DBUG_RETURN(0);			/* Empty tree */
  /*
    If there is only a single uchar value in this field in all records,
    add a second element with zero incidence. This is required to enter
    the loop, which follows the Huffman algorithm.
  */
  if (found < 2)
    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];

  /* Make a queue from the queue buffer. */
  queue.elements=found;

  bytes_packed=0; bits_packed=0;
  /* Add the length of the coding table, which would become part of the file. */
  if (add_tree_lenght)
    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
		  (max_bit(found-1)+1+1)*(found-2) +7)/8;

  /*
    Make a priority queue from the queue. Construct its index so that we
    have a partially ordered tree.
  */
  queue_fix(&queue);

  /* The Huffman algorithm. */
  for (i=0 ; i < found-1 ; i++)
  {
    my_off_t *a;
    my_off_t *b;
    HUFF_ELEMENT *new_huff_el;

    /*
      Pop the top element from the queue (the one with the least
      incidence). Popping from a priority queue includes a re-ordering
      of the queue, to get the next least incidence element to the top.
    */
    a= (my_off_t*) queue_remove_top(&queue);
    /* Copy the next least incidence element. */
    b= (my_off_t*) queue_top(&queue);
    /* Create a new element in a local (automatic) buffer. */
    new_huff_el= element_buffer + i;
    /* The new element gets the sum of the two least incidence elements. */
    new_huff_el->count= *a + *b;
    /*
      The Huffman algorithm assigns another bit to the code for a byte
      every time that bytes incidence is combined (directly or indirectly)
      to a new element as one of the two least incidence elements.
      This means that one more bit per incidence of that uchar is required
      in the resulting file. So we add the new combined incidence as the
      number of bits by which the result grows.
    */
    bits_packed+=(uint) (new_huff_el->count & 7);
    bytes_packed+=new_huff_el->count/8;
    /*
      Replace the copied top element by the new element and re-order the
      queue. This successively replaces the references to counts by
      references to HUFF_ELEMENTs.
    */
    queue_top(&queue)= (uchar*) new_huff_el;
    queue_replace_top(&queue);
  }
  DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
}
1938
1939
1940 /* Remove trees that don't give any compression */
1941
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1942 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1943 {
1944 uint k,tree_number;
1945 HUFF_COUNTS count,*i,*j,*last_count;
1946
1947 last_count=huff_counts+trees;
1948 for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1949 {
1950 if (!i->tree->tree_number)
1951 {
1952 i->tree->tree_number= ++tree_number;
1953 if (i->tree_buff)
1954 continue; /* Don't join intervall */
1955 for (j=i+1 ; j < last_count ; j++)
1956 {
1957 if (! j->tree->tree_number && ! j->tree_buff)
1958 {
1959 for (k=0 ; k < 256 ; k++)
1960 count.counts[k]=i->counts[k]+j->counts[k];
1961 if (calc_packed_length(&count,1) <=
1962 i->tree->bytes_packed + j->tree->bytes_packed+
1963 i->tree->tree_pack_length+j->tree->tree_pack_length+
1964 ALLOWED_JOIN_DIFF)
1965 {
1966 memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1967 my_free(j->tree->element_buffer);
1968 j->tree->element_buffer=0;
1969 j->tree=i->tree;
1970 bmove((uchar*) i->counts,(uchar*) count.counts,
1971 sizeof(count.counts[0])*256);
1972 if (make_huff_tree(i->tree,i))
1973 return (uint) -1;
1974 }
1975 }
1976 }
1977 }
1978 }
1979 DBUG_PRINT("info", ("Original trees: %d After join: %d",
1980 trees, tree_number));
1981 if (verbose)
1982 printf("Original trees: %d After join: %d\n", trees, tree_number);
1983 return tree_number; /* Return trees left */
1984 }
1985
1986
1987 /*
1988 Fill in huff_tree encode tables.
1989
1990 SYNOPSIS
1991 make_huff_decode_table()
1992 huff_tree An array of HUFF_TREE which are to be encoded.
1993 trees The number of HUFF_TREE in the array.
1994
1995 RETURN
1996 0 success
1997 != 0 error
1998 */
1999
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
{
  /*
    Allocate and fill the encode tables (code word + code length per
    element) for every tree that survived joining.

    RETURN  0 success, 1 out of memory.
  */
  HUFF_TREE *tree;
  HUFF_TREE *end= huff_tree + trees;

  for (tree= huff_tree ; tree < end ; tree++)
  {
    uint nr_codes;
    ulonglong *codes;

    if (tree->tree_number == 0)
      continue;                         /* Joined into another tree */
    /* Distinct-value trees need one slot per value, others one per byte. */
    nr_codes= tree->counts->tree_buff ? tree->elements : 256;
    codes= (ulonglong*) my_malloc(PSI_NOT_INSTRUMENTED,
                                  nr_codes * (sizeof(ulonglong) +
                                              sizeof(uchar)),
                                  MYF(MY_WME | MY_ZEROFILL));
    if (!codes)
      return 1;
    tree->code= codes;
    /* The code-length bytes live directly behind the code words. */
    tree->code_len= (uchar*) (codes + nr_codes);
    make_traverse_code_tree(tree, tree->root, 8 * sizeof(ulonglong), 0);
  }
  return 0;
}
2020
2021
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
                                    HUFF_ELEMENT *element,
                                    uint size, ulonglong code)
{
  /*
    Recursively walk the Huffman tree and fill in huff_tree->code /
    code_len for every leaf. 'size' is the number of bits of 'code'
    still unused; the finished code for a leaf is right-aligned by
    shifting 'code' down by 'size'.
  */
  if (element->a.leaf.null)
  {
    /* Inner node: descend, appending a 0 bit left and a 1 bit right. */
    size--;
    make_traverse_code_tree(huff_tree, element->a.nod.left, size, code);
    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
                            code + (((ulonglong) 1) << size));
  }
  else
  {
    uint value= element->a.leaf.element_nr;
    uint code_bits= (uint) (8 * sizeof(ulonglong) - size);

    huff_tree->code_len[value]= (uchar) code_bits;
    huff_tree->code[value]= (code >> size);
    /* Track the longest code: it bounds the bits per packed value. */
    if (huff_tree->height < code_bits)
      huff_tree->height= code_bits;
  }
  return;
}
2044
2045
2046 /*
2047 Convert a value into binary digits.
2048
2049 SYNOPSIS
2050 bindigits()
2051 value The value.
2052 length The number of low order bits to convert.
2053
2054 NOTE
2055 The result string is in static storage. It is reused on every call.
2056 So you cannot use it twice in one expression.
2057
2058 RETURN
2059 A pointer to a static NUL-terminated string.
2060 */
2061
static char *bindigits(ulonglong value, uint bits)
{
  /*
    Render the 'bits' low-order bits of 'value' as a NUL-terminated
    string of '0'/'1' characters, most significant bit first. The
    result lives in static storage and is overwritten by the next call,
    so it cannot be used twice in one expression.
  */
  static char digits[72];
  char *out= digits;

  DBUG_ASSERT(bits < sizeof(digits));
  for ( ; bits ; bits--)
    *out++= (char) ('0' + ((value >> (bits - 1)) & 1));
  *out= '\0';
  return digits;
}
2074
2075
2076 /*
2077 Convert a value into hexadecimal digits.
2078
2079 SYNOPSIS
2080 hexdigits()
2081 value The value.
2082
2083 NOTE
2084 The result string is in static storage. It is reused on every call.
2085 So you cannot use it twice in one expression.
2086
2087 RETURN
2088 A pointer to a static NUL-terminated string.
2089 */
2090
static char *hexdigits(ulonglong value)
{
  /*
    Render 'value' as 16 lowercase hexadecimal digits, leading zeros
    included. The result lives in static storage and is overwritten by
    the next call, so it cannot be used twice in one expression.
  */
  static char digits[20];
  char *out= digits;
  uint nibbles= 2 * sizeof(value);      /* Two hex digits per byte. */

  DBUG_ASSERT(nibbles < sizeof(digits));
  while (nibbles)
  {
    uint nibble= (uint) (value >> (4 * (--nibbles))) & 0xf;
    *out++= "0123456789abcdef"[nibble];
  }
  *out= '\0';
  return digits;
}
2106
2107
2108 /* Write header to new packed data file */
2109
/*
  Write the fixed-size header of the packed data file.

  Header layout (HEAD_LENGTH = 32 bytes, remainder zero-filled):
    0..3   magic (maria_pack_file_magic)
    4..7   total header length
    8..11  minimum packed record length
    12..15 maximum packed record length
    16..19 total number of Huffman elements (truncated to 32 bit)
    20..23 total length of all interval (distinct-value) buffers
    24..25 number of Huffman trees
    26     length of the packed-record length prefix
    27     record pointer length for 'filelength'

  RETURN  0 ok, != 0 write error (suppressed when test_only is set).
*/
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
                        my_off_t tot_elements,my_off_t filelength)
{
  uchar *buff= (uchar*) file_buffer.pos;

  bzero(buff,HEAD_LENGTH);
  memcpy(buff,maria_pack_file_magic,4);
  int4store(buff+4,head_length);
  int4store(buff+8, mrg->min_pack_length);
  int4store(buff+12,mrg->max_pack_length);
  int4store(buff+16,tot_elements);
  int4store(buff+20,intervall_length);
  int2store(buff+24,trees);
  buff[26]=(char) mrg->ref_length;
  /* Save record pointer length */
  buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
  if (test_only)
    return 0;
  /* The header always occupies the first HEAD_LENGTH bytes of the file. */
  my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
                  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
}
2132
2133 /* Write fieldinfo to new packed file */
2134
/*
  Write per-column pack information to the new packed file, using the
  global bit-writing buffer: field type (5 bits), pack type (6 bits),
  zero-fill or length bits (5 bits) and the 0-based tree number (just
  enough bits to address all remaining trees).
*/
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
{
  reg1 uint i;
  uint huff_tree_bits;
  huff_tree_bits=max_bit(trees ? trees-1 : 0);

  DBUG_PRINT("info", (" "));
  DBUG_PRINT("info", ("column types:"));
  DBUG_PRINT("info", ("FIELD_NORMAL          0"));
  DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
  DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
  DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
  DBUG_PRINT("info", ("FIELD_BLOB            4"));
  DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
  DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
  DBUG_PRINT("info", ("FIELD_ZERO            7"));
  DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
  DBUG_PRINT("info", ("FIELD_CHECK           9"));
  DBUG_PRINT("info", (" "));
  DBUG_PRINT("info", ("pack type as a set of flags:"));
  DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
  DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
  DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
  {
    printf("\n");
    printf("column types:\n");
    printf("FIELD_NORMAL          0\n");
    printf("FIELD_SKIP_ENDSPACE   1\n");
    printf("FIELD_SKIP_PRESPACE   2\n");
    printf("FIELD_SKIP_ZERO       3\n");
    printf("FIELD_BLOB            4\n");
    printf("FIELD_CONSTANT        5\n");
    printf("FIELD_INTERVALL       6\n");
    printf("FIELD_ZERO            7\n");
    printf("FIELD_VARCHAR         8\n");
    printf("FIELD_CHECK           9\n");
    printf("\n");
    printf("pack type as a set of flags:\n");
    printf("PACK_TYPE_SELECTED      1\n");
    printf("PACK_TYPE_SPACE_FIELDS  2\n");
    printf("PACK_TYPE_ZERO_FILL     4\n");
    printf("\n");
  }
  /* Note: 'i' is incremented before the body, so columns print 1-based. */
  for (i=0 ; i++ < fields ; counts++)
  {
    write_bits((ulonglong) (int) counts->field_type, 5);
    write_bits(counts->pack_type,6);
    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
      write_bits(counts->max_zero_fill,5);
    else
      write_bits(counts->length_bits,5);
    write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
    DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
                        "lbits: %2u  tree: %2u  length: %4u",
                        i , counts->field_type, counts->pack_type,
                        counts->max_zero_fill, counts->length_bits,
                        counts->tree->tree_number, counts->field_length));
    if (verbose >= 2)
      printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
             "tree: %2u  length: %4u\n", i , counts->field_type,
             counts->pack_type, counts->max_zero_fill, counts->length_bits,
             counts->tree->tree_number, counts->field_length);
  }
  flush_bits();
  return;
}
2203
/*
  Write all huff_trees to the new datafile.
  Returns the total number of elements in all trees, or 0 on error.
*/
2207
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
{
  uint i,int_length;
  uint tree_no;
  uint codes;
  uint errors= 0;
  uint *packed_tree,*offset,length;
  my_off_t elements;

  /* Find the highest number of elements in the trees. */
  for (i=length=0 ; i < trees ; i++)
    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
      length=huff_tree[i].elements;
  /*
    Allocate a buffer for packing a decode tree. Two numbers per element
    (left child and right child).
  */
  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
  {
    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
    return 0;
  }

  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
    printf("\n");
  tree_no= 0;
  intervall_length=0;
  for (elements=0; trees-- ; huff_tree++)
  {
    /* Skip columns that have been joined with other columns. */
    if (huff_tree->tree_number == 0)
      continue;                         /* Deleted tree */
    tree_no++;
    DBUG_PRINT("info", (" "));
    if (verbose >= 3)
      printf("\n");
    /* Count the total number of elements (byte codes or column values). */
    elements+=huff_tree->elements;
    huff_tree->max_offset=2;
    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
    if (huff_tree->elements <= 1)
      offset=packed_tree;
    else
      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);

    /* This should be the same as 'length' above. */
    huff_tree->offset_bits=max_bit(huff_tree->max_offset);

    /*
      Since we check this during collecting the distinct column values,
      this should never happen.
    */
    if (huff_tree->max_offset >= IS_OFFSET)
    {                                   /* This should be impossible */
      fprintf(stderr, "Tree offset got too big: %d, aborted\n",
              huff_tree->max_offset);
      my_afree(packed_tree);
      return 0;
    }

    DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
                        "char_bits: %u\n",
                        (ulong) (file_buffer.pos - file_buffer.buffer),
                        huff_tree->elements, (ulong) (offset - packed_tree),
                        huff_tree->char_bits));
    /* Write the tree header: its shape depends on the compression kind. */
    if (!huff_tree->counts->tree_buff)
    {
      /* We do a uchar compression on this column. Mark with bit 0. */
      write_bits(0,1);
      write_bits(huff_tree->min_chr,8);
      write_bits(huff_tree->elements,9);
      write_bits(huff_tree->char_bits,5);
      write_bits(huff_tree->offset_bits,5);
      int_length=0;
    }
    else
    {
      int_length=(uint) (huff_tree->counts->tree_pos -
                         huff_tree->counts->tree_buff);
      /* We have distinct column values for this column. Mark with bit 1. */
      write_bits(1,1);
      write_bits(huff_tree->elements,15);
      write_bits(int_length,16);
      write_bits(huff_tree->char_bits,5);
      write_bits(huff_tree->offset_bits,5);
      intervall_length+=int_length;
    }
    DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
                        "offset_bits: %2u  %s: %5u  codelen: %2u",
                        tree_no, huff_tree->elements, huff_tree->char_bits,
                        huff_tree->offset_bits, huff_tree->counts->tree_buff ?
                        "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
                        int_length : huff_tree->min_chr, huff_tree->height));
    if (verbose >= 2)
      printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
             "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
             huff_tree->char_bits, huff_tree->offset_bits,
             huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
             huff_tree->counts->tree_buff ? int_length :
             huff_tree->min_chr, huff_tree->height);

    /* Check that the code tree length matches the element count. */
    length=(uint) (offset-packed_tree);
    if (length != huff_tree->elements*2-2)
    {
      fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
              length, huff_tree->elements * 2 - 2);
      errors++;
      break;
    }

    /* Write each entry: either an offset (marked) or a value code. */
    for (i=0 ; i < length ; i++)
    {
      if (packed_tree[i] & IS_OFFSET)
        write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
                   huff_tree->offset_bits+1);
      else
        write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
      DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
                          i, (packed_tree[i] & IS_OFFSET) ?
                          " -> " : "", (packed_tree[i] & IS_OFFSET) ?
                          packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
      if (verbose >= 3)
        printf("tree[0x%04x]: %s0x%04x\n",
               i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
               (packed_tree[i] & IS_OFFSET) ?
               packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
    }
    flush_bits();

    /*
      Display coding tables and check their correctness.
    */
    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
    for (i= 0; i < codes; i++)
    {
      ulonglong code;
      uint bits;
      uint len;
      uint idx;

      if (! (len= huff_tree->code_len[i]))
        continue;
      DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
                          hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
                          bindigits(huff_tree->code[i],
                                    huff_tree->code_len[i])));
      if (verbose >= 3)
        printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
               hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
               bindigits(huff_tree->code[i], huff_tree->code_len[i]));

      /*
        Check that the encode table decodes correctly: feed the code
        bit by bit through the packed decode tree and verify we land on
        the leaf for value 'i'.
      */
      code= 0;
      bits= 0;
      idx= 0;
      DBUG_EXECUTE_IF("forcechkerr1", len--;);
      DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
      DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
      for (;;)
      {
        if (! len)
        {
          fflush(stdout);
          fprintf(stderr, "error: code 0x%s with %u bits not found\n",
                  hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
          errors++;
          break;
        }
        code<<= 1;
        code|= (huff_tree->code[i] >> (--len)) & 1;
        bits++;
        if (bits > 8 * sizeof(code))
        {
          fflush(stdout);
          fprintf(stderr, "error: Huffman code too long: %u/%u\n",
                  bits, (uint) (8 * sizeof(code)));
          errors++;
          break;
        }
        idx+= (uint) code & 1;
        if (idx >= length)
        {
          fflush(stdout);
          fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
          errors++;
          break;
        }
        if (packed_tree[idx] & IS_OFFSET)
          idx+= packed_tree[idx] & ~IS_OFFSET;
        else
          break; /* Hit a leaf. This contains the result value. */
      }
      if (errors)
        break;

      DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
      if (packed_tree[idx] != i)
      {
        fflush(stdout);
        fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
                packed_tree[idx], i);
        errors++;
        break;
      }
    } /*end for (codes)*/
    if (errors)
      break;

    /* Write column values in case of distinct column value compression. */
    if (huff_tree->counts->tree_buff)
    {
      for (i=0 ; i < int_length ; i++)
      {
        write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
        DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
                            i, (uchar) huff_tree->counts->tree_buff[i]));
        if (verbose >= 3)
          printf("column_values[0x%04x]: 0x%02x\n",
                 i, (uchar) huff_tree->counts->tree_buff[i]);
      }
    }
    flush_bits();
  }
  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
    printf("\n");
  my_afree(packed_tree);
  if (errors)
  {
    fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
    return 0;
  }
  return elements;
}
2444
2445
/*
  Pack the Huffman tree into 'offset' as pairs of uints: each pair holds
  the left and right child of a node, either a leaf value or an
  IS_OFFSET-marked relative jump to the child node's pair.

  RETURN  Pointer just past the last pair written (used for length checks).
*/
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
                                   uint *offset)
{
  uint *prev_offset;

  prev_offset= offset;
  /*
    'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
    then there is no left child and, hence no right child either. This
    is a property of a binary tree. An element is either a node with two
    childs, or a leaf without childs.

    The current element is always a node with two childs. Go left first.
  */
  if (!element->a.nod.left->a.leaf.null)
  {
    /* Store the uchar code or the index of the column value. */
    prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
    offset+=2;
  }
  else
  {
    /*
      Recursively traverse the tree to the left. Mark it as an offset to
      another tree node (in contrast to a uchar code or column value index).
      The left subtree always starts directly after this pair, hence +2.
    */
    prev_offset[0]= IS_OFFSET+2;
    offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
  }

  /* Now, check the right child. */
  if (!element->a.nod.right->a.leaf.null)
  {
    /* Store the uchar code or the index of the column value. */
    prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
    return offset;
  }
  else
  {
    /*
      Recursively traverse the tree to the right. Mark it as an offset to
      another tree node (in contrast to a uchar code or column value index).
    */
    uint temp=(uint) (offset-prev_offset-1);
    prev_offset[1]= IS_OFFSET+ temp;
    /* Remember the widest jump; it determines offset_bits in the file. */
    if (huff_tree->max_offset < temp)
      huff_tree->max_offset = temp;
    return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
  }
}
2496
/* Get number of bits needed to represent value */
2498
max_bit(register uint value)2499 static uint max_bit(register uint value)
2500 {
2501 reg2 uint power=1;
2502
2503 while ((value>>=1))
2504 power++;
2505 return (power);
2506 }
2507
2508
/*
  Read every record of the table(s) in 'mrg', Huffman-compress it
  according to 'huff_counts' and append it to the packed data file via
  the global bit-writing buffer. Tracks min/max packed record length in
  'mrg'.

  Fixes:
  - The early out-of-memory exit used a bare 'return -1' after
    DBUG_ENTER(), unbalancing the DBUG call stack; now DBUG_RETURN(-1).
  - The spaces-only scan read '*pos' before checking 'pos < end_pos',
    which could read one byte past the record buffer for the last field.
  - The FIELD_BLOB trace printed the code bits of '*start_pos' instead
    of the blob byte '*blob' actually being encoded.

  RETURN
    0   ok
    !=0 read error, write error or final flush failure
*/
static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
{
  int error;
  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
  uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
  my_off_t record_count;
  char llbuf[32];
  ulong length,pack_length;
  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
  HUFF_COUNTS *count,*end_count;
  HUFF_TREE *tree;
  MARIA_HA *isam_file=mrg->file[0];
  uint pack_version= (uint) isam_file->s->pack.version;
  DBUG_ENTER("compress_maria_file");

  /* Allocate a buffer for the records (excluding blobs). */
  if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
    DBUG_RETURN(-1);

  end_count=huff_counts+isam_file->s->base.fields;
  min_record_length= (uint) ~0;
  max_record_length=0;
  null_bytes= isam_file->s->base.null_bytes;

  /*
    Calculate the maximum number of bits required to pack the records.
    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
    The tree height determines the maximum number of bits per value.
    Some fields skip leading or trailing spaces or zeroes. The skipped
    number of bytes is encoded by 'length_bits' bits.
    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
    and varchar get a leading 0 bit.
  */
  max_calc_length= null_bytes;
  for (i= 0 ; i < isam_file->s->base.fields ; i++)
  {
    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
      huff_counts[i].max_zero_fill=0;
    if (huff_counts[i].field_type == FIELD_CONSTANT ||
        huff_counts[i].field_type == FIELD_ZERO ||
        huff_counts[i].field_type == FIELD_CHECK)
      continue;
    if (huff_counts[i].field_type == FIELD_INTERVALL)
      max_calc_length+=huff_counts[i].tree->height;
    else if (huff_counts[i].field_type == FIELD_BLOB ||
             huff_counts[i].field_type == FIELD_VARCHAR)
      max_calc_length+= (huff_counts[i].tree->height *
                         huff_counts[i].max_length +
                         huff_counts[i].length_bits + 1);
    else
      max_calc_length+=
        (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
        huff_counts[i].tree->height+huff_counts[i].length_bits;
  }
  max_calc_length= (max_calc_length + 7) / 8;
  pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
  record_count=0;
  /* 'max_blob_length' is the max length of all blobs of a record. */
  pack_blob_length= isam_file->s->base.blobs ?
                    _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
  max_pack_length=pack_ref_length+pack_blob_length;

  DBUG_PRINT("fields", ("==="));
  mrg_reset(mrg);
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
  {
    ulong tot_blob_length=0;
    if (! error)
    {
      /* Reserve room for the worst-case packed record plus its header. */
      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
                       null_bytes))
        break;
      record_pos= file_buffer.pos;
      file_buffer.pos+= max_pack_length;
      if (null_bytes)
      {
        /* Copy null bits 'as is' */
        memcpy(file_buffer.pos, record, null_bytes);
        file_buffer.pos+= null_bytes;
      }
      for (start_pos=record+null_bytes, count= huff_counts;
           count < end_count ;
           count++)
      {
        end_pos=start_pos+(field_length=count->field_length);
        tree=count->tree;

        DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
                              "lbits: %2u  tree: %2u  length: %4u",
                              (ulong) (count - huff_counts + 1),
                              count->field_type,
                              count->pack_type, count->max_zero_fill,
                              count->length_bits, count->tree->tree_number,
                              count->field_length));

        /* Check if the column contains spaces only. */
        if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
        {
          /* Bound check first: do not read past the field end. */
          for (pos=start_pos ; pos < end_pos && *pos == ' ' ; pos++) ;
          if (pos == end_pos)
          {
            DBUG_PRINT("fields",
                       ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
            DBUG_PRINT("fields", ("---"));
            write_bits(1,1);
            start_pos=end_pos;
            continue;
          }
          DBUG_PRINT("fields",
                     ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
          write_bits(0,1);
        }
        end_pos-=count->max_zero_fill;
        field_length-=count->max_zero_fill;

        switch (count->field_type) {
        case FIELD_SKIP_ZERO:
          if (!memcmp(start_pos, zero_string, field_length))
          {
            DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
            write_bits(1,1);
            start_pos=end_pos;
            break;
          }
          DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
          write_bits(0,1);
          /* Fall through */
        case FIELD_NORMAL:
          DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
                                (ulong) (end_pos - start_pos)));
          for ( ; start_pos < end_pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          break;
        case FIELD_SKIP_ENDSPACE:
          for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
          length= (ulong) (end_pos - pos);
          if (count->pack_type & PACK_TYPE_SELECTED)
          {
            if (length > count->min_space)
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
                          length, field_length, count->length_bits));
              write_bits(1,1);
              write_bits(length,count->length_bits);
            }
            else
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE not more than min_space, "
                          "bits:  1"));
              write_bits(0,1);
              pos=end_pos;
            }
          }
          else
          {
            DBUG_PRINT("fields",
                       ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
                        length, field_length, count->length_bits));
            write_bits(length,count->length_bits);
          }
          /* Encode all significant bytes. */
          DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
                                (ulong) (pos - start_pos)));
          for ( ; start_pos < pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          start_pos=end_pos;
          break;
        case FIELD_SKIP_PRESPACE:
          for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
          length= (ulong) (pos - start_pos);
          if (count->pack_type & PACK_TYPE_SELECTED)
          {
            if (length > count->min_space)
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
                          length, field_length, count->length_bits));
              write_bits(1,1);
              write_bits(length,count->length_bits);
            }
            else
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE not more than min_space, "
                          "bits:  1"));
              pos=start_pos;
              write_bits(0,1);
            }
          }
          else
          {
            DBUG_PRINT("fields",
                       ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
                        length, field_length, count->length_bits));
            write_bits(length,count->length_bits);
          }
          /* Encode all significant bytes. */
          DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
                                (ulong) (end_pos - start_pos)));
          for (start_pos=pos ; start_pos < end_pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          break;
        case FIELD_CONSTANT:
        case FIELD_ZERO:
        case FIELD_CHECK:
          /* These fields carry no information; nothing is written. */
          DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
          start_pos=end_pos;
          break;
        case FIELD_INTERVALL:
          global_count=count;
          pos=(uchar*) tree_search(&count->int_tree, start_pos,
                                   count->int_tree.custom_arg);
          intervall=(uint) (pos - count->tree_buff)/field_length;
          DBUG_PRINT("fields", ("FIELD_INTERVALL"));
          DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
                                intervall, hexdigits(tree->code[intervall]),
                                (uint) tree->code_len[intervall]));
          write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
          start_pos=end_pos;
          break;
        case FIELD_BLOB:
        {
          ulong blob_length= _ma_calc_blob_length(field_length-
                                                  portable_sizeof_char_ptr,
                                                  start_pos);
          /* Empty blobs are encoded with a single 1 bit. */
          if (!blob_length)
          {
            DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
            write_bits(1,1);
          }
          else
          {
            uchar *blob,*blob_end;
            DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
            write_bits(0,1);
            /* Write the blob length. */
            DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
                                  blob_length, count->length_bits));
            write_bits(blob_length,count->length_bits);
            memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
            blob_end=blob+blob_length;
            /* Encode the blob bytes. */
            for ( ; blob < blob_end ; blob++)
            {
              DBUG_PRINT("fields",
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                          (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
                          (uint) tree->code_len[(uchar) *blob],
                          bindigits(tree->code[(uchar) *blob],
                                    (uint) tree->code_len[(uchar) *blob])));
              write_bits(tree->code[(uchar) *blob],
                         (uint) tree->code_len[(uchar) *blob]);
            }
            tot_blob_length+=blob_length;
          }
          start_pos= end_pos;
          break;
        }
        case FIELD_VARCHAR:
        {
          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
          ulong col_length= (var_pack_length == 1 ?
                             (uint) *(uchar*) start_pos :
                             uint2korr(start_pos));
          /* Empty varchar are encoded with a single 1 bit. */
          if (!col_length)
          {
            DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
            write_bits(1,1);                    /* Empty varchar */
          }
          else
          {
            uchar *end= start_pos + var_pack_length + col_length;
            DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
            write_bits(0,1);
            /* Write the varchar length. */
            DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
                                  col_length, count->length_bits));
            write_bits(col_length,count->length_bits);
            /* Encode the varchar bytes. */
            for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
            {
              DBUG_PRINT("fields",
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                          (uchar) *start_pos,
                          hexdigits(tree->code[(uchar) *start_pos]),
                          (uint) tree->code_len[(uchar) *start_pos],
                          bindigits(tree->code[(uchar) *start_pos],
                                    (uint)tree->code_len[(uchar) *start_pos])));
              write_bits(tree->code[(uchar) *start_pos],
                         (uint) tree->code_len[(uchar) *start_pos]);
            }
          }
          start_pos= end_pos;
          break;
        }
        case FIELD_LAST:
        case FIELD_enum_val_count:
          abort();                              /* Impossible */
        }
        start_pos+=count->max_zero_fill;
        DBUG_PRINT("fields", ("---"));
      }
      flush_bits();
      length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
      pack_length= _ma_save_pack_length(pack_version, record_pos, length);
      if (pack_blob_length)
        pack_length+= _ma_save_pack_length(pack_version,
                                           record_pos + pack_length,
                                           tot_blob_length);
      DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
                            "length-bytes: %lu", (ulong) record_count, length,
                            tot_blob_length, pack_length));
      DBUG_PRINT("fields", ("==="));

      /* Correct file buffer if the header was smaller */
      if (pack_length != max_pack_length)
      {
        bmove(record_pos+pack_length,record_pos+max_pack_length,length);
        file_buffer.pos-= (max_pack_length-pack_length);
      }
      if (length < (ulong) min_record_length)
        min_record_length=(uint) length;
      if (length > (ulong) max_record_length)
        max_record_length=(uint) length;
      record_count++;
      if (write_loop && record_count % WRITE_COUNT == 0)
      {
        printf("%lu\r", (ulong) record_count);
        fflush(stdout);
      }
    }
    else if (error != HA_ERR_RECORD_DELETED)
      break;
  }
  if (error == HA_ERR_END_OF_FILE)
    error=0;
  else
  {
    fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
  }
  if (verbose >= 2)
    printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));

  my_safe_afree(record, isam_file->s->base.reclength);
  mrg->ref_length=max_pack_length;
  mrg->min_pack_length=max_record_length ? min_record_length : 0;
  mrg->max_pack_length=max_record_length;
  DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
}
2894
2895
/* Build the name of the temporary packed data file (".TMD") in new_name. */

static char *make_new_name(char *new_name, char *old_name)
{
  return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
}
2900
/* Build the backup name of the original data file (".OLD") in new_name. */

static char *make_old_name(char *new_name, char *old_name)
{
  return fn_format(new_name,old_name,"",OLD_EXT,2+4);
}
2905
/* Routines for the bit-writing buffer */
2907
static void init_file_buffer(File file, pbool read_buffer)
{
  /* Bind the global bit-stream buffer to 'file' and reset its state. */
  file_buffer.file= file;
  file_buffer.buffer= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED,
                                         ALIGN_SIZE(RECORD_CACHE_SIZE),
                                         MYF(MY_WME));
  /* Keep 8 spare bytes at the end so write_bits() may overshoot safely. */
  file_buffer.end= file_buffer.buffer + ALIGN_SIZE(RECORD_CACHE_SIZE) - 8;
  file_buffer.pos_in_file= 0;
  error_on_write= 0;
  if (!read_buffer)
  {
    /* Writing: start at the beginning with a full bit bucket available. */
    file_buffer.pos= file_buffer.buffer;
    file_buffer.bits= BITS_SAVED;
  }
  else
  {
    /* Reading: start at the end so the first access forces a refill. */
    file_buffer.pos= file_buffer.end;
    file_buffer.bits= 0;
  }
  file_buffer.bitbucket= 0;
}
2929
2930
/*
  Flush the write buffer to disk when fewer than 'neaded_length' bytes
  remain free in it, growing the buffer if it can never hold that many
  bytes.  Pass ~(ulong) 0 to force an unconditional flush without
  growing.  Returns 0 on success, 1 on write or realloc failure
  (error_on_write is then set and stays set).
*/
static int flush_buffer(ulong neaded_length)
{
  ulong length;

  /*
    file_buffer.end is 8 bytes lower than the real end of the buffer.
    This is done so that the end-of-buffer condition does not need to be
    checked for every uchar (see write_bits()). Consequently,
    file_buffer.pos can become greater than file_buffer.end. The
    algorithms in the other functions ensure that there will never be
    more than 8 bytes written to the buffer without an end-of-buffer
    check. So the buffer cannot be overrun. But we need to check for the
    near-to-buffer-end condition to avoid a negative result, which is
    casted to unsigned and thus becomes giant.
  */
  if ((file_buffer.pos < file_buffer.end) &&
      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
    return 0;
  length=(ulong) (file_buffer.pos-file_buffer.buffer);
  file_buffer.pos=file_buffer.buffer;
  file_buffer.pos_in_file+=length;
  if (test_only)
    return 0;                        /* Dry run: only count output bytes */
  if (error_on_write|| my_write(file_buffer.file,
                                (const uchar*) file_buffer.buffer,
                                length,
                                MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
  {
    error_on_write=1;                /* Sticky: all later flushes fail too */
    return 1;
  }

  /* Buffer is now empty; grow it if it cannot hold neaded_length bytes */
  if (neaded_length != ~(ulong) 0 &&
      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
  {
    uchar *tmp;
    neaded_length+=256;                         /* some margin */
    tmp= (uchar*) my_realloc(PSI_NOT_INSTRUMENTED, file_buffer.buffer,
                             neaded_length,MYF(MY_WME));
    if (!tmp)
      return 1;
    /* Keep the write position relative to the (possibly moved) buffer */
    file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
    file_buffer.buffer= tmp;
    file_buffer.end= (tmp+neaded_length-8);     /* Keep 8 spare bytes */
  }
  return 0;
}
2978
2979
/* Release the global bit-I/O buffer allocated by init_file_buffer() */
static void end_file_buffer(void)
{
  my_free(file_buffer.buffer);
}
2984
2985 /* output `bits` low bits of `value' */
2986
/*
  Append the 'bits' low bits of 'value' to the output bit stream.

  Bits are accumulated MSB-first in file_buffer.bitbucket; when the
  bucket overflows, one full BITS_SAVED-bit chunk is written to the
  byte buffer (big-endian) and the remainder is kept for the next call.
*/
static void write_bits(register ulonglong value, register uint bits)
{
  /* 'value' must not have any bits set above the 'bits' low ones */
  DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
              (bits == 8 * sizeof(value)));

  if ((file_buffer.bits-= (int) bits) >= 0)
  {
    /* Everything fits into the bucket at the current bit position */
    file_buffer.bitbucket|= value << file_buffer.bits;
  }
  else
  {
    reg3 ulonglong bit_buffer;
    bits= (uint) -file_buffer.bits;   /* Number of bits that did not fit */
    /* Top up the bucket with the high part of 'value' (guard the UB shift
       by full word width when bits == 64) */
    bit_buffer= (file_buffer.bitbucket |
                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
    /* Emit the full bucket, most significant byte first */
#if BITS_SAVED == 64
    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
#endif
    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
    *file_buffer.pos++= (uchar) (bit_buffer);

    /* Mask 'value' down to the leftover low bits */
    if (bits != 8 * sizeof(value))
      value&= (((ulonglong) 1) << bits) - 1;
    if (file_buffer.pos >= file_buffer.end)
      flush_buffer(~ (ulong) 0);
    /* Start a fresh bucket holding the leftover bits at the top */
    file_buffer.bits=(int) (BITS_SAVED - bits);
    file_buffer.bitbucket= value << (BITS_SAVED - bits);
  }
  return;
}
3022
3023 /* Flush bits in bit_buffer to buffer */
3024
/*
  Flush any pending bits from file_buffer.bitbucket to the byte buffer,
  padding the final partial byte with zero bits, and reset the bucket.
*/
static void flush_bits(void)
{
  int bits;
  ulonglong bit_buffer;

  /* Round the count of unused bits down to a whole number of bytes */
  bits= file_buffer.bits & ~7;
  bit_buffer= file_buffer.bitbucket >> bits;
  bits= BITS_SAVED - bits;          /* Number of pending bits to write */
  /* Emit the pending bytes, most significant first */
  while (bits > 0)
  {
    bits-= 8;
    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
  }
  if (file_buffer.pos >= file_buffer.end)
    flush_buffer(~ (ulong) 0);
  /* Bucket is now empty again */
  file_buffer.bits= BITS_SAVED;
  file_buffer.bitbucket= 0;
}
3043
3044
3045 /****************************************************************************
3046 ** functions to handle the joined files
3047 ****************************************************************************/
3048
/*
  Rewrite the index-file state of a freshly packed table.

  Marks the table as compressed/read-only, stores the new data file
  length and checksum, disables all keys (they must be rebuilt, e.g.
  with "aria_chk -rq") and truncates the index file to the key start.

  Returns the result of _ma_state_info_write_sub() (0 on success).
*/
static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
                      my_off_t new_length,
                      ha_checksum crc)
{
  MARIA_SHARE *share=isam_file->s;
  uint options=mi_uint2korr(share->state.header.options);
  uint key;
  DBUG_ENTER("save_state");

  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
  mi_int2store(share->state.header.options,options);
  /* Save the original file type in case we have to undo the packing later */
  share->state.header.org_data_file_type= share->state.header.data_file_type;
  share->state.header.data_file_type= COMPRESSED_RECORD;

  share->state.state.data_file_length=new_length;
  share->state.state.del=0;
  share->state.state.empty=0;
  share->state.dellink= HA_OFFSET_ERROR;
  share->state.split=(ha_rows) mrg->records;
  share->state.version=(ulong) time((time_t*) 0);
  /* Transactional tables need fresh LSNs after the file was rewritten */
  if (share->base.born_transactional)
    share->state.create_rename_lsn= share->state.is_of_horizon=
      share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
  {
    /*
      Some indexes are disabled, cannot use current key_file_length value
      as an estimate of upper bound of index file size. Use packed data file
      size instead.
    */
    share->state.state.key_file_length= new_length;
  }
  /*
    If there are no disabled indexes, keep key_file_length value from
    original file so "aria_chk -rq" can use this value (this is necessary
    because index size cannot be easily calculated for fulltext keys)
  */
  maria_clear_all_keys_active(share->state.key_map);
  for (key=0 ; key < share->base.keys ; key++)
    share->state.key_root[key]= HA_OFFSET_ERROR;
  share->state.key_del= HA_OFFSET_ERROR;
  share->state.state.checksum= crc;     /* Save crc in file */
  share->changed=1;                     /* Force write of header */
  share->state.open_count=0;
  share->global_changed=0;
  /* Drop the old key data; keys will be rebuilt later */
  my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
  if (share->base.keys)
    isamchk_neaded=1;
  DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
                                       &share->state,
                                       MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                       MA_STATE_INFO_WRITE_FULL_INFO));
}
3103
3104
/*
  Write a combined state header for a table packed from several source
  tables (join/merge).  A copy of the first source table's state is
  updated with the merged totals and written to 'file'.

  Returns the result of _ma_state_info_write_sub() (0 on success).
*/
static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
                          ha_checksum crc)
{
  MARIA_STATE_INFO state;
  MARIA_HA *isam_file=mrg->file[0];
  uint options;
  DBUG_ENTER("save_state_mrg");

  /* Start from a copy of the first source table's state */
  state= isam_file->s->state;
  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
            HA_OPTION_READ_ONLY_DATA);
  mi_int2store(state.header.options,options);
  /* Save the original file type in case we have to undo the packing later */
  state.header.org_data_file_type= state.header.data_file_type;
  state.header.data_file_type= COMPRESSED_RECORD;

  state.state.data_file_length=new_length;
  state.state.del=0;
  state.state.empty=0;
  state.state.records=state.split=(ha_rows) mrg->records;
  state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
    LSN_NEEDS_NEW_STATE_LSNS;

  /* See comment above in save_state about key_file_length handling. */
  if (mrg->src_file_has_indexes_disabled)
  {
    isam_file->s->state.state.key_file_length=
      MY_MAX(isam_file->s->state.state.key_file_length, new_length);
  }
  state.dellink= HA_OFFSET_ERROR;
  state.version=(ulong) time((time_t*) 0);
  maria_clear_all_keys_active(state.key_map);
  state.state.checksum=crc;
  if (isam_file->s->base.keys)
    isamchk_neaded=1;
  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
  DBUG_RETURN (_ma_state_info_write_sub(file, &state,
                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                        MA_STATE_INFO_WRITE_FULL_INFO));
}
3145
3146
3147 /* reset for mrg_rrnd */
3148
/* Prepare for a new mrg_rrnd() pass: drop the cache on the active table */
static void mrg_reset(PACK_MRG_INFO *mrg)
{
  if (!mrg->current)
    return;
  maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
  mrg->current=0;
}
3157
/*
  Read the next record sequentially from the merge set into 'buf'.

  Scans the current table; when it is exhausted the scan moves on to
  the next table in info->file[].  Returns 0 on success,
  HA_ERR_END_OF_FILE when every table has been read, or another
  handler error code.
*/
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
{
  int error;
  MARIA_HA *isam_info;

  if (!info->current)
  {
    /* First call: start a cached scan on the first table */
    isam_info= *(info->current=info->file);
    info->end=info->current+info->count;
    maria_reset(isam_info);
    maria_extra(isam_info, HA_EXTRA_CACHE, 0);
    if ((error= maria_scan_init(isam_info)))
      return(error);
  }
  else
    isam_info= *info->current;

  for (;;)
  {
    /* Return on success or on any error other than end-of-file */
    if (!(error= maria_scan(isam_info, buf)) ||
        error != HA_ERR_END_OF_FILE)
      return (error);
    /* Current table exhausted: tear down its scan and advance */
    maria_scan_end(isam_info);
    maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
    if (info->current+1 == info->end)
      return(HA_ERR_END_OF_FILE);
    info->current++;
    isam_info= *info->current;
    maria_reset(isam_info);
    maria_extra(isam_info,HA_EXTRA_CACHE, 0);
    if ((error= maria_scan_init(isam_info)))
      return(error);
  }
}
3192
3193
/*
  Close every table in the merge set and free the file array if it was
  allocated by us.  Returns the OR of all maria_close() results
  (0 if every close succeeded).
*/
static int mrg_close(PACK_MRG_INFO *mrg)
{
  uint idx;
  int result= 0;
  DBUG_ENTER("mrg_close");

  for (idx= 0; idx < mrg->count; idx++)
    result|= maria_close(mrg->file[idx]);
  if (mrg->free_file)
    my_free(mrg->file);
  DBUG_RETURN(result);
}
3206
3207
3208 #if !defined(DBUG_OFF)
3209 /*
3210 Fake the counts to get big Huffman codes.
3211
3212 SYNOPSIS
3213 fakebigcodes()
3214 huff_counts A pointer to the counts array.
3215 end_count A pointer past the counts array.
3216
3217 DESCRIPTION
3218
3219 Huffman coding works by removing the two least frequent values from
3220 the list of values and add a new value with the sum of their
3221 incidences in a loop until only one value is left. Every time a
3222 value is reused for a new value, it gets one more bit for its
3223 encoding. Hence, the least frequent values get the longest codes.
3224
3225 To get a maximum code length for a value, two of the values must
3226 have an incidence of 1. As their sum is 2, the next infrequent value
3227 must have at least an incidence of 2, then 4, 8, 16 and so on. This
3228 means that one needs 2**n bytes (values) for a code length of n
3229 bits. However, using more distinct values forces the use of longer
3230 codes, or reaching the code length with less total bytes (values).
3231
3232 To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3233 I assign counts of 1 to the two most frequent values, a count of 2
3234 for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3235 the remaining values get 1. That way every possible uchar has an
3236 assigned code, though not all codes are used if not all uchar values
3237 are present in the column.
3238
3239 This strategy would work with distinct column values too, but
3240 requires that at least 64(32) values are present. To make things
3241 easier here, I cancel all distinct column values and force byte
3242 compression for all columns.
3243
3244 RETURN
3245 void
3246 */
3247
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
{
  HUFF_COUNTS *count;
  my_off_t *cur_count_p;
  my_off_t *end_count_p;
  my_off_t **cur_sort_p;
  my_off_t **end_sort_p;
  my_off_t *sort_counts[256];
  my_off_t total;
  DBUG_ENTER("fakebigcodes");

  for (count= huff_counts; count < end_count; count++)
  {
    /*
      Remove distinct column values.
      Fixed: operate on the loop variable 'count'.  The old code tested
      and freed huff_counts->tree_buff (always the FIRST column) on
      every iteration, so distinct values were cancelled for the first
      column only, contradicting the intent described above.
    */
    if (count->tree_buff)
    {
      my_free(count->tree_buff);
      delete_tree(&count->int_tree, 0);
      count->tree_buff= NULL;
      DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
    }

    /*
      Sort pointers to this column's 256 byte-value counts by
      decreasing incidence.
    */
    cur_count_p= count->counts;
    end_count_p= cur_count_p + 256;
    cur_sort_p= sort_counts;
    while (cur_count_p < end_count_p)
      *(cur_sort_p++)= cur_count_p++;
    (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);

    /*
      Assign faked counts 1, 1, 2, 4, 8, ... so that the Huffman
      algorithm produces codes of maximum length (see the function
      comment above for the reasoning).
    */
    cur_sort_p= sort_counts;
#if SIZEOF_LONG_LONG > 4
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
#else
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
#endif
    /* Most frequent value gets a faked count of 1. */
    **(cur_sort_p++)= 1;
    total= 1;
    while (cur_sort_p < end_sort_p)
    {
      **(cur_sort_p++)= total;
      total<<= 1;
    }
    /* Set the last value. */
    **(cur_sort_p++)= --total;
    /*
      Set the remaining counts: every remaining byte value occurs once,
      so each possible uchar has an assigned code.
    */
    end_sort_p= sort_counts + 256;
    while (cur_sort_p < end_sort_p)
      **(cur_sort_p++)= 1;
  }
  DBUG_VOID_RETURN;
}
3310
3311
3312 /*
3313 Compare two counts for reverse sorting.
3314
3315 SYNOPSIS
3316 fakecmp()
3317 count1 One count.
3318 count2 Another count.
3319
3320 RETURN
3321 1 count1 < count2
3322 0 count1 == count2
3323 -1 count1 > count2
3324 */
3325
/*
  qsort() comparator used by fakebigcodes(): orders pointed-to counts
  in DECREASING order (returns 1 when count1 < count2, -1 when
  count1 > count2, 0 when equal).
*/
static int fakecmp(my_off_t **count1, my_off_t **count2)
{
  if (**count1 < **count2)
    return 1;
  if (**count1 > **count2)
    return -1;
  return 0;
}
3331 #endif
3332
3333 #include "ma_check_standalone.h"
3334
3335