/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */

/* Pack MARIA file */
17
#ifndef USE_MY_FUNC
#define USE_MY_FUNC			/* We need at least my_malloc */
#endif

#include "maria_def.h"
#include <queues.h>
#include <my_tree.h>
#include "mysys_err.h"
#ifdef MSDOS
#include <io.h>
#endif
#ifndef __GNU_LIBRARY__
#define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
#endif
#include <my_getopt.h>
#include <assert.h>

/* Width of the bit accumulator used when writing packed data */
#if SIZEOF_LONG_LONG > 4
#define BITS_SAVED 64
#else
#define BITS_SAVED 32
#endif

#define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
#define HEAD_LENGTH	32		/* Size of fixed compressed-file header */
#define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */

#define DATA_TMP_EXT		".TMD"	/* Extension of temporary data file */
#define OLD_EXT			".OLD"	/* Extension of backup of original */
#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
48
/*
  Output buffer for writing the packed data file.
  NOTE(review): the exact bit bookkeeping is implemented in
  write_bits()/flush_bits()/flush_buffer(), outside this view.
*/
struct st_file_buffer {
  File file;				/* Destination file descriptor */
  uchar *buffer,*pos,*end;		/* Buffer start, write position, end */
  my_off_t pos_in_file;			/* File offset the buffer starts at */
  int bits;				/* Bit counter for 'bitbucket' */
  ulonglong bitbucket;			/* Accumulator for partial bytes */
};
56
struct st_huff_tree;
struct st_huff_element;

/*
  Per-column statistics gathered while scanning the source table(s).
  Used to choose the compression method for the column and to build
  its Huffman tree.
*/
typedef struct st_huff_counts {
  uint field_length,max_zero_fill;
  uint pack_type;			/* Pack method chosen for the column */
  uint max_end_space,max_pre_space,length_bits,min_space;
  ulong max_length;			/* Longest value seen (blob/varchar) */
  enum en_fieldtype field_type;
  struct st_huff_tree *tree;		/* Tree for field */
  my_off_t counts[256];			/* Presumably per byte-value counts; see get_statistic() */
  my_off_t end_space[8];		/* Counts of short trailing-space runs */
  my_off_t pre_space[8];		/* Counts of short leading-space runs */
  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
  TREE int_tree;        /* Tree for detecting distinct column values. */
  uchar *tree_buff;     /* Column values, 'field_length' each. */
  uchar *tree_pos;      /* Points to end of column values in 'tree_buff'. */
} HUFF_COUNTS;
75
typedef struct st_huff_element HUFF_ELEMENT;

/*
  WARNING: It is crucial for the optimizations in calc_packed_length()
  that 'count' is the first element of 'HUFF_ELEMENT'.
*/
struct st_huff_element {
  my_off_t count;			/* Incidence count of this element */
  union un_element {
    struct st_nod {			/* Internal node: two children */
      HUFF_ELEMENT *left,*right;
    } nod;
    struct st_leaf {			/* Leaf node: a single value */
      HUFF_ELEMENT *null;
      uint element_nr;			/* Number of element */
    } leaf;
  } a;
};
94
95
/*
  One Huffman tree, possibly shared by several columns, together with
  the code table ('code'/'code_len' per element) derived from it.
*/
typedef struct st_huff_tree {
  HUFF_ELEMENT *root,*element_buffer;
  HUFF_COUNTS *counts;			/* Statistics the tree was built from */
  uint tree_number;
  uint elements;
  my_off_t bytes_packed;		/* Packed size predicted for this tree */
  uint tree_pack_length;
  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
  ulonglong *code;			/* Huffman code per element */
  uchar *code_len;			/* Code length in bits per element */
} HUFF_TREE;
107
108
/*
  State for reading one or more source tables as a single record
  stream; used both for single-table compression and for --join.
*/
typedef struct st_isam_mrg {
  MARIA_HA **file,**current,**end;	/* Source files and read cursor */
  uint free_file;			/* True if 'file' array was malloc'ed */
  uint count;				/* Number of source files */
  uint	min_pack_length;	/* These are used by packed data */
  uint	max_pack_length;
  uint	ref_length;
  uint	max_blob_length;
  my_off_t records;			/* Total records over all files */
  /* true if at least one source file has at least one disabled index */
  my_bool src_file_has_indexes_disabled;
} PACK_MRG_INFO;
121
122
/* Function prototypes */

extern int main(int argc,char * *argv);
static void get_options(int *argc,char ***argv);
static MARIA_HA *open_maria_file(char *name,int mode);
static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
static int compress(PACK_MRG_INFO *file,char *join_name);
static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
					   uint trees,
					   HUFF_COUNTS *huff_counts,
					   uint fields);
static int compare_tree(void* cmp_arg __attribute__((unused)),
			const uchar *s,const uchar *t);
static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
			 my_off_t records);
static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
			       uint max_space_length,my_off_t *space_counts,
			       my_off_t tot_space_count,
			       enum en_fieldtype field_type);
static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
static int save_counts_in_queue(uchar *key,element_count count,
				HUFF_TREE *tree);
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
				    HUFF_ELEMENT *element,uint size,
				    ulonglong code);
static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
			my_off_t tot_elements,my_off_t filelength);
static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
				   HUFF_ELEMENT *element,
				   uint *offset);
static uint max_bit(uint value);
static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
static char *make_new_name(char *new_name,char *old_name);
static char *make_old_name(char *new_name,char *old_name);
static void init_file_buffer(File file,pbool read_buffer);
static int flush_buffer(ulong neaded_length);
static void end_file_buffer(void);
static void write_bits(ulonglong value, uint bits);
static void flush_bits(void);
static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
		      my_off_t new_length, ha_checksum crc);
static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
			  my_off_t new_length, ha_checksum crc);
static int mrg_close(PACK_MRG_INFO *mrg);
static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
static void mrg_reset(PACK_MRG_INFO *mrg);
#if !defined(DBUG_OFF)
static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
static int fakecmp(my_off_t **count1, my_off_t **count2);
#endif
180
181
/* Global flags and state; mostly set from command-line options */
static int error_on_write=0,test_only=0,verbose=0,silent=0,
	   write_loop=0,force_pack=0, isamchk_neaded=0;
static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
static my_bool backup, opt_wait;
/*
  tree_buff_length is somewhat arbitrary. The bigger it is the better
  the chance to win in terms of compression factor. On the other hand,
  this table becomes part of the compressed file header. And its length
  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
*/
static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
static char tmp_dir[FN_REFLEN]={0},*join_table;
static my_off_t intervall_length;
static ha_checksum glob_crc;		/* Checksum over all source records */
static struct st_file_buffer file_buffer; /* Bit-packed output buffer */
static QUEUE queue;			/* Priority queue for Huffman build */
static HUFF_COUNTS *global_count;	/* Current counts; used by compare_tree */
static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
static const char *load_default_groups[]= { "ariapack",0 };
201
202 /* The main program */
203
main(int argc,char ** argv)204 int main(int argc, char **argv)
205 {
206 int error,ok;
207 PACK_MRG_INFO merge;
208 char **default_argv;
209 MY_INIT(argv[0]);
210
211 load_defaults_or_exit("my", load_default_groups, &argc, &argv);
212 default_argv= argv;
213 get_options(&argc,&argv);
214 maria_init();
215
216 error=ok=isamchk_neaded=0;
217 if (join_table)
218 { /* Join files into one */
219 if (open_maria_files(&merge,argv,(uint) argc) ||
220 compress(&merge,join_table))
221 error=1;
222 }
223 else while (argc--)
224 {
225 MARIA_HA *isam_file;
226 if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
227 error=1;
228 else
229 {
230 merge.file= &isam_file;
231 merge.current=0;
232 merge.free_file=0;
233 merge.count=1;
234 if (compress(&merge,0))
235 error=1;
236 else
237 ok=1;
238 }
239 }
240 if (ok && isamchk_neaded && !silent)
241 puts("Remember to run aria_chk -rq on compressed tables");
242 fflush(stdout);
243 fflush(stderr);
244 free_defaults(default_argv);
245 maria_end();
246 my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
247 exit(error ? 2 : 0);
248 #ifndef _lint
249 return 0; /* No compiler warning */
250 #endif
251 }
252
/* Option ids for long-only options (start above any printable char) */
enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};

/* Command-line option table for handle_options() */
static struct my_option my_long_options[] =
{
#ifdef __NETWARE__
  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
#endif
  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
   &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
   "Directory where character sets are.", (char**) &charsets_dir,
   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
  {"force", 'f',
   "Force packing of table even if it gets bigger or if tempfile exists.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"join", 'j',
   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
   &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
   0, 0, 0},
  {"help", '?', "Display this help and exit.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"silent", 's', "Be more silent.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
  {"test", 't', "Don't pack table, only test packing it.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"version", 'V', "Output version information and exit.",
   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
  {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
   &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
};
291
292
print_version(void)293 static void print_version(void)
294 {
295 printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
296 }
297
298
usage(void)299 static void usage(void)
300 {
301 print_version();
302 puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
303 puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
304 puts("and you are welcome to modify and redistribute it under the GPL license\n");
305
306 puts("Pack a Aria-table to take much less space.");
307 puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
308 puts("afterwards to update the keys.");
309 puts("You should give the .MAI file as the filename argument.");
310 puts("To unpack a packed table, run aria_chk -u on the table");
311
312 printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
313 my_print_help(my_long_options);
314 print_defaults("my", load_default_groups);
315 my_print_variables(my_long_options);
316 }
317
318
319 static my_bool
get_one_option(int optid,const struct my_option * opt,char * argument)320 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
321 char *argument)
322 {
323 uint length;
324
325 switch(optid) {
326 #ifdef __NETWARE__
327 case OPT_AUTO_CLOSE:
328 setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
329 break;
330 #endif
331 case 'f':
332 force_pack= 1;
333 tmpfile_createflag= O_RDWR | O_TRUNC;
334 break;
335 case 's':
336 write_loop= verbose= 0;
337 silent= 1;
338 break;
339 case 't':
340 test_only= 1;
341 /* Avoid to reset 'verbose' if it was already set > 1. */
342 if (! verbose)
343 verbose= 1;
344 break;
345 case 'T':
346 length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
347 if (length != dirname_length(tmp_dir))
348 {
349 tmp_dir[length]=FN_LIBCHAR;
350 tmp_dir[length+1]=0;
351 }
352 break;
353 case 'v':
354 verbose++; /* Allow for selecting the level of verbosity. */
355 silent= 0;
356 break;
357 case '#':
358 DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
359 break;
360 case 'V':
361 print_version();
362 exit(0);
363 case 'I':
364 case '?':
365 usage();
366 exit(0);
367 }
368 return 0;
369 }
370
371 /* reads options */
372 /* Initiates DEBUG - but no debugging here ! */
373
get_options(int * argc,char *** argv)374 static void get_options(int *argc,char ***argv)
375 {
376 int ho_error;
377
378 my_progname= argv[0][0];
379 if (isatty(fileno(stdout)))
380 write_loop=1;
381
382 if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
383 exit(ho_error);
384
385 if (!*argc)
386 {
387 usage();
388 exit(1);
389 }
390 if (join_table)
391 {
392 backup=0; /* Not needed */
393 tmp_dir[0]=0;
394 }
395 return;
396 }
397
398
/*
  Open one Aria table for packing.

  Refuses to open a table that is already compressed (unless --force),
  or that is too small to be worth compressing. On success the table
  is write-locked and transaction ids are ignored.

  Returns the open handle, or 0 on failure (message printed to stderr).
*/
static MARIA_HA *open_maria_file(char *name,int mode)
{
  MARIA_HA *isam_file;
  MARIA_SHARE *share;
  DBUG_ENTER("open_maria_file");

  if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
                             (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
                              HA_OPEN_ABORT_IF_LOCKED))))
  {
    fprintf(stderr, "%s gave error %d on open\n", name, my_errno);
    DBUG_RETURN(0);
  }
  share=isam_file->s;
  if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
  {
    if (!force_pack)
    {
      fprintf(stderr, "%s is already compressed\n", name);
      maria_close(isam_file);
      DBUG_RETURN(0);
    }
    if (verbose)
      puts("Recompressing already compressed table");
    share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
  }
  if (! force_pack && share->state.state.records != 0 &&
      (share->state.state.records <= 1 ||
       share->state.state.data_file_length < 1024))
  {
    fprintf(stderr, "%s is too small to compress\n", name);
    maria_close(isam_file);
    DBUG_RETURN(0);
  }
  maria_lock_database(isam_file,F_WRLCK);
  maria_ignore_trids(isam_file);
  DBUG_RETURN(isam_file);
}
437
438
/*
  Open all source tables for a --join operation and check that they
  have identical row layouts (reclength, field count, and per-column
  type/length must all match).

  Returns 0 on success; 1 on failure, with all files opened so far
  closed again and the file array freed.
*/
static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
{
  uint i,j;
  mrg->count=0;
  mrg->current=0;
  mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
  mrg->free_file=1;
  mrg->src_file_has_indexes_disabled= 0;
  for (i=0; i < count ; i++)
  {
    if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
      goto error;

    mrg->src_file_has_indexes_disabled|=
      ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
                                 mrg->file[i]->s->base.keys);
  }
  /* Check that files are identical */
  for (j=0 ; j < count-1 ; j++)
  {
    MARIA_COLUMNDEF *m1,*m2,*end;
    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
        mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
      goto diff_file;
    m1=mrg->file[j]->s->columndef;
    end=m1+mrg->file[j]->s->base.fields;
    m2=mrg->file[j+1]->s->columndef;
    for ( ; m1 != end ; m1++,m2++)
    {
      if (m1->type != m2->type || m1->length != m2->length)
        goto diff_file;
    }
  }
  mrg->count=count;
  return 0;

 diff_file:
  fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
          my_progname, names[j], names[j+1]);
 error:
  /* Close every file opened so far (i == count if all opens succeeded) */
  while (i--)
    maria_close(mrg->file[i]);
  my_free(mrg->file);
  return 1;
}
484
485
/*
  Compress the table(s) in 'mrg' into one packed data file.

  result_table != 0 means a --join operation: a new table with that
  name is created from all source files. Otherwise the single source
  table is compressed via a temporary file which then replaces (or,
  with --backup, is swapped with) the original data file.

  Returns 0 on success, -1 on failure (message printed to stderr).
*/
static int compress(PACK_MRG_INFO *mrg,char *result_table)
{
  int error;
  File new_file,join_maria_file;
  MARIA_HA *isam_file;
  MARIA_SHARE *share;
  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
  uint i,header_length,fields,trees,used_trees;
  my_off_t old_length,new_length,tot_elements;
  HUFF_COUNTS *huff_counts;
  HUFF_TREE *huff_trees;
  DBUG_ENTER("compress");

  isam_file=mrg->file[0];			/* Take this as an example */
  share=isam_file->s;
  new_file=join_maria_file= -1;
  trees=fields=0;
  huff_trees=0;
  huff_counts=0;
  maria_block_size= isam_file->s->block_size;

  /* Create temporary or join file */
  if (backup)
    fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
  else
    fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);

  if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
                     maria_block_size, 0, MY_WME) == 0)
  {
    fprintf(stderr, "Can't initialize page cache\n");
    goto err;
  }

  if (!test_only && result_table)
  {
    /* Make a new indexfile based on first file in list */
    uint length;
    uchar *buff;
    strmov(org_name,result_table);	/* Fix error messages */
    fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
    if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
	< 0)
      goto err;
    length=(uint) share->base.keystart;
    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
      goto err;
    /* Copy the index header of the first source file to the new index file */
    if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
	my_write(join_maria_file,buff,length,
		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
    {
      my_free(buff);
      goto err;
    }
    my_free(buff);
    fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
  }
  else if (!tmp_dir[0])
    make_new_name(new_name,org_name);
  else
    fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
  if (!test_only &&
      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
    goto err;

  /* Start calculating statistics */

  mrg->records=0;
  for (i=0 ; i < mrg->count ; i++)
    mrg->records+=mrg->file[i]->s->state.state.records;

  DBUG_PRINT("info", ("Compressing %s: (%lu records)",
                      result_table ? new_name : org_name,
                      (ulong) mrg->records));
  if (write_loop || verbose)
  {
    printf("Compressing %s: (%lu records)\n",
           result_table ? new_name : org_name, (ulong) mrg->records);
  }
  trees=fields=share->base.fields;
  /* NOTE(review): return value not checked; a failed allocation here
     would presumably crash in get_statistic() - TODO confirm */
  huff_counts=init_huff_count(isam_file,mrg->records);

  /*
    Read the whole data file(s) for statistics.
  */
  DBUG_PRINT("info", ("- Calculating statistics"));
  if (write_loop || verbose)
    printf("- Calculating statistics\n");
  if (get_statistic(mrg,huff_counts))
    goto err;

  old_length=0;
  for (i=0; i < mrg->count ; i++)
    old_length+= (mrg->file[i]->s->state.state.data_file_length -
		  mrg->file[i]->s->state.state.empty);

  /*
    Create a global priority queue in preparation for making
    temporary Huffman trees.
  */
  if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
    goto err;

  /*
    Check each column if we should use pre-space-compress, end-space-
    compress, empty-field-compress or zero-field-compress.
  */
  check_counts(huff_counts,fields,mrg->records);

  /*
    Build a Huffman tree for each column.
  */
  huff_trees=make_huff_trees(huff_counts,trees);

  /*
    If the packed lengths of combined columns is less then the sum of
    the non-combined columns, then create common Huffman trees for them.
    We do this only for uchar compressed columns, not for distinct values
    compressed columns.
  */
  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
    goto err;

  /*
    Assign codes to all uchar or column values.
  */
  if (make_huff_decode_table(huff_trees,fields))
    goto err;

  /* Prepare a file buffer. */
  init_file_buffer(new_file,0);

  /*
    Reserve space in the target file for the fixed compressed file header.
  */
  file_buffer.pos_in_file=HEAD_LENGTH;
  if (! test_only)
    my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));

  /*
    Write field infos: field type, pack type, length bits, tree number.
  */
  write_field_info(huff_counts,fields,used_trees);

  /*
    Write decode trees.
  */
  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
    goto err;

  /*
    Calculate the total length of the compression info header.
    This includes the fixed compressed file header, the column compression
    type descriptions, and the decode trees.
  */
  header_length=(uint) file_buffer.pos_in_file+
    (uint) (file_buffer.pos-file_buffer.buffer);

  /*
    Compress the source file into the target file.
  */
  DBUG_PRINT("info", ("- Compressing file"));
  if (write_loop || verbose)
    printf("- Compressing file\n");
  error=compress_maria_file(mrg,huff_counts);
  new_length=file_buffer.pos_in_file;
  if (!error && !test_only)
  {
    uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
    bzero(buff,sizeof(buff));
    error=my_write(file_buffer.file,buff,sizeof(buff),
		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
  }

  /*
    Write the fixed compressed file header.
  */
  if (!error)
    error=write_header(mrg,header_length,used_trees,tot_elements,
		       new_length);

  /* Flush the file buffer. */
  end_file_buffer();

  /* Display statistics. */
  DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
                      "Mean total length: %6ld",
                      mrg->min_pack_length, mrg->max_pack_length,
                      (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
  if (verbose && mrg->records)
    printf("Min record length: %6d  Max length: %6d  "
           "Mean total length: %6ld\n", mrg->min_pack_length,
           mrg->max_pack_length, (ulong) (new_length/mrg->records));

  /* Close source and target file. */
  if (!test_only)
  {
    error|=my_close(new_file,MYF(MY_WME));
    if (!result_table)
    {
      (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
                                    FLUSH_RELEASE);
      error|=my_close(isam_file->dfile.file, MYF(MY_WME));
      isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
      isam_file->s->bitmap.file.file= -1;
    }
  }

  /* Cleanup. */
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
  if (! test_only && ! error)
  {
    if (result_table)
    {
      error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
    }
    else
    {
      if (backup)
      {
	/* Rename original to .OLD, then move the compressed file in place */
	if (my_rename(org_name,make_old_name(temp_name,
                                             isam_file->s->open_file_name.str),
		      MYF(MY_WME)))
	  error=1;
	else
	{
	  if (tmp_dir[0])
	    error=my_copy(new_name,org_name,MYF(MY_WME));
	  else
	    error=my_rename(new_name,org_name,MYF(MY_WME));
	  if (!error)
	  {
	    my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
	    if (tmp_dir[0])
	      my_delete(new_name,MYF(MY_WME));
	  }
	}
      }
      else
      {
	/* No backup: replace the original data file directly */
	if (tmp_dir[0])
	{
	  error=my_copy(new_name,org_name,
			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
	  if (!error)
	    my_delete(new_name,MYF(MY_WME));
	}
	else
	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
      }
      if (! error)
	error=save_state(isam_file,mrg,new_length,glob_crc);
    }
  }
  error|=mrg_close(mrg);
  if (join_maria_file >= 0)
    error|=my_close(join_maria_file,MYF(MY_WME));
  if (error)
  {
    fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
    my_delete(new_name,MYF(MY_WME));
    DBUG_RETURN(-1);
  }
  if (write_loop || verbose)
  {
    if (old_length)
      printf("%.4g%%     \n",
             (((longlong) (old_length - new_length)) * 100.0 /
              (longlong) old_length));
    else
      puts("Empty file saved in compressed format");
  }
  DBUG_RETURN(0);

 err:
  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
  if (new_file >= 0)
    my_close(new_file,MYF(0));
  if (join_maria_file >= 0)
    my_close(join_maria_file,MYF(0));
  mrg_close(mrg);
  end_pagecache(maria_pagecache, 1);
  fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
  DBUG_RETURN(-1);
}
771
/* Init a huff_count-struct for each field and init it */

/*
  Allocate one zero-filled HUFF_COUNTS per column of 'info' and
  initialize field length/type, the distinct-value tree and (for
  fixed-length non-blob columns) the distinct-value buffer.

  Returns the array, or 0 if the allocation failed.
*/
static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
{
  reg2 uint i;
  reg1 HUFF_COUNTS *count;
  if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
					sizeof(HUFF_COUNTS),
					MYF(MY_ZEROFILL | MY_WME))))
  {
    for (i=0 ; i < info->s->base.fields ; i++)
    {
      enum en_fieldtype type;
      /* Entries are indexed by column number, not definition order */
      uint col_nr = info->s->columndef[i].column_nr;
      count[col_nr].field_length=info->s->columndef[i].length;
      type= count[col_nr].field_type=
	(enum en_fieldtype) info->s->columndef[i].type;
      /* Interval/constant/zero columns are handled like normal ones here */
      if (type == FIELD_INTERVALL ||
	  type == FIELD_CONSTANT ||
	  type == FIELD_ZERO)
	type = FIELD_NORMAL;
      if (count[col_nr].field_length <= 8 &&
	  (type == FIELD_NORMAL ||
	   type == FIELD_SKIP_ZERO))
	count[col_nr].max_zero_fill= count[col_nr].field_length;
      /*
	For every column initialize a tree, which is used to detect distinct
	column values. 'int_tree' works together with 'tree_buff' and
	'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
	This is accomplished by '-1' as the element size.
      */
      init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
		NULL, MYF(0));
      if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
	count[col_nr].tree_pos=count[col_nr].tree_buff =
	  my_malloc(count[col_nr].field_length > 1 ? tree_buff_length : 2,
		    MYF(MY_WME));
    }
  }
  return count;
}
813
814
815 /* Free memory used by counts and trees */
816
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)817 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
818 HUFF_COUNTS *huff_counts,
819 uint fields)
820 {
821 register uint i;
822
823 if (huff_trees)
824 {
825 for (i=0 ; i < trees ; i++)
826 {
827 if (huff_trees[i].element_buffer)
828 my_free(huff_trees[i].element_buffer);
829 if (huff_trees[i].code)
830 my_free(huff_trees[i].code);
831 }
832 my_free(huff_trees);
833 }
834 if (huff_counts)
835 {
836 for (i=0 ; i < fields ; i++)
837 {
838 if (huff_counts[i].tree_buff)
839 {
840 my_free(huff_counts[i].tree_buff);
841 delete_tree(&huff_counts[i].int_tree, 0);
842 }
843 }
844 my_free(huff_counts);
845 }
846 delete_queue(&queue); /* This is safe to free */
847 return;
848 }
849
850 /* Read through old file and gather some statistics */
851
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)852 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
853 {
854 int error;
855 uint length, null_bytes;
856 ulong reclength,max_blob_length;
857 uchar *record,*pos,*next_pos,*end_pos,*start_pos;
858 ha_rows record_count;
859 HUFF_COUNTS *count,*end_count;
860 TREE_ELEMENT *element;
861 ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
862 DBUG_ENTER("get_statistic");
863
864 reclength= mrg->file[0]->s->base.reclength;
865 null_bytes= mrg->file[0]->s->base.null_bytes;
866 record=(uchar*) my_safe_alloca(reclength);
867 end_count=huff_counts+mrg->file[0]->s->base.fields;
868 record_count=0; glob_crc=0;
869 max_blob_length=0;
870
871 /* Check how to calculate checksum */
872 if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
873 calc_checksum= _ma_static_checksum;
874 else
875 calc_checksum= _ma_checksum;
876
877 mrg_reset(mrg);
878 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
879 {
880 ulong tot_blob_length=0;
881 if (! error)
882 {
883 /* glob_crc is a checksum over all bytes of all records. */
884 glob_crc+= (*calc_checksum)(mrg->file[0],record);
885
886 /* Count the incidence of values separately for every column. */
887 for (pos=record + null_bytes, count=huff_counts ;
888 count < end_count ;
889 count++,
890 pos=next_pos)
891 {
892 next_pos=end_pos=(start_pos=pos)+count->field_length;
893
894 /*
895 Put the whole column value in a tree if there is room for it.
896 'int_tree' is used to quickly check for duplicate values.
897 'tree_buff' collects as many distinct column values as
898 possible. If the field length is > 1, it is tree_buff_length,
899 else 2 bytes. Each value is 'field_length' bytes big. If there
900 are more distinct column values than fit into the buffer, we
901 give up with this tree. BLOBs and VARCHARs do not have a
902 tree_buff as it can only be used with fixed length columns.
903 For the special case of field length == 1, we handle only the
904 case that there is only one distinct value in the table(s).
905 Otherwise, we can have a maximum of 256 distinct values. This
906 is then handled by the normal Huffman tree build.
907
908 Another limit for collecting distinct column values is the
909 number of values itself. Since we would need to build a
910 Huffman tree for the values, we are limited by the 'IS_OFFSET'
911 constant. This constant expresses a bit which is used to
912 determine if a tree element holds a final value or an offset
913 to a child element. Hence, all values and offsets need to be
914 smaller than 'IS_OFFSET'. A tree element is implemented with
915 two integer values, one for the left branch and one for the
916 right branch. For the extreme case that the first element
917 points to the last element, the number of integers in the tree
918 must be less or equal to IS_OFFSET. So the number of elements
919 must be less or equal to IS_OFFSET / 2.
920
921 WARNING: At first, we insert a pointer into the record buffer
922 as the key for the tree. If we got a new distinct value, which
923 is really inserted into the tree, instead of being counted
924 only, we will copy the column value from the record buffer to
925 'tree_buff' and adjust the key pointer of the tree accordingly.
926 */
927 if (count->tree_buff)
928 {
929 global_count=count;
930 if (!(element=tree_insert(&count->int_tree,pos, 0,
931 count->int_tree.custom_arg)) ||
932 (element->count == 1 &&
933 (count->tree_buff + tree_buff_length <
934 count->tree_pos + count->field_length)) ||
935 (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
936 (count->field_length == 1 &&
937 count->int_tree.elements_in_tree > 1))
938 {
939 delete_tree(&count->int_tree, 0);
940 my_free(count->tree_buff);
941 count->tree_buff=0;
942 }
943 else
944 {
945 /*
946 If tree_insert() succeeds, it either creates a new element
947 or increments the counter of an existing element.
948 */
949 if (element->count == 1)
950 {
951 /* Copy the new column value into 'tree_buff'. */
952 memcpy(count->tree_pos,pos,(size_t) count->field_length);
953 /* Adjust the key pointer in the tree. */
954 tree_set_pointer(element,count->tree_pos);
955 /* Point behind the last column value so far. */
956 count->tree_pos+=count->field_length;
957 }
958 }
959 }
960
961 /* Save character counters and space-counts and zero-field-counts */
962 if (count->field_type == FIELD_NORMAL ||
963 count->field_type == FIELD_SKIP_ENDSPACE)
964 {
965 /* Ignore trailing space. */
966 for ( ; end_pos > pos ; end_pos--)
967 if (end_pos[-1] != ' ')
968 break;
969 /* Empty fields are just counted. Go to the next record. */
970 if (end_pos == pos)
971 {
972 count->empty_fields++;
973 count->max_zero_fill=0;
974 continue;
975 }
976 /*
977 Count the total of all trailing spaces and the number of
978 short trailing spaces. Remember the longest trailing space.
979 */
980 length= (uint) (next_pos-end_pos);
981 count->tot_end_space+=length;
982 if (length < 8)
983 count->end_space[length]++;
984 if (count->max_end_space < length)
985 count->max_end_space = length;
986 }
987
988 if (count->field_type == FIELD_NORMAL ||
989 count->field_type == FIELD_SKIP_PRESPACE)
990 {
991 /* Ignore leading space. */
992 for (pos=start_pos; pos < end_pos ; pos++)
993 if (pos[0] != ' ')
994 break;
995 /* Empty fields are just counted. Go to the next record. */
996 if (end_pos == pos)
997 {
998 count->empty_fields++;
999 count->max_zero_fill=0;
1000 continue;
1001 }
1002 /*
1003 Count the total of all leading spaces and the number of
1004 short leading spaces. Remember the longest leading space.
1005 */
1006 length= (uint) (pos-start_pos);
1007 count->tot_pre_space+=length;
1008 if (length < 8)
1009 count->pre_space[length]++;
1010 if (count->max_pre_space < length)
1011 count->max_pre_space = length;
1012 }
1013
1014 /* Calculate pos, end_pos, and max_length for variable length fields. */
1015 if (count->field_type == FIELD_BLOB)
1016 {
1017 uint field_length=count->field_length -portable_sizeof_char_ptr;
1018 ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1019 memcpy(&pos, start_pos+field_length,sizeof(char*));
1020 end_pos=pos+blob_length;
1021 tot_blob_length+=blob_length;
1022 set_if_bigger(count->max_length,blob_length);
1023 }
1024 else if (count->field_type == FIELD_VARCHAR)
1025 {
1026 uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1027 length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1028 uint2korr(start_pos));
1029 pos= start_pos+pack_length;
1030 end_pos= pos+length;
1031 set_if_bigger(count->max_length,length);
1032 }
1033
1034 /* Evaluate 'max_zero_fill' for short fields. */
1035 if (count->field_length <= 8 &&
1036 (count->field_type == FIELD_NORMAL ||
1037 count->field_type == FIELD_SKIP_ZERO))
1038 {
1039 uint i;
1040 /* Zero fields are just counted. Go to the next record. */
1041 if (!memcmp(start_pos, zero_string, count->field_length))
1042 {
1043 count->zero_fields++;
1044 continue;
1045 }
1046 /*
1047 max_zero_fill starts with field_length. It is decreased every
1048 time a shorter "zero trailer" is found. It is set to zero when
1049 an empty field is found (see above). This suggests that the
1050 variable should be called 'min_zero_fill'.
1051 */
1052 for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1053 i++) ;
1054 if (i < count->max_zero_fill)
1055 count->max_zero_fill=i;
1056 }
1057
1058 /* Ignore zero fields and check fields. */
1059 if (count->field_type == FIELD_ZERO ||
1060 count->field_type == FIELD_CHECK)
1061 continue;
1062
1063 /*
1064 Count the incidence of every uchar value in the
1065 significant field value.
1066 */
1067 for ( ; pos < end_pos ; pos++)
1068 count->counts[(uchar) *pos]++;
1069
1070 /* Step to next field. */
1071 }
1072
1073 if (tot_blob_length > max_blob_length)
1074 max_blob_length=tot_blob_length;
1075 record_count++;
1076 if (write_loop && record_count % WRITE_COUNT == 0)
1077 {
1078 printf("%lu\r", (ulong) record_count);
1079 fflush(stdout);
1080 }
1081 }
1082 else if (error != HA_ERR_RECORD_DELETED)
1083 {
1084 fprintf(stderr, "Got error %d while reading rows\n", error);
1085 break;
1086 }
1087
1088 /* Step to next record. */
1089 }
1090 if (write_loop)
1091 {
1092 printf(" \r");
1093 fflush(stdout);
1094 }
1095
1096 /*
1097 If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1098 codes.
1099 */
1100 DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1101
1102 DBUG_PRINT("info", ("Found the following number of incidents "
1103 "of the uchar codes:"));
1104 if (verbose >= 2)
1105 printf("Found the following number of incidents "
1106 "of the uchar codes:\n");
1107 for (count= huff_counts ; count < end_count; count++)
1108 {
1109 uint idx;
1110 my_off_t total_count;
1111 char llbuf[32];
1112
1113 DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1114 if (verbose >= 2)
1115 printf("column: %3u\n", (uint) (count - huff_counts + 1));
1116 if (count->tree_buff)
1117 {
1118 DBUG_PRINT("info", ("number of distinct values: %u",
1119 (uint) ((count->tree_pos - count->tree_buff) /
1120 count->field_length)));
1121 if (verbose >= 2)
1122 printf("number of distinct values: %u\n",
1123 (uint) ((count->tree_pos - count->tree_buff) /
1124 count->field_length));
1125 }
1126 total_count= 0;
1127 for (idx= 0; idx < 256; idx++)
1128 {
1129 if (count->counts[idx])
1130 {
1131 total_count+= count->counts[idx];
1132 DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1133 llstr((longlong) count->counts[idx], llbuf)));
1134 if (verbose >= 2)
1135 printf("counts[0x%02x]: %12s\n", idx,
1136 llstr((longlong) count->counts[idx], llbuf));
1137 }
1138 }
1139 DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1140 llbuf)));
1141 if ((verbose >= 2) && total_count)
1142 {
1143 printf("total: %12s\n",
1144 llstr((longlong) total_count, llbuf));
1145 }
1146 }
1147
1148 mrg->records=record_count;
1149 mrg->max_blob_length=max_blob_length;
1150 my_safe_afree(record, reclength);
1151 DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1152 }
1153
/*
  qsort-style comparison of two Huffman element counts.

  'a' and 'b' point at my_off_t incidence counters (or at HUFF_ELEMENTs,
  which start with their count).

  RETURN  -1 / 0 / 1  when *a is less than / equal to / greater than *b.
*/
static int compare_huff_elements(void *not_used __attribute__((unused)),
                                 uchar *a, uchar *b)
{
  my_off_t lhs= *((my_off_t*) a);
  my_off_t rhs= *((my_off_t*) b);

  if (lhs < rhs)
    return -1;
  return (lhs == rhs) ? 0 : 1;
}
1160
1161 /* Check each tree if we should use pre-space-compress, end-space-
1162 compress, empty-field-compress or zero-field-compress */
1163
/*
  Decide the packing method for every column.

  huff_counts   Array with one statistics element per column (tree).
  trees         Number of elements in huff_counts.
  records       Number of records that were scanned for the statistics.

  For each column this selects field_type and pack_type and computes
  bytes_packed, the estimated compressed size of the column, by trying
  the candidate encodings and keeping the cheapest.
*/
static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
                         my_off_t records)
{
  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
  my_off_t old_length,new_length,length;
  DBUG_ENTER("check_counts");

  bzero((uchar*) field_count,sizeof(field_count));
  space_fields=fill_zero_fields=0;

  for (; trees-- ; huff_counts++)
  {
    /* BLOB and VARCHAR columns always carry an explicit length. */
    if (huff_counts->field_type == FIELD_BLOB)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_VARCHAR)
    {
      huff_counts->length_bits=max_bit(huff_counts->max_length);
      goto found_pack;
    }
    else if (huff_counts->field_type == FIELD_CHECK)
    {
      huff_counts->bytes_packed=0;
      huff_counts->counts[0]=0;
      goto found_pack;
    }

    /* Start from the plain encoding; the tests below may upgrade it. */
    huff_counts->field_type=FIELD_NORMAL;
    huff_counts->pack_type=0;

    /* Check for zero-filled records (in this column), or zero records. */
    if (huff_counts->zero_fields || ! records)
    {
      my_off_t old_space_count;
      /*
        If there are only zero filled records (in this column),
        or no records at all, we are done.
      */
      if (huff_counts->zero_fields == records)
      {
        huff_counts->field_type= FIELD_ZERO;
        huff_counts->bytes_packed=0;
        huff_counts->counts[0]=0;
        goto found_pack;
      }
      /* Remember the number of significant spaces. */
      old_space_count=huff_counts->counts[' '];
      /* Add all leading and trailing spaces. */
      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
                                  huff_counts->tot_pre_space +
                                  huff_counts->empty_fields *
                                  huff_counts->field_length);
      /*
        Check, what the compressed length of this would be.
        records/8 accounts for the one flag bit per record that
        FIELD_SKIP_ZERO needs.
      */
      old_length=calc_packed_length(huff_counts,0)+records/8;
      /* Get the number of zero bytes. */
      length=huff_counts->zero_fields*huff_counts->field_length;
      /* Add it to the counts. */
      huff_counts->counts[0]+=length;
      /* Check, what the compressed length of this would be. */
      new_length=calc_packed_length(huff_counts,0);
      /* If the compression without the zeroes would be shorter, we are done. */
      if (old_length < new_length && huff_counts->field_length > 1)
      {
        huff_counts->field_type=FIELD_SKIP_ZERO;
        huff_counts->counts[0]-=length;
        huff_counts->bytes_packed=old_length- records/8;
        goto found_pack;
      }
      /* Remove the insignificant spaces, but keep the zeroes. */
      huff_counts->counts[' ']=old_space_count;
    }
    /* Check, what the compressed length of this column would be. */
    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);

    /*
      If there are enough empty records (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->empty_fields)
    {
      if (huff_counts->field_length > 2 &&
          huff_counts->empty_fields + (records - huff_counts->empty_fields)*
          (1+max_bit(MY_MAX(huff_counts->max_pre_space,
                            huff_counts->max_end_space))) <
          records * max_bit(huff_counts->field_length))
      {
        huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
      }
      else
      {
        /*
          Fold the empty fields into the space statistics instead:
          an empty field is equivalent to field_length spaces.
        */
        length=huff_counts->empty_fields*huff_counts->field_length;
        if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
        {
          huff_counts->tot_end_space+=length;
          huff_counts->max_end_space=huff_counts->field_length;
          if (huff_counts->field_length < 8)
            huff_counts->end_space[huff_counts->field_length]+=
              huff_counts->empty_fields;
        }
        if (huff_counts->tot_pre_space)
        {
          huff_counts->tot_pre_space+=length;
          huff_counts->max_pre_space=huff_counts->field_length;
          if (huff_counts->field_length < 8)
            huff_counts->pre_space[huff_counts->field_length]+=
              huff_counts->empty_fields;
        }
      }
    }

    /*
      If there are enough trailing spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_end_space)
    {
      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
                              huff_counts->end_space,
                              huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
        goto found_pack;
      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
    }

    /*
      If there are enough leading spaces (in this column),
      treating them specially may pay off.
    */
    if (huff_counts->tot_pre_space)
    {
      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
                              huff_counts->pre_space,
                              huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
        goto found_pack;
    }

  found_pack:			/* Found field-packing */

    /* Test if we can use zero-fill */

    if (huff_counts->max_zero_fill &&
        (huff_counts->field_type == FIELD_NORMAL ||
         huff_counts->field_type == FIELD_SKIP_ZERO))
    {
      /*
        max_zero_fill trailing zero bytes can be stripped from every
        stored value; remove their contribution from counts[0].
      */
      huff_counts->counts[0]-=huff_counts->max_zero_fill*
        (huff_counts->field_type == FIELD_SKIP_ZERO ?
         records - huff_counts->zero_fields : records);
      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
    }

    /* Test if intervall-field is better */

    if (huff_counts->tree_buff)
    {
      HUFF_TREE tree;

      DBUG_EXECUTE_IF("forceintervall",
                      huff_counts->bytes_packed= ~ (my_off_t) 0;);
      tree.element_buffer=0;
      if (!make_huff_tree(&tree,huff_counts) &&
          tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
      {
        /* A single distinct value needs no code bits at all. */
        if (tree.elements == 1)
          huff_counts->field_type=FIELD_CONSTANT;
        else
          huff_counts->field_type=FIELD_INTERVALL;
        huff_counts->pack_type=0;
      }
      else
      {
        /* Intervall coding does not pay off; drop the value dictionary. */
        my_free(huff_counts->tree_buff);
        delete_tree(&huff_counts->int_tree, 0);
        huff_counts->tree_buff=0;
      }
      if (tree.element_buffer)
        my_free(tree.element_buffer);
    }
    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
      space_fields++;
    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
      fill_zero_fields++;
    field_count[huff_counts->field_type]++;
  }
  DBUG_PRINT("info", ("normal: %3d  empty-space: %3d  "
                      "empty-zero: %3d  empty-fill: %3d",
                      field_count[FIELD_NORMAL],space_fields,
                      field_count[FIELD_SKIP_ZERO],fill_zero_fields));
  DBUG_PRINT("info", ("pre-space: %3d  end-space: %3d  "
                      "intervall-fields: %3d  zero: %3d",
                      field_count[FIELD_SKIP_PRESPACE],
                      field_count[FIELD_SKIP_ENDSPACE],
                      field_count[FIELD_INTERVALL],
                      field_count[FIELD_ZERO]));
  if (verbose)
    printf("\nnormal:    %3d  empty-space:     %3d  "
           "empty-zero:       %3d  empty-fill: %3d\n"
           "pre-space: %3d  end-space:       %3d  "
           "intervall-fields: %3d  zero:       %3d\n",
           field_count[FIELD_NORMAL],space_fields,
           field_count[FIELD_SKIP_ZERO],fill_zero_fields,
           field_count[FIELD_SKIP_PRESPACE],
           field_count[FIELD_SKIP_ENDSPACE],
           field_count[FIELD_INTERVALL],
           field_count[FIELD_ZERO]);
  DBUG_VOID_RETURN;
}
1373
1374
1375 /* Test if we can use space-compression and empty-field-compression */
1376
1377 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1378 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1379 uint max_space_length, my_off_t *space_counts,
1380 my_off_t tot_space_count, enum en_fieldtype field_type)
1381 {
1382 int min_pos;
1383 uint length_bits,i;
1384 my_off_t space_count,min_space_count,min_pack,new_length,skip;
1385
1386 length_bits=max_bit(max_space_length);
1387
1388 /* Default no end_space-packing */
1389 space_count=huff_counts->counts[(uint) ' '];
1390 min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1391 min_pack=calc_packed_length(huff_counts,0);
1392 min_pos= -2;
1393 huff_counts->counts[(uint) ' ']=space_count;
1394
1395 /* Test with allways space-count */
1396 new_length=huff_counts->bytes_packed+length_bits*records/8;
1397 if (new_length+1 < min_pack)
1398 {
1399 min_pos= -1;
1400 min_pack=new_length;
1401 min_space_count=space_count;
1402 }
1403 /* Test with length-flag */
1404 for (skip=0L, i=0 ; i < 8 ; i++)
1405 {
1406 if (space_counts[i])
1407 {
1408 if (i)
1409 huff_counts->counts[(uint) ' ']+=space_counts[i];
1410 skip+=huff_counts->pre_space[i];
1411 new_length=calc_packed_length(huff_counts,0)+
1412 (records+(records-skip)*(1+length_bits))/8;
1413 if (new_length < min_pack)
1414 {
1415 min_pos=(int) i;
1416 min_pack=new_length;
1417 min_space_count=huff_counts->counts[(uint) ' '];
1418 }
1419 }
1420 }
1421
1422 huff_counts->counts[(uint) ' ']=min_space_count;
1423 huff_counts->bytes_packed=min_pack;
1424 switch (min_pos) {
1425 case -2:
1426 return(0); /* No space-compress */
1427 case -1: /* Always space-count */
1428 huff_counts->field_type=field_type;
1429 huff_counts->min_space=0;
1430 huff_counts->length_bits=max_bit(max_space_length);
1431 break;
1432 default:
1433 huff_counts->field_type=field_type;
1434 huff_counts->min_space=(uint) min_pos;
1435 huff_counts->pack_type|=PACK_TYPE_SELECTED;
1436 huff_counts->length_bits=max_bit(max_space_length);
1437 break;
1438 }
1439 return(1); /* Using space-compress */
1440 }
1441
1442
1443 /* Make a huff_tree of each huff_count */
1444
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1445 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1446 {
1447 uint tree;
1448 HUFF_TREE *huff_tree;
1449 DBUG_ENTER("make_huff_trees");
1450
1451 if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1452 MYF(MY_WME | MY_ZEROFILL))))
1453 DBUG_RETURN(0);
1454
1455 for (tree=0 ; tree < trees ; tree++)
1456 {
1457 if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1458 {
1459 while (tree--)
1460 my_free(huff_tree[tree].element_buffer);
1461 my_free(huff_tree);
1462 DBUG_RETURN(0);
1463 }
1464 }
1465 DBUG_RETURN(huff_tree);
1466 }
1467
1468 /*
1469 Build a Huffman tree.
1470
1471 SYNOPSIS
1472 make_huff_tree()
1473 huff_tree The Huffman tree.
1474 huff_counts The counts.
1475
1476 DESCRIPTION
1477 Build a Huffman tree according to huff_counts->counts or
1478 huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1479 tree_buff_length of distinct column values. In that case, whole
1480 values can be Huffman encoded instead of single bytes.
1481
1482 RETURN
1483 0 OK
1484 != 0 Error
1485 */
1486
static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
{
  uint i,found,bits_packed,first,last;
  my_off_t bytes_packed;
  HUFF_ELEMENT *a,*b,*new_huff_el;

  first=last=0;
  if (huff_counts->tree_buff)
  {
    /* Calculate the number of distinct values in tree_buff. */
    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
      huff_counts->field_length;
    first=0; last=found-1;
  }
  else
  {
    /* Count the number of uchar codes found in the column. */
    for (i=found=0 ; i < 256 ; i++)
    {
      if (huff_counts->counts[i])
      {
        if (! found++)
          first=i;
        last=i;
      }
    }
    /*
      The Huffman combine loop below needs at least two elements;
      a dummy zero-count element is added later if necessary.
    */
    if (found < 2)
      found=2;
  }

  /* When using 'tree_buff' we can have more than 256 values. */
  if (queue.max_elements < found)
  {
    delete_queue(&queue);
    if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
      return -1;
  }

  /*
    Allocate or reallocate an element buffer for the Huffman tree.
    2*found elements: 'found' leaves plus up to found-1 internal nodes.
  */
  if (!huff_tree->element_buffer)
  {
    if (!(huff_tree->element_buffer=
          (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
      return 1;
  }
  else
  {
    HUFF_ELEMENT *temp;
    if (!(temp=
          (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
                                     found*2*sizeof(HUFF_ELEMENT),
                                     MYF(MY_WME))))
      return 1;
    huff_tree->element_buffer=temp;
  }

  huff_counts->tree=huff_tree;
  huff_tree->counts=huff_counts;
  huff_tree->min_chr=first;
  huff_tree->max_chr=last;
  /* Bits needed to store a value relative to min_chr. */
  huff_tree->char_bits=max_bit(last-first);
  huff_tree->offset_bits=max_bit(found-1)+1;

  if (huff_counts->tree_buff)
  {
    huff_tree->elements=0;
    /*
      Size of the coding table in the file: header bits, per-value
      code description, plus the distinct values themselves.
    */
    huff_tree->tree_pack_length=(1+15+16+5+5+
                                 (huff_tree->char_bits+1)*found+
                                 (huff_tree->offset_bits+1)*
                                 (found-2)+7)/8 +
      (uint) (huff_tree->counts->tree_pos-
              huff_tree->counts->tree_buff);
    /*
      Put a HUFF_ELEMENT into the queue for every distinct column value.

      tree_walk() calls save_counts_in_queue() for every element in
      'int_tree'. This takes elements from the target trees element
      buffer and places references to them into the buffer of the
      priority queue. We insert in column value order, but the order is
      in fact irrelevant here. We will establish the correct order
      later.
    */
    tree_walk(&huff_counts->int_tree,
              (int (*)(void*, element_count,void*)) save_counts_in_queue,
              (uchar*) huff_tree, left_root_right);
  }
  else
  {
    huff_tree->elements=found;
    huff_tree->tree_pack_length=(9+9+5+5+
                                 (huff_tree->char_bits+1)*found+
                                 (huff_tree->offset_bits+1)*
                                 (found-2)+7)/8;
    /*
      Put a HUFF_ELEMENT into the queue for every uchar code found in the column.

      The elements are taken from the target trees element buffer.
      Instead of using queue_insert(), we just place references to the
      elements into the buffer of the priority queue. We insert in byte
      value order, but the order is in fact irrelevant here. We will
      establish the correct order later.

      Note that queue.root[] is 1-based: root[1] is the top element.
    */
    for (i=first, found=0 ; i <= last ; i++)
    {
      if (huff_counts->counts[i])
      {
        new_huff_el=huff_tree->element_buffer+(found++);
        new_huff_el->count=huff_counts->counts[i];
        new_huff_el->a.leaf.null=0;
        new_huff_el->a.leaf.element_nr=i;
        queue.root[found]=(uchar*) new_huff_el;
      }
    }
    /*
      If there is only a single uchar value in this field in all records,
      add a second element with zero incidence. This is required to enter
      the loop, which builds the Huffman tree.
    */
    while (found < 2)
    {
      new_huff_el=huff_tree->element_buffer+(found++);
      new_huff_el->count=0;
      new_huff_el->a.leaf.null=0;
      /* Pick a neighbouring byte value and widen min_chr/max_chr. */
      if (last)
        new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
      else
        new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
      queue.root[found]=(uchar*) new_huff_el;
    }
  }

  /* Make a queue from the queue buffer. */
  queue.elements=found;

  /*
    Make a priority queue from the queue. Construct its index so that we
    have a partially ordered tree.
  */
  queue_fix(&queue);

  /* The Huffman algorithm. */
  bytes_packed=0; bits_packed=0;
  for (i=1 ; i < found ; i++)
  {
    /*
      Pop the top element from the queue (the one with the least incidence).
      Popping from a priority queue includes a re-ordering of the queue,
      to get the next least incidence element to the top.
    */
    a=(HUFF_ELEMENT*) queue_remove_top(&queue);
    /* Copy the next least incidence element */
    b=(HUFF_ELEMENT*) queue_top(&queue);
    /* Get a new element from the element buffer (slots after the leaves). */
    new_huff_el=huff_tree->element_buffer+found+i;
    /* The new element gets the sum of the two least incidence elements. */
    new_huff_el->count=a->count+b->count;
    /*
      The Huffman algorithm assigns another bit to the code for a byte
      every time that bytes incidence is combined (directly or indirectly)
      to a new element as one of the two least incidence elements.
      This means that one more bit per incidence of that uchar is required
      in the resulting file. So we add the new combined incidence as the
      number of bits by which the result grows.
    */
    bits_packed+=(uint) (new_huff_el->count & 7);
    bytes_packed+=new_huff_el->count/8;
    /* The new element points to its children, lesser in left. */
    new_huff_el->a.nod.left=a;
    new_huff_el->a.nod.right=b;
    /*
      Replace the copied top element by the new element and re-order the
      queue.
    */
    queue_top(&queue)= (uchar*) new_huff_el;
    queue_replace_top(&queue);
  }
  /* The single remaining queue element is the root of the tree. */
  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
  return 0;
}
1667
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1668 static int compare_tree(void* cmp_arg __attribute__((unused)),
1669 register const uchar *s, register const uchar *t)
1670 {
1671 uint length;
1672 for (length=global_count->field_length; length-- ;)
1673 if (*s++ != *t++)
1674 return (int) s[-1] - (int) t[-1];
1675 return 0;
1676 }
1677
1678 /*
1679 Organize distinct column values and their incidences into a priority queue.
1680
1681 SYNOPSIS
1682 save_counts_in_queue()
1683 key The column value.
1684 count The incidence of this value.
1685 tree The Huffman tree to be built later.
1686
1687 DESCRIPTION
1688 We use the element buffer of the targeted tree. The distinct column
1689 values are organized in a priority queue first. The Huffman
1690 algorithm will later organize the elements into a Huffman tree. For
1691 the time being, we just place references to the elements into the
1692 queue buffer. The buffer will later be organized into a priority
1693 queue.
1694
1695 RETURN
1696 0
1697 */
1698
static int save_counts_in_queue(uchar *key, element_count count,
                                HUFF_TREE *tree)
{
  HUFF_ELEMENT *element;

  /* Take the next free slot from the tree's element buffer. */
  element= &tree->element_buffer[tree->elements++];
  element->count= count;
  element->a.leaf.null= 0;
  /* The element number is the index of the value within tree_buff. */
  element->a.leaf.element_nr=
    (uint) (key - tree->counts->tree_buff) / tree->counts->field_length;
  /* Place a reference into the (not yet ordered) 1-based queue buffer. */
  queue.root[tree->elements]= (uchar*) element;
  return 0;
}
1712
1713
1714 /*
1715 Calculate length of file if given counts should be used.
1716
1717 SYNOPSIS
1718 calc_packed_length()
1719 huff_counts The counts for a column of the table(s).
1720 add_tree_lenght If the decode tree length should be added.
1721
1722 DESCRIPTION
1723 We need to follow the Huffman algorithm until we know, how many bits
1724 are required for each uchar code. But we do not need the resulting
1725 Huffman tree. Hence, we can leave out some steps which are essential
1726 in make_huff_tree().
1727
1728 RETURN
1729 Number of bytes required to compress this table column.
1730 */
1731
static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
                                   uint add_tree_lenght)
{
  uint i,found,bits_packed,first,last;
  my_off_t bytes_packed;
  HUFF_ELEMENT element_buffer[256];
  DBUG_ENTER("calc_packed_length");

  /*
    WARNING: We use a small hack for efficiency: Instead of placing
    references to HUFF_ELEMENTs into the queue, we just insert
    references to the counts of the uchar codes which appeared in this
    table column. During the Huffman algorithm they are successively
    replaced by references to HUFF_ELEMENTs. This works, because
    HUFF_ELEMENTs have the incidence count at their beginning.
    Regardless of whether the queue array contains references to counts of
    type my_off_t or references to HUFF_ELEMENTs which have the count of
    type my_off_t at their beginning, it always points to a count of the
    same type.

    Instead of using queue_insert(), we just copy the references into
    the buffer of the priority queue. We insert in uchar value order, but
    the order is in fact irrelevant here. We will establish the correct
    order later.
  */
  first=last=0;
  for (i=found=0 ; i < 256 ; i++)
  {
    if (huff_counts->counts[i])
    {
      if (! found++)
        first=i;
      last=i;
      /* We start with root[1], which is the queues top element. */
      queue.root[found]=(uchar*) &huff_counts->counts[i];
    }
  }
  if (!found)
    DBUG_RETURN(0);			/* Empty tree */
  /*
    If there is only a single uchar value in this field in all records,
    add a second element with zero incidence. This is required to enter
    the loop, which follows the Huffman algorithm.
    (counts[last ? 0 : 1] is a counter known to be zero here.)
  */
  if (found < 2)
    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];

  /* Make a queue from the queue buffer. */
  queue.elements=found;

  bytes_packed=0; bits_packed=0;
  /* Add the length of the coding table, which would become part of the file. */
  if (add_tree_lenght)
    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
                  (max_bit(found-1)+1+1)*(found-2) +7)/8;

  /*
    Make a priority queue from the queue. Construct its index so that we
    have a partially ordered tree.
  */
  queue_fix(&queue);

  /* The Huffman algorithm. */
  for (i=0 ; i < found-1 ; i++)
  {
    my_off_t *a;
    my_off_t *b;
    HUFF_ELEMENT *new_huff_el;

    /*
      Pop the top element from the queue (the one with the least
      incidence). Popping from a priority queue includes a re-ordering
      of the queue, to get the next least incidence element to the top.
    */
    a= (my_off_t*) queue_remove_top(&queue);
    /* Copy the next least incidence element. */
    b= (my_off_t*) queue_top(&queue);
    /* Create a new element in a local (automatic) buffer. */
    new_huff_el= element_buffer + i;
    /* The new element gets the sum of the two least incidence elements. */
    new_huff_el->count= *a + *b;
    /*
      The Huffman algorithm assigns another bit to the code for a byte
      every time that bytes incidence is combined (directly or indirectly)
      to a new element as one of the two least incidence elements.
      This means that one more bit per incidence of that uchar is required
      in the resulting file. So we add the new combined incidence as the
      number of bits by which the result grows.
    */
    bits_packed+=(uint) (new_huff_el->count & 7);
    bytes_packed+=new_huff_el->count/8;
    /*
      Replace the copied top element by the new element and re-order the
      queue. This successively replaces the references to counts by
      references to HUFF_ELEMENTs.
    */
    queue_top(&queue)= (uchar*) new_huff_el;
    queue_replace_top(&queue);
  }
  DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
}
1833
1834
1835 /* Remove trees that don't give any compression */
1836
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1837 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1838 {
1839 uint k,tree_number;
1840 HUFF_COUNTS count,*i,*j,*last_count;
1841
1842 last_count=huff_counts+trees;
1843 for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1844 {
1845 if (!i->tree->tree_number)
1846 {
1847 i->tree->tree_number= ++tree_number;
1848 if (i->tree_buff)
1849 continue; /* Don't join intervall */
1850 for (j=i+1 ; j < last_count ; j++)
1851 {
1852 if (! j->tree->tree_number && ! j->tree_buff)
1853 {
1854 for (k=0 ; k < 256 ; k++)
1855 count.counts[k]=i->counts[k]+j->counts[k];
1856 if (calc_packed_length(&count,1) <=
1857 i->tree->bytes_packed + j->tree->bytes_packed+
1858 i->tree->tree_pack_length+j->tree->tree_pack_length+
1859 ALLOWED_JOIN_DIFF)
1860 {
1861 memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1862 my_free(j->tree->element_buffer);
1863 j->tree->element_buffer=0;
1864 j->tree=i->tree;
1865 bmove((uchar*) i->counts,(uchar*) count.counts,
1866 sizeof(count.counts[0])*256);
1867 if (make_huff_tree(i->tree,i))
1868 return (uint) -1;
1869 }
1870 }
1871 }
1872 }
1873 }
1874 DBUG_PRINT("info", ("Original trees: %d After join: %d",
1875 trees, tree_number));
1876 if (verbose)
1877 printf("Original trees: %d After join: %d\n", trees, tree_number);
1878 return tree_number; /* Return trees left */
1879 }
1880
1881
1882 /*
1883 Fill in huff_tree encode tables.
1884
1885 SYNOPSIS
1886 make_huff_decode_table()
1887 huff_tree An array of HUFF_TREE which are to be encoded.
1888 trees The number of HUFF_TREE in the array.
1889
1890 RETURN
1891 0 success
1892 != 0 error
1893 */
1894
static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
{
  uint nr_codes;

  for ( ; trees-- ; huff_tree++)
  {
    /* Only trees that survived join_same_trees() carry a tree_number. */
    if (huff_tree->tree_number > 0)
    {
      nr_codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
      /* One allocation holds both the code words and the code lengths. */
      huff_tree->code= (ulonglong*)
        my_malloc(nr_codes * (sizeof(ulonglong) + sizeof(uchar)),
                  MYF(MY_WME | MY_ZEROFILL));
      if (!huff_tree->code)
        return 1;
      huff_tree->code_len= (uchar*) (huff_tree->code + nr_codes);
      make_traverse_code_tree(huff_tree, huff_tree->root,
                              8 * sizeof(ulonglong), 0);
    }
  }
  return 0;
}
1915
1916
/*
  Walk the Huffman tree and record the code word and code length of
  every leaf. 'size' is the number of unused low-order bits left in
  'code'; it starts at 64 and shrinks by one per tree level.
*/
static void make_traverse_code_tree(HUFF_TREE *huff_tree,
                                    HUFF_ELEMENT *element,
                                    uint size, ulonglong code)
{
  if (element->a.leaf.null)
  {
    /* Inner node: descend; left appends a 0 bit, right a 1 bit. */
    size--;
    make_traverse_code_tree(huff_tree, element->a.nod.left, size, code);
    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
                            code + (((ulonglong) 1) << size));
  }
  else
  {
    uint chr= element->a.leaf.element_nr;
    uint len= (uint) (8 * sizeof(ulonglong) - size);

    huff_tree->code_len[chr]= (uchar) len;
    huff_tree->code[chr]= code >> size;
    /* Track the deepest leaf, i.e. the longest code in this tree. */
    if (huff_tree->height < len)
      huff_tree->height= len;
  }
}
1939
1940
1941 /*
1942 Convert a value into binary digits.
1943
1944 SYNOPSIS
1945 bindigits()
1946 value The value.
1947 length The number of low order bits to convert.
1948
1949 NOTE
1950 The result string is in static storage. It is reused on every call.
1951 So you cannot use it twice in one expression.
1952
1953 RETURN
1954 A pointer to a static NUL-terminated string.
1955 */
1956
static char *bindigits(ulonglong value, uint bits)
{
  static char digits[72];
  char *out= digits;
  uint bit= bits;

  DBUG_ASSERT(bit < sizeof(digits));
  /* Emit the 'bits' low-order bits, most significant first. */
  while (bit)
  {
    bit--;
    *out++= (char) ('0' + ((value >> bit) & 1));
  }
  *out= '\0';
  return digits;
}
1969
1970
1971 /*
1972 Convert a value into hexadecimal digits.
1973
1974 SYNOPSIS
1975 hexdigits()
1976 value The value.
1977
1978 NOTE
1979 The result string is in static storage. It is reused on every call.
1980 So you cannot use it twice in one expression.
1981
1982 RETURN
1983 A pointer to a static NUL-terminated string.
1984 */
1985
/*
  Convert a value into hexadecimal digits.

  SYNOPSIS
    hexdigits()
    value                The value.

  NOTE
    The result string is in static storage. It is reused on every call.
    So you cannot use it twice in one expression.
    The full width of the value is printed (zero padded, lower case).

  RETURN
    A pointer to a static NUL-terminated string.
*/

static char *hexdigits(ulonglong value)
{
  static char digits[20];
  char *out= digits + 2 * sizeof(value); /* Two hex digits per byte. */
  uint i;

  DBUG_ASSERT(2 * sizeof(value) < sizeof(digits));
  *out= '\0';
  /* Fill the buffer backwards, least significant nibble first. */
  for (i= 0; i < 2 * sizeof(value); i++)
  {
    uint nibble= (uint) (value & 0xf);
    *--out= (char) (nibble < 10 ? '0' + nibble : 'a' + (nibble - 10));
    value>>= 4;
  }
  return digits;
}
2001
2002
2003 /* Write header to new packed data file */
2004
/*
  Write the HEAD_LENGTH-byte header of the packed data file.

  SYNOPSIS
    write_header()
    mrg           Merge info; supplies min/max pack length and ref length.
    head_length   Total header length (stored at offset 4).
    trees         Number of Huffman trees (stored at offset 24).
    tot_elements  Total element count over all trees (stored at offset 16).
    filelength    Length of the data file; used only to derive the
                  record-pointer length stored in byte 27.

  RETURN
    0  ok (always 0 in test_only mode, nothing is written then)
    1  write error

  NOTE(review): tot_elements is my_off_t but stored with int4store --
  values above 32 bits would be truncated; presumably never reached
  in practice, confirm against the unpack side.
*/
static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
                        my_off_t tot_elements,my_off_t filelength)
{
  /* The header is built in the (currently empty) file buffer. */
  uchar *buff= (uchar*) file_buffer.pos;

  bzero(buff,HEAD_LENGTH);
  memcpy(buff,maria_pack_file_magic,4);
  int4store(buff+4,head_length);
  int4store(buff+8, mrg->min_pack_length);
  int4store(buff+12,mrg->max_pack_length);
  int4store(buff+16,tot_elements);
  int4store(buff+20,intervall_length);
  int2store(buff+24,trees);
  buff[26]=(char) mrg->ref_length;
  /* Save record pointer length */
  buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
  if (test_only)
    return 0;
  /* The header goes at the very start of the file. */
  my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
                  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
}
2027
2028 /* Write fieldinfo to new packed file */
2029
/*
  Write the per-column compression descriptors to the packed file.

  SYNOPSIS
    write_field_info()
    counts   Array of per-column statistics (one element per field).
    fields   Number of columns.
    trees    Number of distinct Huffman trees in use.

  NOTE
    For each column this emits, as a bit stream: field type (5 bits),
    pack type flags (6 bits), either max_zero_fill or length_bits
    (5 bits, depending on PACK_TYPE_ZERO_FILL), and the zero-based
    Huffman tree number in just enough bits to address all trees.
*/
static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
{
  reg1 uint i;
  uint huff_tree_bits;
  /* Bits needed to store a tree number in the range 0..trees-1. */
  huff_tree_bits=max_bit(trees ? trees-1 : 0);

  DBUG_PRINT("info", (" "));
  DBUG_PRINT("info", ("column types:"));
  DBUG_PRINT("info", ("FIELD_NORMAL          0"));
  DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
  DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
  DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
  DBUG_PRINT("info", ("FIELD_BLOB            4"));
  DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
  DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
  DBUG_PRINT("info", ("FIELD_ZERO            7"));
  DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
  DBUG_PRINT("info", ("FIELD_CHECK           9"));
  DBUG_PRINT("info", (" "));
  DBUG_PRINT("info", ("pack type as a set of flags:"));
  DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
  DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
  DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
  {
    printf("\n");
    printf("column types:\n");
    printf("FIELD_NORMAL          0\n");
    printf("FIELD_SKIP_ENDSPACE   1\n");
    printf("FIELD_SKIP_PRESPACE   2\n");
    printf("FIELD_SKIP_ZERO       3\n");
    printf("FIELD_BLOB            4\n");
    printf("FIELD_CONSTANT        5\n");
    printf("FIELD_INTERVALL       6\n");
    printf("FIELD_ZERO            7\n");
    printf("FIELD_VARCHAR         8\n");
    printf("FIELD_CHECK           9\n");
    printf("\n");
    printf("pack type as a set of flags:\n");
    printf("PACK_TYPE_SELECTED      1\n");
    printf("PACK_TYPE_SPACE_FIELDS  2\n");
    printf("PACK_TYPE_ZERO_FILL     4\n");
    printf("\n");
  }
  /* 'i' runs 1..fields inside the body (post-increment in the test). */
  for (i=0 ; i++ < fields ; counts++)
  {
    write_bits((ulonglong) (int) counts->field_type, 5);
    write_bits(counts->pack_type,6);
    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
      write_bits(counts->max_zero_fill,5);
    else
      write_bits(counts->length_bits,5);
    write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
    DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
                        "lbits: %2u  tree: %2u  length: %4u",
                        i , counts->field_type, counts->pack_type,
                        counts->max_zero_fill, counts->length_bits,
                        counts->tree->tree_number, counts->field_length));
    if (verbose >= 2)
      printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
             "tree: %2u  length: %4u\n", i , counts->field_type,
             counts->pack_type, counts->max_zero_fill, counts->length_bits,
             counts->tree->tree_number, counts->field_length);
  }
  /* Pad the descriptor section to a byte boundary. */
  flush_bits();
  return;
}
2098
/* Write all Huffman trees to the new data file.
   Returns the total count of elements in all trees.
   Returns 0 on error. */
2102
/*
  Write all Huffman decode trees to the packed file and verify that the
  generated encode tables decode back to the original values.

  SYNOPSIS
    write_huff_tree()
    huff_tree   Array of trees (entries with tree_number == 0 are
                joined/deleted and skipped).
    trees       Number of array entries.

  RETURN
    Total count of elements in all written trees, or 0 on error
    (allocation failure, oversized offset, or a corrupt decode tree).
*/
static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
{
  uint i,int_length;
  uint tree_no;
  uint codes;
  uint errors= 0;
  uint *packed_tree,*offset,length;
  my_off_t elements;

  /* Find the highest number of elements in the trees. */
  for (i=length=0 ; i < trees ; i++)
    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
      length=huff_tree[i].elements;
  /*
    Allocate a buffer for packing a decode tree. Two numbers per element
    (left child and right child).
  */
  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
  {
    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
    return 0;
  }

  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
    printf("\n");
  tree_no= 0;
  intervall_length=0;
  for (elements=0; trees-- ; huff_tree++)
  {
    /* Skip columns that have been joined with other columns. */
    if (huff_tree->tree_number == 0)
      continue;				/* Deleted tree */
    tree_no++;
    DBUG_PRINT("info", (" "));
    if (verbose >= 3)
      printf("\n");
    /* Count the total number of elements (byte codes or column values). */
    elements+=huff_tree->elements;
    huff_tree->max_offset=2;
    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
    if (huff_tree->elements <= 1)
      offset=packed_tree;
    else
      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);

    /* This should be the same as 'length' above. */
    huff_tree->offset_bits=max_bit(huff_tree->max_offset);

    /*
      Since we check this during collecting the distinct column values,
      this should never happen.
    */
    if (huff_tree->max_offset >= IS_OFFSET)
    {				/* This should be impossible */
      fprintf(stderr, "Tree offset got too big: %d, aborted\n",
              huff_tree->max_offset);
      my_afree(packed_tree);
      return 0;
    }

    DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
                        "char_bits: %u\n",
                        (ulong) (file_buffer.pos - file_buffer.buffer),
                        huff_tree->elements, (ulong) (offset - packed_tree),
                        huff_tree->char_bits));
    if (!huff_tree->counts->tree_buff)
    {
      /* We do a uchar compression on this column. Mark with bit 0. */
      write_bits(0,1);
      write_bits(huff_tree->min_chr,8);
      write_bits(huff_tree->elements,9);
      write_bits(huff_tree->char_bits,5);
      write_bits(huff_tree->offset_bits,5);
      int_length=0;
    }
    else
    {
      /* Total byte length of the stored distinct column values. */
      int_length=(uint) (huff_tree->counts->tree_pos -
                         huff_tree->counts->tree_buff);
      /* We have distinct column values for this column. Mark with bit 1. */
      write_bits(1,1);
      write_bits(huff_tree->elements,15);
      write_bits(int_length,16);
      write_bits(huff_tree->char_bits,5);
      write_bits(huff_tree->offset_bits,5);
      intervall_length+=int_length;
    }
    DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
                        "offset_bits: %2u  %s: %5u  codelen: %2u",
                        tree_no, huff_tree->elements, huff_tree->char_bits,
                        huff_tree->offset_bits, huff_tree->counts->tree_buff ?
                        "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
                        int_length : huff_tree->min_chr, huff_tree->height));
    if (verbose >= 2)
      printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
             "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
             huff_tree->char_bits, huff_tree->offset_bits,
             huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
             huff_tree->counts->tree_buff ? int_length :
             huff_tree->min_chr, huff_tree->height);

    /* Check that the code tree length matches the element count. */
    length=(uint) (offset-packed_tree);
    if (length != huff_tree->elements*2-2)
    {
      fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
              length, huff_tree->elements * 2 - 2);
      errors++;
      break;
    }

    /* Emit the packed decode tree: offsets get an extra marker bit. */
    for (i=0 ; i < length ; i++)
    {
      if (packed_tree[i] & IS_OFFSET)
        write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
                   huff_tree->offset_bits+1);
      else
        write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
      DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
                          i, (packed_tree[i] & IS_OFFSET) ?
                          " -> " : "", (packed_tree[i] & IS_OFFSET) ?
                          packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
      if (verbose >= 3)
        printf("tree[0x%04x]: %s0x%04x\n",
               i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
               (packed_tree[i] & IS_OFFSET) ?
               packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
    }
    flush_bits();

    /*
      Display coding tables and check their correctness.
    */
    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
    for (i= 0; i < codes; i++)
    {
      ulonglong code;
      uint bits;
      uint len;
      uint idx;

      if (! (len= huff_tree->code_len[i]))
        continue;
      DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
                          hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
                          bindigits(huff_tree->code[i],
                                    huff_tree->code_len[i])));
      if (verbose >= 3)
        printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
               hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
               bindigits(huff_tree->code[i], huff_tree->code_len[i]));

      /* Check that the encode table decodes correctly. */
      code= 0;
      bits= 0;
      idx= 0;
      DBUG_EXECUTE_IF("forcechkerr1", len--;);
      DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
      DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
      /* Feed the code bit by bit into the packed decode tree. */
      for (;;)
      {
        if (! len)
        {
          fflush(stdout);
          fprintf(stderr, "error: code 0x%s with %u bits not found\n",
                  hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
          errors++;
          break;
        }
        code<<= 1;
        code|= (huff_tree->code[i] >> (--len)) & 1;
        bits++;
        if (bits > 8 * sizeof(code))
        {
          fflush(stdout);
          fprintf(stderr, "error: Huffman code too long: %u/%u\n",
                  bits, (uint) (8 * sizeof(code)));
          errors++;
          break;
        }
        idx+= (uint) code & 1;
        if (idx >= length)
        {
          fflush(stdout);
          fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
          errors++;
          break;
        }
        if (packed_tree[idx] & IS_OFFSET)
          idx+= packed_tree[idx] & ~IS_OFFSET;
        else
          break; /* Hit a leaf. This contains the result value. */
      }
      if (errors)
        break;

      DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
      if (packed_tree[idx] != i)
      {
        fflush(stdout);
        fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
                packed_tree[idx], i);
        errors++;
        break;
      }
    } /*end for (codes)*/
    if (errors)
      break;

    /* Write column values in case of distinct column value compression. */
    if (huff_tree->counts->tree_buff)
    {
      for (i=0 ; i < int_length ; i++)
      {
        write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
        DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
                            i, (uchar) huff_tree->counts->tree_buff[i]));
        if (verbose >= 3)
          printf("column_values[0x%04x]: 0x%02x\n",
                 i, (uchar) huff_tree->counts->tree_buff[i]);
      }
    }
    flush_bits();
  }
  DBUG_PRINT("info", (" "));
  if (verbose >= 2)
    printf("\n");
  my_afree(packed_tree);
  if (errors)
  {
    fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
    return 0;
  }
  return elements;
}
2339
2340
/*
  Pack a Huffman tree into a flat array form usable for decoding.

  SYNOPSIS
    make_offset_code_tree()
    huff_tree   The tree being packed; its max_offset is updated as a
                side effect with the largest relative offset produced.
    element     Current node. Must be an inner node with two children.
    offset      Next free pair of array slots.

  NOTE
    Each inner node occupies two adjacent uints: slot 0 for the left
    child, slot 1 for the right. A slot holds either a leaf value
    (uchar code or column value index) or, with the IS_OFFSET bit set,
    the relative distance to the child node's own slot pair.

  RETURN
    Pointer just past the last slot written for this subtree.
*/
static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
                                   uint *offset)
{
  uint *prev_offset;

  prev_offset= offset;
  /*
    'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
    then there is no left child and, hence no right child either. This
    is a property of a binary tree. An element is either a node with two
    childs, or a leaf without childs.

    The current element is always a node with two childs. Go left first.
  */
  if (!element->a.nod.left->a.leaf.null)
  {
    /* Store the uchar code or the index of the column value. */
    prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
    offset+=2;
  }
  else
  {
    /*
      Recursively traverse the tree to the left. Mark it as an offset to
      another tree node (in contrast to a uchar code or column value index).
    */
    /* The left child's pair always follows directly: offset 2. */
    prev_offset[0]= IS_OFFSET+2;
    offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
  }

  /* Now, check the right child. */
  if (!element->a.nod.right->a.leaf.null)
  {
    /* Store the uchar code or the index of the column value. */
    prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
    return offset;
  }
  else
  {
    /*
      Recursively traverse the tree to the right. Mark it as an offset to
      another tree node (in contrast to a uchar code or column value index).
    */
    /* Distance from slot 1 to where the right subtree will be packed. */
    uint temp=(uint) (offset-prev_offset-1);
    prev_offset[1]= IS_OFFSET+ temp;
    if (huff_tree->max_offset < temp)
      huff_tree->max_offset = temp;
    return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
  }
}
2391
/* Get number of bits needed to represent value */
2393
/*
  Return the number of bits needed to represent 'value'.
  Note: both 0 and 1 yield 1 (the result is always at least one bit).
*/
static uint max_bit(register uint value)
{
  uint bits= 1;

  for (value>>= 1; value; value>>= 1)
    bits++;
  return bits;
}
2402
2403
/*
  Read all records from the source table(s) and write them in packed
  (Huffman compressed) form into the output file buffer.

  SYNOPSIS
    compress_maria_file()
    mrg          Source table(s) and merge state; on return its
                 ref_length / min_pack_length / max_pack_length are set.
    huff_counts  Per-column statistics and Huffman trees.

  RETURN
    0   ok
    !=0 read, write or allocation error

  Fixes vs. previous revision:
  - early exit now uses DBUG_RETURN (the plain 'return' after DBUG_ENTER
    corrupted the dbug call stack),
  - the all-spaces scan checks the bound before dereferencing (the old
    operand order could read one byte past the field),
  - the FIELD_BLOB debug print passed *start_pos instead of *blob to
    bindigits() (debug output only).
*/
static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
{
  int error;
  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
  uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
  my_off_t record_count;
  char llbuf[32];
  ulong length,pack_length;
  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
  HUFF_COUNTS *count,*end_count;
  HUFF_TREE *tree;
  MARIA_HA *isam_file=mrg->file[0];
  uint pack_version= (uint) isam_file->s->pack.version;
  DBUG_ENTER("compress_maria_file");

  /* Allocate a buffer for the records (excluding blobs). */
  if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
    DBUG_RETURN(-1);

  end_count=huff_counts+isam_file->s->base.fields;
  min_record_length= (uint) ~0;
  max_record_length=0;
  null_bytes= isam_file->s->base.null_bytes;

  /*
    Calculate the maximum number of bits required to pack the records.
    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
    The tree height determines the maximum number of bits per value.
    Some fields skip leading or trailing spaces or zeroes. The skipped
    number of bytes is encoded by 'length_bits' bits.
    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
    and varchar get a leading 0 bit.
  */
  max_calc_length= null_bytes;
  for (i= 0 ; i < isam_file->s->base.fields ; i++)
  {
    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
      huff_counts[i].max_zero_fill=0;
    if (huff_counts[i].field_type == FIELD_CONSTANT ||
        huff_counts[i].field_type == FIELD_ZERO ||
        huff_counts[i].field_type == FIELD_CHECK)
      continue;
    if (huff_counts[i].field_type == FIELD_INTERVALL)
      max_calc_length+=huff_counts[i].tree->height;
    else if (huff_counts[i].field_type == FIELD_BLOB ||
             huff_counts[i].field_type == FIELD_VARCHAR)
      max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
    else
      max_calc_length+=
        (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
        huff_counts[i].tree->height+huff_counts[i].length_bits;
  }
  /* Bits -> bytes, rounded up. */
  max_calc_length= (max_calc_length + 7) / 8;
  pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
  record_count=0;
  /* 'max_blob_length' is the max length of all blobs of a record. */
  pack_blob_length= isam_file->s->base.blobs ?
                    _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
  max_pack_length=pack_ref_length+pack_blob_length;

  DBUG_PRINT("fields", ("==="));
  mrg_reset(mrg);
  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
  {
    ulong tot_blob_length=0;
    if (! error)
    {
      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
                       null_bytes))
        break;
      /* Reserve room for the (variable size) length header up front. */
      record_pos= file_buffer.pos;
      file_buffer.pos+= max_pack_length;
      if (null_bytes)
      {
        /* Copy null bits 'as is' */
        memcpy(file_buffer.pos, record, null_bytes);
        file_buffer.pos+= null_bytes;
      }
      for (start_pos=record+null_bytes, count= huff_counts;
           count < end_count ;
           count++)
      {
        end_pos=start_pos+(field_length=count->field_length);
        tree=count->tree;

        DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
                              "lbits: %2u  tree: %2u  length: %4u",
                              (ulong) (count - huff_counts + 1),
                              count->field_type,
                              count->pack_type, count->max_zero_fill,
                              count->length_bits, count->tree->tree_number,
                              count->field_length));

        /* Check if the column contains spaces only. */
        if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
        {
          /* Bounds check before dereference to avoid a one-past-end read. */
          for (pos=start_pos ; pos < end_pos && *pos == ' ' ; pos++) ;
          if (pos == end_pos)
          {
            DBUG_PRINT("fields",
                       ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
            DBUG_PRINT("fields", ("---"));
            write_bits(1,1);
            start_pos=end_pos;
            continue;
          }
          DBUG_PRINT("fields",
                     ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
          write_bits(0,1);
        }
        end_pos-=count->max_zero_fill;
        field_length-=count->max_zero_fill;

        switch (count->field_type) {
        case FIELD_SKIP_ZERO:
          if (!memcmp(start_pos, zero_string, field_length))
          {
            DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
            write_bits(1,1);
            start_pos=end_pos;
            break;
          }
          DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
          write_bits(0,1);
          /* Fall through */
        case FIELD_NORMAL:
          DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
                                (ulong) (end_pos - start_pos)));
          for ( ; start_pos < end_pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          break;
        case FIELD_SKIP_ENDSPACE:
          for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
          length= (ulong) (end_pos - pos);
          if (count->pack_type & PACK_TYPE_SELECTED)
          {
            if (length > count->min_space)
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
                          length, field_length, count->length_bits));
              write_bits(1,1);
              write_bits(length,count->length_bits);
            }
            else
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_ENDSPACE not more than min_space, "
                          "bits:  1"));
              write_bits(0,1);
              pos=end_pos;
            }
          }
          else
          {
            DBUG_PRINT("fields",
                       ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
                        length, field_length, count->length_bits));
            write_bits(length,count->length_bits);
          }
          /* Encode all significant bytes. */
          DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
                                (ulong) (pos - start_pos)));
          for ( ; start_pos < pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          start_pos=end_pos;
          break;
        case FIELD_SKIP_PRESPACE:
          for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
          length= (ulong) (pos - start_pos);
          if (count->pack_type & PACK_TYPE_SELECTED)
          {
            if (length > count->min_space)
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
                          length, field_length, count->length_bits));
              write_bits(1,1);
              write_bits(length,count->length_bits);
            }
            else
            {
              DBUG_PRINT("fields",
                         ("FIELD_SKIP_PRESPACE not more than min_space, "
                          "bits:  1"));
              pos=start_pos;
              write_bits(0,1);
            }
          }
          else
          {
            DBUG_PRINT("fields",
                       ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
                        length, field_length, count->length_bits));
            write_bits(length,count->length_bits);
          }
          /* Encode all significant bytes. */
          DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
                                (ulong) (end_pos - start_pos)));
          for (start_pos=pos ; start_pos < end_pos ; start_pos++)
          {
            DBUG_PRINT("fields",
                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                        (uchar) *start_pos,
                        hexdigits(tree->code[(uchar) *start_pos]),
                        (uint) tree->code_len[(uchar) *start_pos],
                        bindigits(tree->code[(uchar) *start_pos],
                                  (uint) tree->code_len[(uchar) *start_pos])));
            write_bits(tree->code[(uchar) *start_pos],
                       (uint) tree->code_len[(uchar) *start_pos]);
          }
          break;
        case FIELD_CONSTANT:
        case FIELD_ZERO:
        case FIELD_CHECK:
          DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
          start_pos=end_pos;
          break;
        case FIELD_INTERVALL:
          global_count=count;
          pos=(uchar*) tree_search(&count->int_tree, start_pos,
                                   count->int_tree.custom_arg);
          intervall=(uint) (pos - count->tree_buff)/field_length;
          DBUG_PRINT("fields", ("FIELD_INTERVALL"));
          DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
                                intervall, hexdigits(tree->code[intervall]),
                                (uint) tree->code_len[intervall]));
          write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
          start_pos=end_pos;
          break;
        case FIELD_BLOB:
        {
          ulong blob_length= _ma_calc_blob_length(field_length-
                                                  portable_sizeof_char_ptr,
                                                  start_pos);
          /* Empty blobs are encoded with a single 1 bit. */
          if (!blob_length)
          {
            DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
            write_bits(1,1);
          }
          else
          {
            uchar *blob,*blob_end;
            DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
            write_bits(0,1);
            /* Write the blob length. */
            DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
                                  blob_length, count->length_bits));
            write_bits(blob_length,count->length_bits);
            memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
            blob_end=blob+blob_length;
            /* Encode the blob bytes. */
            for ( ; blob < blob_end ; blob++)
            {
              /* Debug print now reports *blob (was *start_pos by mistake). */
              DBUG_PRINT("fields",
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                          (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
                          (uint) tree->code_len[(uchar) *blob],
                          bindigits(tree->code[(uchar) *blob],
                                    (uint) tree->code_len[(uchar) *blob])));
              write_bits(tree->code[(uchar) *blob],
                         (uint) tree->code_len[(uchar) *blob]);
            }
            tot_blob_length+=blob_length;
          }
          start_pos= end_pos;
          break;
        }
        case FIELD_VARCHAR:
        {
          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
          ulong col_length= (var_pack_length == 1 ?
                             (uint) *(uchar*) start_pos :
                             uint2korr(start_pos));
          /* Empty varchar are encoded with a single 1 bit. */
          if (!col_length)
          {
            DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
            write_bits(1,1);			/* Empty varchar */
          }
          else
          {
            uchar *end= start_pos + var_pack_length + col_length;
            DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
            write_bits(0,1);
            /* Write the varchar length. */
            DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
                                  col_length, count->length_bits));
            write_bits(col_length,count->length_bits);
            /* Encode the varchar bytes. */
            for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
            {
              DBUG_PRINT("fields",
                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
                          (uchar) *start_pos,
                          hexdigits(tree->code[(uchar) *start_pos]),
                          (uint) tree->code_len[(uchar) *start_pos],
                          bindigits(tree->code[(uchar) *start_pos],
                                    (uint)tree->code_len[(uchar) *start_pos])));
              write_bits(tree->code[(uchar) *start_pos],
                         (uint) tree->code_len[(uchar) *start_pos]);
            }
          }
          start_pos= end_pos;
          break;
        }
        case FIELD_LAST:
        case FIELD_enum_val_count:
          abort();				/* Impossible */
        }
        start_pos+=count->max_zero_fill;
        DBUG_PRINT("fields", ("---"));
      }
      flush_bits();
      length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
      pack_length= _ma_save_pack_length(pack_version, record_pos, length);
      if (pack_blob_length)
        pack_length+= _ma_save_pack_length(pack_version,
                                           record_pos + pack_length,
                                           tot_blob_length);
      DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
                            "length-bytes: %lu", (ulong) record_count, length,
                            tot_blob_length, pack_length));
      DBUG_PRINT("fields", ("==="));

      /* Correct file buffer if the header was smaller */
      if (pack_length != max_pack_length)
      {
        bmove(record_pos+pack_length,record_pos+max_pack_length,length);
        file_buffer.pos-= (max_pack_length-pack_length);
      }
      if (length < (ulong) min_record_length)
        min_record_length=(uint) length;
      if (length > (ulong) max_record_length)
        max_record_length=(uint) length;
      record_count++;
      if (write_loop && record_count % WRITE_COUNT == 0)
      {
        printf("%lu\r", (ulong) record_count);
        fflush(stdout);
      }
    }
    else if (error != HA_ERR_RECORD_DELETED)
      break;
  }
  if (error == HA_ERR_END_OF_FILE)
    error=0;
  else
  {
    fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
  }
  if (verbose >= 2)
    printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));

  my_safe_afree(record, isam_file->s->base.reclength);
  mrg->ref_length=max_pack_length;
  mrg->min_pack_length=max_record_length ? min_record_length : 0;
  mrg->max_pack_length=max_record_length;
  DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
}
2789
2790
make_new_name(char * new_name,char * old_name)2791 static char *make_new_name(char *new_name, char *old_name)
2792 {
2793 return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2794 }
2795
make_old_name(char * new_name,char * old_name)2796 static char *make_old_name(char *new_name, char *old_name)
2797 {
2798 return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2799 }
2800
/* Routines for the bit-writing buffer */
2802
/*
  Initialize the global file_buffer for bit-stream I/O on 'file'.

  SYNOPSIS
    init_file_buffer()
    file         File descriptor to operate on.
    read_buffer  Nonzero: set up for reading; zero: set up for writing.

  NOTE(review): the my_malloc() result is not checked for NULL; a
  failed allocation would crash on first buffer use -- confirm this is
  acceptable for the tool or add a check at the callers.
*/
static void init_file_buffer(File file, pbool read_buffer)
{
  file_buffer.file=file;
  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
                                         MYF(MY_WME));
  /* Keep 8 bytes headroom so write_bits() may overrun 'end' safely. */
  file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
  file_buffer.pos_in_file=0;
  error_on_write=0;
  if (read_buffer)
  {
    /* pos == end: buffer appears exhausted (reader side not shown here). */
    file_buffer.pos=file_buffer.end;
    file_buffer.bits=0;
  }
  else
  {
    /* Empty write buffer; a full bit bucket's worth of free bits. */
    file_buffer.pos=file_buffer.buffer;
    file_buffer.bits=BITS_SAVED;
  }
  file_buffer.bitbucket= 0;
}
2824
2825
/*
  Make room for 'neaded_length' more bytes in the write buffer,
  flushing buffered data to the file and growing the buffer if it can
  never hold that much.

  SYNOPSIS
    flush_buffer()
    neaded_length   Bytes the caller is about to write. ~(ulong) 0
                    means: unconditionally flush whatever is buffered.

  RETURN
    0  ok
    1  write or realloc failure (error_on_write is set on write errors)
*/
static int flush_buffer(ulong neaded_length)
{
  ulong length;

  /*
    file_buffer.end is 8 bytes lower than the real end of the buffer.
    This is done so that the end-of-buffer condition does not need to be
    checked for every uchar (see write_bits()). Consequently,
    file_buffer.pos can become greater than file_buffer.end. The
    algorithms in the other functions ensure that there will never be
    more than 8 bytes written to the buffer without an end-of-buffer
    check. So the buffer cannot be overrun. But we need to check for the
    near-to-buffer-end condition to avoid a negative result, which is
    casted to unsigned and thus becomes giant.
  */
  if ((file_buffer.pos < file_buffer.end) &&
      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
    return 0;
  length=(ulong) (file_buffer.pos-file_buffer.buffer);
  file_buffer.pos=file_buffer.buffer;
  file_buffer.pos_in_file+=length;
  if (test_only)
    return 0;
  if (error_on_write|| my_write(file_buffer.file,
                                (const uchar*) file_buffer.buffer,
                                length,
                                MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
  {
    error_on_write=1;
    return 1;
  }

  /* Grow the buffer if even an empty one could not hold the request. */
  if (neaded_length != ~(ulong) 0 &&
      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
  {
    uchar *tmp;
    neaded_length+=256;				/* some margin */
    tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
    if (!tmp)
      return 1;
    /* Rebase pos before replacing the buffer pointer (offset is 0 here). */
    file_buffer.pos= (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
    file_buffer.buffer= tmp;
    file_buffer.end= (tmp+neaded_length-8);
  }
  return 0;
}
2872
2873
/* Release the buffer allocated by init_file_buffer(). */
static void end_file_buffer(void)
{
  my_free(file_buffer.buffer);
}
2878
/* Output the `bits` low bits of `value` */
2880
/*
  Append the 'bits' low bits of 'value' to the output bit stream.

  Bits are accumulated MSB-first in file_buffer.bitbucket;
  file_buffer.bits counts the free bit positions remaining in the
  bucket. When the bucket would overflow, it is written out as
  BITS_SAVED/8 big-endian bytes and the leftover low bits of 'value'
  start a new bucket.
*/
static void write_bits(register ulonglong value, register uint bits)
{
  /* 'value' must fit in 'bits' (or use the full word). */
  DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
              (bits == 8 * sizeof(value)));

  if ((file_buffer.bits-= (int) bits) >= 0)
  {
    /* Everything fits into the current bucket. */
    file_buffer.bitbucket|= value << file_buffer.bits;
  }
  else
  {
    reg3 ulonglong bit_buffer;
    /* 'bits' becomes the number of low bits of 'value' that did not fit. */
    bits= (uint) -file_buffer.bits;
    /* The ternary guards against a full-width shift (undefined in C). */
    bit_buffer= (file_buffer.bitbucket |
                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
#if BITS_SAVED == 64
    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
#endif
    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
    *file_buffer.pos++= (uchar) (bit_buffer);

    /* Keep only the bits that still have to be written. */
    if (bits != 8 * sizeof(value))
      value&= (((ulonglong) 1) << bits) - 1;
    if (file_buffer.pos >= file_buffer.end)
      flush_buffer(~ (ulong) 0);
    file_buffer.bits=(int) (BITS_SAVED - bits);
    file_buffer.bitbucket= value << (BITS_SAVED - bits);
  }
  return;
}
2916
2917 /* Flush bits in bit_buffer to buffer */
2918
static void flush_bits(void)
{
  int bits;
  ulonglong bit_buffer;

  /* Round the pending bit count down to a whole number of bytes */
  bits= file_buffer.bits & ~7;
  bit_buffer= file_buffer.bitbucket >> bits;
  bits= BITS_SAVED - bits;
  /* Emit the used bytes of the bucket, most significant first */
  while (bits > 0)
  {
    bits-= 8;
    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
  }
  if (file_buffer.pos >= file_buffer.end)
    flush_buffer(~ (ulong) 0);
  /* Reset to an empty bucket */
  file_buffer.bits= BITS_SAVED;
  file_buffer.bitbucket= 0;
}
2937
2938
2939 /****************************************************************************
2940 ** functions to handle the joined files
2941 ****************************************************************************/
2942
/*
  Update the state header of a single packed table and write it to the
  index file.

  The data file type is switched to COMPRESSED_RECORD (the original type
  is preserved so the packing can be undone later), all keys are marked
  inactive (they must be rebuilt with aria_chk), and the key file is
  truncated to just the header area.

  Returns the result of _ma_state_info_write_sub() (0 on success).
*/

static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
                      my_off_t new_length,
                      ha_checksum crc)
{
  MARIA_SHARE *share=isam_file->s;
  uint options=mi_uint2korr(share->state.header.options);
  uint key;
  DBUG_ENTER("save_state");

  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
  mi_int2store(share->state.header.options,options);
  /* Save the original file type if we have to undo the packing later */
  share->state.header.org_data_file_type= share->state.header.data_file_type;
  share->state.header.data_file_type= COMPRESSED_RECORD;

  share->state.state.data_file_length=new_length;
  share->state.state.del=0;
  share->state.state.empty=0;
  share->state.dellink= HA_OFFSET_ERROR;
  share->state.split=(ha_rows) mrg->records;
  share->state.version=(ulong) time((time_t*) 0);
  /* Transactional tables need fresh LSNs after the file was rewritten */
  if (share->base.born_transactional)
    share->state.create_rename_lsn= share->state.is_of_horizon=
      share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
  {
    /*
      Some indexes are disabled, cannot use current key_file_length value
      as an estimate of upper bound of index file size. Use packed data file
      size instead.
    */
    share->state.state.key_file_length= new_length;
  }
  /*
    If there are no disabled indexes, keep key_file_length value from
    original file so "aria_chk -rq" can use this value (this is necessary
    because index size cannot be easily calculated for fulltext keys)
  */
  maria_clear_all_keys_active(share->state.key_map);
  for (key=0 ; key < share->base.keys ; key++)
    share->state.key_root[key]= HA_OFFSET_ERROR;
  share->state.key_del= HA_OFFSET_ERROR;
  share->state.state.checksum= crc;	/* Save crc in file */
  share->changed=1;			/* Force write of header */
  share->state.open_count=0;
  share->global_changed=0;
  /* Drop all index pages; keys must be rebuilt after packing */
  my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
  if (share->base.keys)
    isamchk_neaded=1;
  DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
                                       &share->state,
                                       MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                       MA_STATE_INFO_WRITE_FULL_INFO));
}
2997
2998
/*
  Write the state header for the result of joining (merging) several
  packed tables into one.

  A copy of the first source table's state is taken and adjusted for the
  merged result: record counts are summed from the merge info, all keys
  are marked inactive, and the table is flagged as changed/not analyzed
  so a later check will rebuild the indexes.

  Returns the result of _ma_state_info_write_sub() (0 on success).
*/

static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
                          ha_checksum crc)
{
  MARIA_STATE_INFO state;
  MARIA_HA *isam_file=mrg->file[0];
  uint options;
  DBUG_ENTER("save_state_mrg");

  state= isam_file->s->state;
  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
            HA_OPTION_READ_ONLY_DATA);
  mi_int2store(state.header.options,options);
  /* Save the original file type if we have to undo the packing later */
  state.header.org_data_file_type= state.header.data_file_type;
  state.header.data_file_type= COMPRESSED_RECORD;

  state.state.data_file_length=new_length;
  state.state.del=0;
  state.state.empty=0;
  state.state.records=state.split=(ha_rows) mrg->records;
  /* The merged file is new; it needs fresh LSNs */
  state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
    LSN_NEEDS_NEW_STATE_LSNS;

  /* See comment above in save_state about key_file_length handling. */
  if (mrg->src_file_has_indexes_disabled)
  {
    isam_file->s->state.state.key_file_length=
      MY_MAX(isam_file->s->state.state.key_file_length, new_length);
  }
  state.dellink= HA_OFFSET_ERROR;
  state.version=(ulong) time((time_t*) 0);
  maria_clear_all_keys_active(state.key_map);
  state.state.checksum=crc;
  if (isam_file->s->base.keys)
    isamchk_neaded=1;
  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
  DBUG_RETURN (_ma_state_info_write_sub(file, &state,
                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
                                        MA_STATE_INFO_WRITE_FULL_INFO));
}
3039
3040
3041 /* reset for mrg_rrnd */
3042
/* Stop caching on the table currently being scanned and reset the cursor */

static void mrg_reset(PACK_MRG_INFO *mrg)
{
  if (!mrg->current)
    return;                                     /* No scan in progress */
  maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
  mrg->current=0;
}
3051
/*
  Read the next record, scanning the merged tables one after another.

  On the first call the scan is initialized on the first table.  When a
  table reaches end-of-file its scan is closed and the scan continues
  with the next table in info->file[].

  Returns 0 on success, HA_ERR_END_OF_FILE when all tables are
  exhausted, or another maria_scan()/maria_scan_init() error code.
*/

static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
{
  int error;
  MARIA_HA *isam_info;

  if (!info->current)
  {
    /* First call: start scanning the first table */
    isam_info= *(info->current=info->file);
    info->end=info->current+info->count;
    maria_reset(isam_info);
    maria_extra(isam_info, HA_EXTRA_CACHE, 0);
    if ((error= maria_scan_init(isam_info)))
      return(error);
  }
  else
    isam_info= *info->current;

  for (;;)
  {
    /* Return on success or on any error other than end-of-file */
    if (!(error= maria_scan(isam_info, buf)) ||
        error != HA_ERR_END_OF_FILE)
      return (error);
    /* Current table exhausted; close its scan and advance to the next */
    maria_scan_end(isam_info);
    maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
    if (info->current+1 == info->end)
      return(HA_ERR_END_OF_FILE);
    info->current++;
    isam_info= *info->current;
    maria_reset(isam_info);
    maria_extra(isam_info,HA_EXTRA_CACHE, 0);
    if ((error= maria_scan_init(isam_info)))
      return(error);
  }
}
3086
3087
/* Close every table in the merge set; free the table array if we own it */

static int mrg_close(PACK_MRG_INFO *mrg)
{
  int result= 0;
  uint idx;
  DBUG_ENTER("mrg_close");

  for (idx= 0; idx < mrg->count; idx++)
    result|= maria_close(mrg->file[idx]);
  if (mrg->free_file)
    my_free(mrg->file);                 /* Array was allocated by us */
  DBUG_RETURN(result);
}
3100
3101
3102 #if !defined(DBUG_OFF)
3103 /*
3104 Fake the counts to get big Huffman codes.
3105
3106 SYNOPSIS
3107 fakebigcodes()
3108 huff_counts A pointer to the counts array.
3109 end_count A pointer past the counts array.
3110
3111 DESCRIPTION
3112
3113 Huffman coding works by removing the two least frequent values from
3114 the list of values and add a new value with the sum of their
3115 incidences in a loop until only one value is left. Every time a
3116 value is reused for a new value, it gets one more bit for its
3117 encoding. Hence, the least frequent values get the longest codes.
3118
3119 To get a maximum code length for a value, two of the values must
3120 have an incidence of 1. As their sum is 2, the next infrequent value
3121 must have at least an incidence of 2, then 4, 8, 16 and so on. This
3122 means that one needs 2**n bytes (values) for a code length of n
3123 bits. However, using more distinct values forces the use of longer
3124 codes, or reaching the code length with less total bytes (values).
3125
3126 To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3127 I assign counts of 1 to the two most frequent values, a count of 2
3128 for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3129 the remaining values get 1. That way every possible uchar has an
3130 assigned code, though not all codes are used if not all uchar values
3131 are present in the column.
3132
3133 This strategy would work with distinct column values too, but
3134 requires that at least 64(32) values are present. To make things
3135 easier here, I cancel all distinct column values and force byte
3136 compression for all columns.
3137
3138 RETURN
3139 void
3140 */
3141
/*
  Fake the byte counts of every column so that Huffman code assignment
  produces maximum-length (64/32 bit) codes.  Debug-only helper; see the
  block comment above for the full strategy.

  Fix: the loop body used `huff_counts` (the array start) instead of the
  loop cursor `count`, so distinct column values were only removed for
  the FIRST column; all other columns kept their value trees, contrary
  to the documented intent of forcing byte compression for all columns.
*/

static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
{
  HUFF_COUNTS *count;
  my_off_t *cur_count_p;
  my_off_t *end_count_p;
  my_off_t **cur_sort_p;
  my_off_t **end_sort_p;
  my_off_t *sort_counts[256];
  my_off_t total;
  DBUG_ENTER("fakebigcodes");

  for (count= huff_counts; count < end_count; count++)
  {
    /*
      Remove distinct column values of THIS column (use the loop cursor,
      not the array start).
    */
    if (count->tree_buff)
    {
      my_free(count->tree_buff);
      delete_tree(&count->int_tree, 0);
      count->tree_buff= NULL;
      DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
    }

    /*
      Sort counts by decreasing incidence.
    */
    cur_count_p= count->counts;
    end_count_p= cur_count_p + 256;
    cur_sort_p= sort_counts;
    while (cur_count_p < end_count_p)
      *(cur_sort_p++)= cur_count_p++;
    (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);

    /*
      Assign faked counts: 1, 1, 2, 4, 8, ... doubling up to the word
      size, so the Huffman tree degenerates into a maximal-depth chain.
    */
    cur_sort_p= sort_counts;
#if SIZEOF_LONG_LONG > 4
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
#else
    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
#endif
    /* Most frequent value gets a faked count of 1. */
    **(cur_sort_p++)= 1;
    total= 1;
    while (cur_sort_p < end_sort_p)
    {
      **(cur_sort_p++)= total;
      total<<= 1;
    }
    /* Set the last value. */
    **(cur_sort_p++)= --total;
    /*
      Set the remaining counts so every possible uchar has a code.
    */
    end_sort_p= sort_counts + 256;
    while (cur_sort_p < end_sort_p)
      **(cur_sort_p++)= 1;
  }
  DBUG_VOID_RETURN;
}
3204
3205
3206 /*
3207 Compare two counts for reverse sorting.
3208
3209 SYNOPSIS
3210 fakecmp()
3211 count1 One count.
3212 count2 Another count.
3213
3214 RETURN
3215 1 count1 < count2
3216 0 count1 == count2
3217 -1 count1 > count2
3218 */
3219
/*
  qsort() callback for fakebigcodes(): order pointed-to counts in
  DECREASING order (larger count sorts first).
*/

static int fakecmp(my_off_t **count1, my_off_t **count2)
{
  my_off_t a= **count1;
  my_off_t b= **count2;

  if (a < b)
    return 1;
  if (a > b)
    return -1;
  return 0;
}
3225 #endif
3226
3227 #include "ma_check_standalone.h"
3228
3229