1 /* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /* Pack MyISAM file */
24
25 #ifndef USE_MY_FUNC
26 #define USE_MY_FUNC /* We need at least my_malloc */
27 #endif
28
29 #include "myisamdef.h"
30 #include "my_default.h"
31 #include <queues.h>
32 #include <my_tree.h>
33 #include "mysys_err.h"
34 #ifndef __GNU_LIBRARY__
35 #define __GNU_LIBRARY__ /* Skip warnings in getopt.h */
36 #endif
37 #include <my_getopt.h>
38 #include <assert.h>
39 #include <welcome_copyright_notice.h> // ORACLE_WELCOME_COPYRIGHT_NOTICE
40
41 #if SIZEOF_LONG_LONG > 4
42 #define BITS_SAVED 64
43 #else
44 #define BITS_SAVED 32
45 #endif
46
47 #define IS_OFFSET ((uint) 32768) /* Bit if offset or char in tree */
48 #define HEAD_LENGTH 32
49 #define ALLOWED_JOIN_DIFF 256 /* Diff allowed to join trees */
50
51 #define DATA_TMP_EXT ".TMD"
52 #define OLD_EXT ".OLD"
53 #define FRM_EXT ".frm"
54 #define WRITE_COUNT MY_HOW_OFTEN_TO_WRITE
55
56 struct st_file_buffer {
57 File file;
58 uchar *buffer,*pos,*end;
59 my_off_t pos_in_file;
60 int bits;
61 ulonglong bitbucket;
62 };
63
64 struct st_huff_tree;
65 struct st_huff_element;
66
67 typedef struct st_huff_counts {
68 uint field_length,max_zero_fill;
69 uint pack_type;
70 uint max_end_space,max_pre_space,length_bits,min_space;
71 ulong max_length;
72 enum en_fieldtype field_type;
73 struct st_huff_tree *tree; /* Tree for field */
74 my_off_t counts[256];
75 my_off_t end_space[8];
76 my_off_t pre_space[8];
77 my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
78 TREE int_tree; /* Tree for detecting distinct column values. */
79 uchar *tree_buff; /* Column values, 'field_length' each. */
80 uchar *tree_pos; /* Points to end of column values in 'tree_buff'. */
81 } HUFF_COUNTS;
82
83 typedef struct st_huff_element HUFF_ELEMENT;
84
85 /*
86 WARNING: It is crucial for the optimizations in calc_packed_length()
87 that 'count' is the first element of 'HUFF_ELEMENT'.
88 */
89 struct st_huff_element {
90 my_off_t count;
91 union un_element {
92 struct st_nod {
93 HUFF_ELEMENT *left,*right;
94 } nod;
95 struct st_leaf {
96 HUFF_ELEMENT *null;
97 uint element_nr; /* Number of element */
98 } leaf;
99 } a;
100 };
101
102
103 typedef struct st_huff_tree {
104 HUFF_ELEMENT *root,*element_buffer;
105 HUFF_COUNTS *counts;
106 uint tree_number;
107 uint elements;
108 my_off_t bytes_packed;
109 uint tree_pack_length;
110 uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
111 ulonglong *code;
112 uchar *code_len;
113 } HUFF_TREE;
114
115
116 typedef struct st_isam_mrg {
117 MI_INFO **file,**current,**end;
118 uint free_file;
119 uint count;
120 uint min_pack_length; /* Theese is used by packed data */
121 uint max_pack_length;
122 uint ref_length;
123 uint max_blob_length;
124 my_off_t records;
125 /* true if at least one source file has at least one disabled index */
126 my_bool src_file_has_indexes_disabled;
127 } PACK_MRG_INFO;
128
129
130 extern int main(int argc,char * *argv);
131 static void get_options(int *argc,char ***argv);
132 static MI_INFO *open_isam_file(char *name,int mode);
133 static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
134 static int compress(PACK_MRG_INFO *file,char *join_name);
135 static int create_dest_frm(char *source_table, char *dest_table);
136 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
137 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
138 uint trees,
139 HUFF_COUNTS *huff_counts,
140 uint fields);
141 static int compare_tree(void* cmp_arg MY_ATTRIBUTE((unused)),
142 const uchar *s,const uchar *t);
143 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
144 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
145 my_off_t records);
146 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
147 uint max_space_length,my_off_t *space_counts,
148 my_off_t tot_space_count,
149 enum en_fieldtype field_type);
150 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
151 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
152 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
153 static int save_counts_in_queue(uchar *key,element_count count,
154 HUFF_TREE *tree);
155 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
156 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
157 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
158 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
159 HUFF_ELEMENT *element,uint size,
160 ulonglong code);
161 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
162 my_off_t tot_elements,my_off_t filelength);
163 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
164 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
165 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
166 HUFF_ELEMENT *element,
167 uint *offset);
168 static uint max_bit(uint value);
169 static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
170 static char *make_new_name(char *new_name,char *old_name);
171 static char *make_old_name(char *new_name,char *old_name);
172 static void init_file_buffer(File file,pbool read_buffer);
173 static int flush_buffer(ulong neaded_length);
174 static void end_file_buffer(void);
175 static void write_bits(ulonglong value, uint bits);
176 static void flush_bits(void);
177 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
178 ha_checksum crc);
179 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
180 ha_checksum crc);
181 static int mrg_close(PACK_MRG_INFO *mrg);
182 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
183 static void mrg_reset(PACK_MRG_INFO *mrg);
184 #if !defined(DBUG_OFF)
185 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
186 static int fakecmp(my_off_t **count1, my_off_t **count2);
187 #endif
188
189
190 static int error_on_write=0,test_only=0,verbose=0,silent=0,
191 write_loop=0,force_pack=0, isamchk_neaded=0;
192 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
193 static my_bool backup, opt_wait;
194 /*
195 tree_buff_length is somewhat arbitrary. The bigger it is the better
196 the chance to win in terms of compression factor. On the other hand,
197 this table becomes part of the compressed file header. And its length
198 is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
199 */
200 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
201 static char tmp_dir[FN_REFLEN]={0},*join_table;
202 static my_off_t intervall_length;
203 static ha_checksum glob_crc;
204 static struct st_file_buffer file_buffer;
205 static QUEUE queue;
206 static HUFF_COUNTS *global_count;
207 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
208 static const char *load_default_groups[]= { "myisampack",0 };
209
210 /* The main program */
211
main(int argc,char ** argv)212 int main(int argc, char **argv)
213 {
214 int error,ok;
215 PACK_MRG_INFO merge;
216 char **default_argv;
217 MY_INIT(argv[0]);
218
219 if (load_defaults("my",load_default_groups,&argc,&argv))
220 exit(1);
221
222 default_argv= argv;
223 get_options(&argc,&argv);
224
225 error=ok=isamchk_neaded=0;
226 if (join_table)
227 {
228 /*
229 Join files into one and create FRM file for the compressed table only if
230 the compression succeeds
231 */
232 if (open_isam_files(&merge,argv,(uint) argc) ||
233 compress(&merge, join_table) || create_dest_frm(argv[0], join_table))
234 error=1;
235 }
236 else while (argc--)
237 {
238 MI_INFO *isam_file;
239 if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
240 error=1;
241 else
242 {
243 merge.file= &isam_file;
244 merge.current=0;
245 merge.free_file=0;
246 merge.count=1;
247 if (compress(&merge,0))
248 error=1;
249 else
250 ok=1;
251 }
252 }
253 if (ok && isamchk_neaded && !silent)
254 puts("Remember to run myisamchk -rq on compressed tables");
255 (void) fflush(stdout);
256 (void) fflush(stderr);
257 free_defaults(default_argv);
258 my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
259 exit(error ? 2 : 0);
260 #ifndef _lint
261 return 0; /* No compiler warning */
262 #endif
263 }
264
265 enum options_mp {OPT_CHARSETS_DIR_MP=256};
266
267 static struct my_option my_long_options[] =
268 {
269 {"backup", 'b', "Make a backup of the table as table_name.OLD.",
270 &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
271 {"character-sets-dir", OPT_CHARSETS_DIR_MP,
272 "Directory where character sets are.", &charsets_dir,
273 &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
274 {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
275 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
276 {"force", 'f',
277 "Force packing of table even if it gets bigger or if tempfile exists.",
278 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
279 {"join", 'j',
280 "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
281 &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
282 0, 0, 0},
283 {"help", '?', "Display this help and exit.",
284 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
285 {"silent", 's', "Be more silent.",
286 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
287 {"tmpdir", 'T', "Use temporary directory to store temporary table.",
288 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
289 {"test", 't', "Don't pack table, only test packing it.",
290 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
291 {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
292 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
293 {"version", 'V', "Output version information and exit.",
294 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
295 {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
296 &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
297 { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
298 };
299
300
print_version(void)301 static void print_version(void)
302 {
303 printf("%s Ver 1.23 for %s on %s\n",
304 my_progname, SYSTEM_TYPE, MACHINE_TYPE);
305 }
306
307
usage(void)308 static void usage(void)
309 {
310 print_version();
311 puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2002"));
312
313 puts("Pack a MyISAM-table to take much less space.");
314 puts("Keys are not updated, you must run myisamchk -rq on the datafile");
315 puts("afterwards to update the keys.");
316 puts("You should give the .MYI file as the filename argument.");
317
318 printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
319 my_print_help(my_long_options);
320 print_defaults("my", load_default_groups);
321 my_print_variables(my_long_options);
322 }
323
324
325 static my_bool
get_one_option(int optid,const struct my_option * opt MY_ATTRIBUTE ((unused)),char * argument)326 get_one_option(int optid, const struct my_option *opt MY_ATTRIBUTE((unused)),
327 char *argument)
328 {
329 uint length;
330
331 switch(optid) {
332 case 'f':
333 force_pack= 1;
334 tmpfile_createflag= O_RDWR | O_TRUNC;
335 break;
336 case 's':
337 write_loop= verbose= 0;
338 silent= 1;
339 break;
340 case 't':
341 test_only= 1;
342 /* Avoid to reset 'verbose' if it was already set > 1. */
343 if (! verbose)
344 verbose= 1;
345 break;
346 case 'T':
347 length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
348 if (length != dirname_length(tmp_dir))
349 {
350 tmp_dir[length]=FN_LIBCHAR;
351 tmp_dir[length+1]=0;
352 }
353 break;
354 case 'v':
355 verbose++; /* Allow for selecting the level of verbosity. */
356 silent= 0;
357 break;
358 case '#':
359 DBUG_PUSH(argument ? argument : "d:t:o");
360 break;
361 case 'V':
362 print_version();
363 exit(0);
364 case 'I':
365 case '?':
366 usage();
367 exit(0);
368 }
369 return 0;
370 }
371
372 /* reads options */
373 /* Initiates DEBUG - but no debugging here ! */
374
get_options(int * argc,char *** argv)375 static void get_options(int *argc,char ***argv)
376 {
377 int ho_error;
378
379 my_progname= argv[0][0];
380 if (isatty(fileno(stdout)))
381 write_loop=1;
382
383 if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
384 exit(ho_error);
385
386 if (!*argc)
387 {
388 usage();
389 exit(1);
390 }
391 if (join_table)
392 {
393 backup=0; /* Not needed */
394 tmp_dir[0]=0;
395 }
396 return;
397 }
398
399
open_isam_file(char * name,int mode)400 static MI_INFO *open_isam_file(char *name,int mode)
401 {
402 MI_INFO *isam_file;
403 MYISAM_SHARE *share;
404 DBUG_ENTER("open_isam_file");
405
406 if (!(isam_file=mi_open(name,mode,
407 (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
408 HA_OPEN_ABORT_IF_LOCKED))))
409 {
410 (void) fprintf(stderr, "%s gave error %d on open\n", name, my_errno);
411 DBUG_RETURN(0);
412 }
413 share=isam_file->s;
414 if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
415 {
416 if (!force_pack)
417 {
418 (void) fprintf(stderr, "%s is already compressed\n", name);
419 (void) mi_close(isam_file);
420 DBUG_RETURN(0);
421 }
422 if (verbose)
423 puts("Recompressing already compressed table");
424 share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
425 }
426 if (! force_pack && share->state.state.records != 0 &&
427 (share->state.state.records <= 1 ||
428 share->state.state.data_file_length < 1024))
429 {
430 (void) fprintf(stderr, "%s is too small to compress\n", name);
431 (void) mi_close(isam_file);
432 DBUG_RETURN(0);
433 }
434 (void) mi_lock_database(isam_file,F_WRLCK);
435 DBUG_RETURN(isam_file);
436 }
437
438
open_isam_files(PACK_MRG_INFO * mrg,char ** names,uint count)439 static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
440 {
441 uint i,j;
442 mrg->count=0;
443 mrg->current=0;
444 mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
445 mrg->free_file=1;
446 mrg->src_file_has_indexes_disabled= 0;
447 for (i=0; i < count ; i++)
448 {
449 if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
450 goto error;
451
452 mrg->src_file_has_indexes_disabled|=
453 ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
454 mrg->file[i]->s->base.keys);
455 }
456 /* Check that files are identical */
457 for (j=0 ; j < count-1 ; j++)
458 {
459 MI_COLUMNDEF *m1,*m2,*end;
460 if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
461 mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
462 goto diff_file;
463 m1=mrg->file[j]->s->rec;
464 end=m1+mrg->file[j]->s->base.fields;
465 m2=mrg->file[j+1]->s->rec;
466 for ( ; m1 != end ; m1++,m2++)
467 {
468 if (m1->type != m2->type || m1->length != m2->length)
469 goto diff_file;
470 }
471 }
472 mrg->count=count;
473 return 0;
474
475 diff_file:
476 (void) fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
477 my_progname, names[j], names[j+1]);
478 error:
479 while (i--)
480 mi_close(mrg->file[i]);
481 my_free(mrg->file);
482 return 1;
483 }
484
485
compress(PACK_MRG_INFO * mrg,char * result_table)486 static int compress(PACK_MRG_INFO *mrg,char *result_table)
487 {
488 int error;
489 File new_file,join_isam_file;
490 MI_INFO *isam_file;
491 MYISAM_SHARE *share;
492 char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
493 uint i,header_length,fields,trees,used_trees;
494 my_off_t old_length,new_length,tot_elements;
495 HUFF_COUNTS *huff_counts;
496 HUFF_TREE *huff_trees;
497 DBUG_ENTER("compress");
498
499 isam_file=mrg->file[0]; /* Take this as an example */
500 share=isam_file->s;
501 new_file=join_isam_file= -1;
502 trees=fields=0;
503 huff_trees=0;
504 huff_counts=0;
505
506 /* Create temporary or join file */
507
508 if (backup)
509 (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2);
510 else
511 (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16);
512 if (!test_only && result_table)
513 {
514 /* Make a new indexfile based on first file in list */
515 uint length;
516 uchar *buff;
517 strmov(org_name,result_table); /* Fix error messages */
518 (void) fn_format(new_name,result_table,"",MI_NAME_IEXT,2);
519 if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
520 < 0)
521 goto err;
522 length=(uint) share->base.keystart;
523 if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
524 goto err;
525 if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
526 my_write(join_isam_file,buff,length,
527 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
528 {
529 my_free(buff);
530 goto err;
531 }
532 my_free(buff);
533 (void) fn_format(new_name,result_table,"",MI_NAME_DEXT,2);
534 }
535 else if (!tmp_dir[0])
536 (void) make_new_name(new_name,org_name);
537 else
538 (void) fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
539 if (!test_only &&
540 (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
541 goto err;
542
543 /* Start calculating statistics */
544
545 mrg->records=0;
546 for (i=0 ; i < mrg->count ; i++)
547 mrg->records+=mrg->file[i]->s->state.state.records;
548
549 DBUG_PRINT("info", ("Compressing %s: (%lu records)",
550 result_table ? new_name : org_name,
551 (ulong) mrg->records));
552 if (write_loop || verbose)
553 {
554 printf("Compressing %s: (%lu records)\n",
555 result_table ? new_name : org_name, (ulong) mrg->records);
556 }
557 trees=fields=share->base.fields;
558 huff_counts=init_huff_count(isam_file,mrg->records);
559
560 /*
561 Read the whole data file(s) for statistics.
562 */
563 DBUG_PRINT("info", ("- Calculating statistics"));
564 if (write_loop || verbose)
565 printf("- Calculating statistics\n");
566 if (get_statistic(mrg,huff_counts))
567 goto err;
568
569 old_length=0;
570 for (i=0; i < mrg->count ; i++)
571 old_length+= (mrg->file[i]->s->state.state.data_file_length -
572 mrg->file[i]->s->state.state.empty);
573
574 /*
575 Create a global priority queue in preparation for making
576 temporary Huffman trees.
577 */
578 if (init_queue(&queue,256,0,0,compare_huff_elements,0))
579 goto err;
580
581 /*
582 Check each column if we should use pre-space-compress, end-space-
583 compress, empty-field-compress or zero-field-compress.
584 */
585 check_counts(huff_counts,fields,mrg->records);
586
587 /*
588 Build a Huffman tree for each column.
589 */
590 huff_trees=make_huff_trees(huff_counts,trees);
591
592 /*
593 If the packed lengths of combined columns is less then the sum of
594 the non-combined columns, then create common Huffman trees for them.
595 We do this only for byte compressed columns, not for distinct values
596 compressed columns.
597 */
598 if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
599 goto err;
600
601 /*
602 Assign codes to all byte or column values.
603 */
604 if (make_huff_decode_table(huff_trees,fields))
605 goto err;
606
607 /* Prepare a file buffer. */
608 init_file_buffer(new_file,0);
609
610 /*
611 Reserve space in the target file for the fixed compressed file header.
612 */
613 file_buffer.pos_in_file=HEAD_LENGTH;
614 if (! test_only)
615 my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
616
617 /*
618 Write field infos: field type, pack type, length bits, tree number.
619 */
620 write_field_info(huff_counts,fields,used_trees);
621
622 /*
623 Write decode trees.
624 */
625 if (!(tot_elements=write_huff_tree(huff_trees,trees)))
626 goto err;
627
628 /*
629 Calculate the total length of the compression info header.
630 This includes the fixed compressed file header, the column compression
631 type descriptions, and the decode trees.
632 */
633 header_length=(uint) file_buffer.pos_in_file+
634 (uint) (file_buffer.pos-file_buffer.buffer);
635
636 /*
637 Compress the source file into the target file.
638 */
639 DBUG_PRINT("info", ("- Compressing file"));
640 if (write_loop || verbose)
641 printf("- Compressing file\n");
642 error=compress_isam_file(mrg,huff_counts);
643 new_length=file_buffer.pos_in_file;
644 if (!error && !test_only)
645 {
646 uchar buff[MEMMAP_EXTRA_MARGIN]; /* End marginal for memmap */
647 memset(buff, 0, sizeof(buff));
648 error=my_write(file_buffer.file,buff,sizeof(buff),
649 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
650 }
651
652 /*
653 Write the fixed compressed file header.
654 */
655 if (!error)
656 error=write_header(mrg,header_length,used_trees,tot_elements,
657 new_length);
658
659 /* Flush the file buffer. */
660 end_file_buffer();
661
662 /* Display statistics. */
663 DBUG_PRINT("info", ("Min record length: %6d Max length: %6d "
664 "Mean total length: %6ld\n",
665 mrg->min_pack_length, mrg->max_pack_length,
666 (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
667 if (verbose && mrg->records)
668 printf("Min record length: %6d Max length: %6d "
669 "Mean total length: %6ld\n", mrg->min_pack_length,
670 mrg->max_pack_length, (ulong) (new_length/mrg->records));
671
672 /* Close source and target file. */
673 if (!test_only)
674 {
675 error|=my_close(new_file,MYF(MY_WME));
676 if (!result_table)
677 {
678 error|=my_close(isam_file->dfile,MYF(MY_WME));
679 isam_file->dfile= -1; /* Tell mi_close file is closed */
680 }
681 }
682
683 /* Cleanup. */
684 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
685 if (! test_only && ! error)
686 {
687 if (result_table)
688 {
689 error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
690 }
691 else
692 {
693 if (backup)
694 {
695 if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
696 MYF(MY_WME)))
697 error=1;
698 else
699 {
700 if (tmp_dir[0])
701 error=my_copy(new_name,org_name,MYF(MY_WME));
702 else
703 error=my_rename(new_name,org_name,MYF(MY_WME));
704 if (!error)
705 {
706 (void) my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
707 if (tmp_dir[0])
708 (void) my_delete(new_name,MYF(MY_WME));
709 }
710 }
711 }
712 else
713 {
714 if (tmp_dir[0])
715 {
716 error=my_copy(new_name,org_name,
717 MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
718 if (!error)
719 (void) my_delete(new_name,MYF(MY_WME));
720 }
721 else
722 error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
723 }
724 if (! error)
725 error=save_state(isam_file,mrg,new_length,glob_crc);
726 }
727 }
728 error|=mrg_close(mrg);
729 if (join_isam_file >= 0)
730 error|=my_close(join_isam_file,MYF(MY_WME));
731 if (error)
732 {
733 (void) fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
734 (void) my_delete(new_name,MYF(MY_WME));
735 DBUG_RETURN(-1);
736 }
737 if (write_loop || verbose)
738 {
739 if (old_length)
740 printf("%.4g%% \n",
741 (((longlong) (old_length - new_length)) * 100.0 /
742 (longlong) old_length));
743 else
744 puts("Empty file saved in compressed format");
745 }
746 DBUG_RETURN(0);
747
748 err:
749 free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
750 if (new_file >= 0)
751 (void) my_close(new_file,MYF(0));
752 if (join_isam_file >= 0)
753 (void) my_close(join_isam_file,MYF(0));
754 mrg_close(mrg);
755 (void) fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
756 DBUG_RETURN(-1);
757 }
758
759
760 /**
761 Create FRM for the destination table for --join operation
762 Copy the first table FRM as the destination table FRM file. Doing so
763 will help the mysql server to recognize the newly created table.
764 See Bug#36573.
765
766 @param source_table Name of the source table
767 @param dest_table Name of the destination table
768 @retval 0 Successful copy operation
769
770 @note We always return 0 because we don't want myisampack to report error
771 even if the copy operation fails.
772 */
773
create_dest_frm(char * source_table,char * dest_table)774 static int create_dest_frm(char *source_table, char *dest_table)
775 {
776 char source_name[FN_REFLEN], dest_name[FN_REFLEN];
777
778 DBUG_ENTER("create_dest_frm");
779
780 (void) fn_format(source_name, source_table,
781 "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
782 (void) fn_format(dest_name, dest_table,
783 "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
784 /*
785 Error messages produced by my_copy() are suppressed as this
786 is not vital for --join operation. User shouldn't see any error messages
787 like "source file frm not found" and "unable to create destination frm
788 file. So we don't pass the flag MY_WME -Write Message on Error to
789 my_copy()
790 */
791 (void) my_copy(source_name, dest_name, MYF(MY_DONT_OVERWRITE_FILE));
792
793 DBUG_RETURN(0);
794 }
795
796
797 /* Init a huff_count-struct for each field and init it */
798
init_huff_count(MI_INFO * info,my_off_t records)799 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
800 {
801 reg2 uint i;
802 reg1 HUFF_COUNTS *count;
803 if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
804 sizeof(HUFF_COUNTS),
805 MYF(MY_ZEROFILL | MY_WME))))
806 {
807 for (i=0 ; i < info->s->base.fields ; i++)
808 {
809 enum en_fieldtype type;
810 count[i].field_length=info->s->rec[i].length;
811 type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
812 if (type == FIELD_INTERVALL ||
813 type == FIELD_CONSTANT ||
814 type == FIELD_ZERO)
815 type = FIELD_NORMAL;
816 if (count[i].field_length <= 8 &&
817 (type == FIELD_NORMAL ||
818 type == FIELD_SKIP_ZERO))
819 count[i].max_zero_fill= count[i].field_length;
820 /*
821 For every column initialize a tree, which is used to detect distinct
822 column values. 'int_tree' works together with 'tree_buff' and
823 'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
824 This is accomplished by '-1' as the element size.
825 */
826 init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
827 NULL);
828 if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
829 count[i].tree_pos=count[i].tree_buff =
830 my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
831 MYF(MY_WME));
832 }
833 }
834 return count;
835 }
836
837
838 /* Free memory used by counts and trees */
839
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)840 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
841 HUFF_COUNTS *huff_counts,
842 uint fields)
843 {
844 register uint i;
845
846 if (huff_trees)
847 {
848 for (i=0 ; i < trees ; i++)
849 {
850 if (huff_trees[i].element_buffer)
851 my_free(huff_trees[i].element_buffer);
852 if (huff_trees[i].code)
853 my_free(huff_trees[i].code);
854 }
855 my_free(huff_trees);
856 }
857 if (huff_counts)
858 {
859 for (i=0 ; i < fields ; i++)
860 {
861 if (huff_counts[i].tree_buff)
862 {
863 my_free(huff_counts[i].tree_buff);
864 delete_tree(&huff_counts[i].int_tree);
865 }
866 }
867 my_free(huff_counts);
868 }
869 delete_queue(&queue); /* This is safe to free */
870 return;
871 }
872
873 /* Read through old file and gather some statistics */
874
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)875 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
876 {
877 int error;
878 uint length;
879 ulong reclength,max_blob_length;
880 uchar *record,*pos,*next_pos,*end_pos,*start_pos;
881 ha_rows record_count;
882 my_bool static_row_size;
883 HUFF_COUNTS *count,*end_count;
884 TREE_ELEMENT *element;
885 DBUG_ENTER("get_statistic");
886
887 reclength=mrg->file[0]->s->base.reclength;
888 record=(uchar*) my_alloca(reclength);
889 end_count=huff_counts+mrg->file[0]->s->base.fields;
890 record_count=0; glob_crc=0;
891 max_blob_length=0;
892
893 /* Check how to calculate checksum */
894 static_row_size=1;
895 for (count=huff_counts ; count < end_count ; count++)
896 {
897 if (count->field_type == FIELD_BLOB ||
898 count->field_type == FIELD_VARCHAR)
899 {
900 static_row_size=0;
901 break;
902 }
903 }
904
905 mrg_reset(mrg);
906 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
907 {
908 ulong tot_blob_length=0;
909 if (! error)
910 {
911 /* glob_crc is a checksum over all bytes of all records. */
912 if (static_row_size)
913 glob_crc+=mi_static_checksum(mrg->file[0],record);
914 else
915 glob_crc+=mi_checksum(mrg->file[0],record);
916
917 /* Count the incidence of values separately for every column. */
918 for (pos=record,count=huff_counts ;
919 count < end_count ;
920 count++,
921 pos=next_pos)
922 {
923 next_pos=end_pos=(start_pos=pos)+count->field_length;
924
925 /*
926 Put the whole column value in a tree if there is room for it.
927 'int_tree' is used to quickly check for duplicate values.
928 'tree_buff' collects as many distinct column values as
929 possible. If the field length is > 1, it is tree_buff_length,
930 else 2 bytes. Each value is 'field_length' bytes big. If there
931 are more distinct column values than fit into the buffer, we
932 give up with this tree. BLOBs and VARCHARs do not have a
933 tree_buff as it can only be used with fixed length columns.
934 For the special case of field length == 1, we handle only the
935 case that there is only one distinct value in the table(s).
936 Otherwise, we can have a maximum of 256 distinct values. This
937 is then handled by the normal Huffman tree build.
938
939 Another limit for collecting distinct column values is the
940 number of values itself. Since we would need to build a
941 Huffman tree for the values, we are limited by the 'IS_OFFSET'
942 constant. This constant expresses a bit which is used to
943 determine if a tree element holds a final value or an offset
944 to a child element. Hence, all values and offsets need to be
945 smaller than 'IS_OFFSET'. A tree element is implemented with
946 two integer values, one for the left branch and one for the
947 right branch. For the extreme case that the first element
948 points to the last element, the number of integers in the tree
949 must be less or equal to IS_OFFSET. So the number of elements
950 must be less or equal to IS_OFFSET / 2.
951
952 WARNING: At first, we insert a pointer into the record buffer
953 as the key for the tree. If we got a new distinct value, which
954 is really inserted into the tree, instead of being counted
955 only, we will copy the column value from the record buffer to
956 'tree_buff' and adjust the key pointer of the tree accordingly.
957 */
958 if (count->tree_buff)
959 {
960 global_count=count;
961 if (!(element=tree_insert(&count->int_tree,pos, 0,
962 count->int_tree.custom_arg)) ||
963 (element->count == 1 &&
964 (count->tree_buff + tree_buff_length <
965 count->tree_pos + count->field_length)) ||
966 (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
967 (count->field_length == 1 &&
968 count->int_tree.elements_in_tree > 1))
969 {
970 delete_tree(&count->int_tree);
971 my_free(count->tree_buff);
972 count->tree_buff=0;
973 }
974 else
975 {
976 /*
977 If tree_insert() succeeds, it either creates a new element
978 or increments the counter of an existing element.
979 */
980 if (element->count == 1)
981 {
982 /* Copy the new column value into 'tree_buff'. */
983 memcpy(count->tree_pos,pos,(size_t) count->field_length);
984 /* Adjust the key pointer in the tree. */
985 tree_set_pointer(element,count->tree_pos);
986 /* Point behind the last column value so far. */
987 count->tree_pos+=count->field_length;
988 }
989 }
990 }
991
992 /* Save character counters and space-counts and zero-field-counts */
993 if (count->field_type == FIELD_NORMAL ||
994 count->field_type == FIELD_SKIP_ENDSPACE)
995 {
996 /* Ignore trailing space. */
997 for ( ; end_pos > pos ; end_pos--)
998 if (end_pos[-1] != ' ')
999 break;
1000 /* Empty fields are just counted. Go to the next record. */
1001 if (end_pos == pos)
1002 {
1003 count->empty_fields++;
1004 count->max_zero_fill=0;
1005 continue;
1006 }
1007 /*
1008 Count the total of all trailing spaces and the number of
1009 short trailing spaces. Remember the longest trailing space.
1010 */
1011 length= (uint) (next_pos-end_pos);
1012 count->tot_end_space+=length;
1013 if (length < 8)
1014 count->end_space[length]++;
1015 if (count->max_end_space < length)
1016 count->max_end_space = length;
1017 }
1018
1019 if (count->field_type == FIELD_NORMAL ||
1020 count->field_type == FIELD_SKIP_PRESPACE)
1021 {
1022 /* Ignore leading space. */
1023 for (pos=start_pos; pos < end_pos ; pos++)
1024 if (pos[0] != ' ')
1025 break;
1026 /* Empty fields are just counted. Go to the next record. */
1027 if (end_pos == pos)
1028 {
1029 count->empty_fields++;
1030 count->max_zero_fill=0;
1031 continue;
1032 }
1033 /*
1034 Count the total of all leading spaces and the number of
1035 short leading spaces. Remember the longest leading space.
1036 */
1037 length= (uint) (pos-start_pos);
1038 count->tot_pre_space+=length;
1039 if (length < 8)
1040 count->pre_space[length]++;
1041 if (count->max_pre_space < length)
1042 count->max_pre_space = length;
1043 }
1044
1045 /* Calculate pos, end_pos, and max_length for variable length fields. */
1046 if (count->field_type == FIELD_BLOB)
1047 {
1048 uint field_length=count->field_length -portable_sizeof_char_ptr;
1049 ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1050 memcpy(&pos, start_pos+field_length, sizeof(char*));
1051 end_pos=pos+blob_length;
1052 tot_blob_length+=blob_length;
1053 set_if_bigger(count->max_length,blob_length);
1054 }
1055 else if (count->field_type == FIELD_VARCHAR)
1056 {
1057 uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1058 length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1059 uint2korr(start_pos));
1060 pos= start_pos+pack_length;
1061 end_pos= pos+length;
1062 set_if_bigger(count->max_length,length);
1063 }
1064
1065 /* Evaluate 'max_zero_fill' for short fields. */
1066 if (count->field_length <= 8 &&
1067 (count->field_type == FIELD_NORMAL ||
1068 count->field_type == FIELD_SKIP_ZERO))
1069 {
1070 uint i;
1071 /* Zero fields are just counted. Go to the next record. */
1072 if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1073 {
1074 count->zero_fields++;
1075 continue;
1076 }
1077 /*
1078 max_zero_fill starts with field_length. It is decreased every
1079 time a shorter "zero trailer" is found. It is set to zero when
1080 an empty field is found (see above). This suggests that the
1081 variable should be called 'min_zero_fill'.
1082 */
1083 for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1084 i++) ;
1085 if (i < count->max_zero_fill)
1086 count->max_zero_fill=i;
1087 }
1088
1089 /* Ignore zero fields and check fields. */
1090 if (count->field_type == FIELD_ZERO ||
1091 count->field_type == FIELD_CHECK)
1092 continue;
1093
1094 /*
1095 Count the incidence of every byte value in the
1096 significant field value.
1097 */
1098 for ( ; pos < end_pos ; pos++)
1099 count->counts[(uchar) *pos]++;
1100
1101 /* Step to next field. */
1102 }
1103
1104 if (tot_blob_length > max_blob_length)
1105 max_blob_length=tot_blob_length;
1106 record_count++;
1107 if (write_loop && record_count % WRITE_COUNT == 0)
1108 {
1109 printf("%lu\r", (ulong) record_count);
1110 (void) fflush(stdout);
1111 }
1112 }
1113 else if (error != HA_ERR_RECORD_DELETED)
1114 {
1115 (void) fprintf(stderr, "Got error %d while reading rows", error);
1116 break;
1117 }
1118
1119 /* Step to next record. */
1120 }
1121 if (write_loop)
1122 {
1123 printf(" \r");
1124 (void) fflush(stdout);
1125 }
1126
1127 /*
1128 If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1129 codes.
1130 */
1131 DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1132
1133 DBUG_PRINT("info", ("Found the following number of incidents "
1134 "of the byte codes:"));
1135 if (verbose >= 2)
1136 printf("Found the following number of incidents "
1137 "of the byte codes:\n");
1138 for (count= huff_counts ; count < end_count; count++)
1139 {
1140 uint idx;
1141 my_off_t total_count;
1142 char llbuf[32];
1143
1144 DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1145 if (verbose >= 2)
1146 printf("column: %3u\n", (uint) (count - huff_counts + 1));
1147 if (count->tree_buff)
1148 {
1149 DBUG_PRINT("info", ("number of distinct values: %u",
1150 (uint) ((count->tree_pos - count->tree_buff) /
1151 count->field_length)));
1152 if (verbose >= 2)
1153 printf("number of distinct values: %u\n",
1154 (uint) ((count->tree_pos - count->tree_buff) /
1155 count->field_length));
1156 }
1157 total_count= 0;
1158 for (idx= 0; idx < 256; idx++)
1159 {
1160 if (count->counts[idx])
1161 {
1162 total_count+= count->counts[idx];
1163 DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1164 llstr((longlong) count->counts[idx], llbuf)));
1165 if (verbose >= 2)
1166 printf("counts[0x%02x]: %12s\n", idx,
1167 llstr((longlong) count->counts[idx], llbuf));
1168 }
1169 }
1170 DBUG_PRINT("info", ("total: %12s", llstr((longlong) total_count,
1171 llbuf)));
1172 if ((verbose >= 2) && total_count)
1173 {
1174 printf("total: %12s\n",
1175 llstr((longlong) total_count, llbuf));
1176 }
1177 }
1178
1179 mrg->records=record_count;
1180 mrg->max_blob_length=max_blob_length;
1181 my_afree((uchar*) record);
1182 DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1183 }
1184
compare_huff_elements(void * not_used MY_ATTRIBUTE ((unused)),uchar * a,uchar * b)1185 static int compare_huff_elements(void *not_used MY_ATTRIBUTE((unused)),
1186 uchar *a, uchar *b)
1187 {
1188 return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1189 (*((my_off_t*) a) == *((my_off_t*) b) ? 0 : 1);
1190 }
1191
1192 /* Check each tree if we should use pre-space-compress, end-space-
1193 compress, empty-field-compress or zero-field-compress */
1194
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1195 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1196 my_off_t records)
1197 {
1198 uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1199 my_off_t old_length,new_length,length;
1200 DBUG_ENTER("check_counts");
1201
1202 memset(field_count, 0, sizeof(field_count));
1203 space_fields=fill_zero_fields=0;
1204
1205 for (; trees-- ; huff_counts++)
1206 {
1207 if (huff_counts->field_type == FIELD_BLOB)
1208 {
1209 huff_counts->length_bits=max_bit(huff_counts->max_length);
1210 goto found_pack;
1211 }
1212 else if (huff_counts->field_type == FIELD_VARCHAR)
1213 {
1214 huff_counts->length_bits=max_bit(huff_counts->max_length);
1215 goto found_pack;
1216 }
1217 else if (huff_counts->field_type == FIELD_CHECK)
1218 {
1219 huff_counts->bytes_packed=0;
1220 huff_counts->counts[0]=0;
1221 goto found_pack;
1222 }
1223
1224 huff_counts->field_type=FIELD_NORMAL;
1225 huff_counts->pack_type=0;
1226
1227 /* Check for zero-filled records (in this column), or zero records. */
1228 if (huff_counts->zero_fields || ! records)
1229 {
1230 my_off_t old_space_count;
1231 /*
1232 If there are only zero filled records (in this column),
1233 or no records at all, we are done.
1234 */
1235 if (huff_counts->zero_fields == records)
1236 {
1237 huff_counts->field_type= FIELD_ZERO;
1238 huff_counts->bytes_packed=0;
1239 huff_counts->counts[0]=0;
1240 goto found_pack;
1241 }
1242 /* Remeber the number of significant spaces. */
1243 old_space_count=huff_counts->counts[' '];
1244 /* Add all leading and trailing spaces. */
1245 huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1246 huff_counts->tot_pre_space +
1247 huff_counts->empty_fields *
1248 huff_counts->field_length);
1249 /* Check, what the compressed length of this would be. */
1250 old_length=calc_packed_length(huff_counts,0)+records/8;
1251 /* Get the number of zero bytes. */
1252 length=huff_counts->zero_fields*huff_counts->field_length;
1253 /* Add it to the counts. */
1254 huff_counts->counts[0]+=length;
1255 /* Check, what the compressed length of this would be. */
1256 new_length=calc_packed_length(huff_counts,0);
1257 /* If the compression without the zeroes would be shorter, we are done. */
1258 if (old_length < new_length && huff_counts->field_length > 1)
1259 {
1260 huff_counts->field_type=FIELD_SKIP_ZERO;
1261 huff_counts->counts[0]-=length;
1262 huff_counts->bytes_packed=old_length- records/8;
1263 goto found_pack;
1264 }
1265 /* Remove the insignificant spaces, but keep the zeroes. */
1266 huff_counts->counts[' ']=old_space_count;
1267 }
1268 /* Check, what the compressed length of this column would be. */
1269 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1270
1271 /*
1272 If there are enough empty records (in this column),
1273 treating them specially may pay off.
1274 */
1275 if (huff_counts->empty_fields)
1276 {
1277 if (huff_counts->field_length > 2 &&
1278 huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1279 (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1280 huff_counts->max_end_space))) <
1281 records * max_bit(huff_counts->field_length))
1282 {
1283 huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1284 }
1285 else
1286 {
1287 length=huff_counts->empty_fields*huff_counts->field_length;
1288 if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1289 {
1290 huff_counts->tot_end_space+=length;
1291 huff_counts->max_end_space=huff_counts->field_length;
1292 if (huff_counts->field_length < 8)
1293 huff_counts->end_space[huff_counts->field_length]+=
1294 huff_counts->empty_fields;
1295 }
1296 if (huff_counts->tot_pre_space)
1297 {
1298 huff_counts->tot_pre_space+=length;
1299 huff_counts->max_pre_space=huff_counts->field_length;
1300 if (huff_counts->field_length < 8)
1301 huff_counts->pre_space[huff_counts->field_length]+=
1302 huff_counts->empty_fields;
1303 }
1304 }
1305 }
1306
1307 /*
1308 If there are enough trailing spaces (in this column),
1309 treating them specially may pay off.
1310 */
1311 if (huff_counts->tot_end_space)
1312 {
1313 huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1314 if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1315 huff_counts->end_space,
1316 huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1317 goto found_pack;
1318 huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1319 }
1320
1321 /*
1322 If there are enough leading spaces (in this column),
1323 treating them specially may pay off.
1324 */
1325 if (huff_counts->tot_pre_space)
1326 {
1327 if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1328 huff_counts->pre_space,
1329 huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1330 goto found_pack;
1331 }
1332
1333 found_pack: /* Found field-packing */
1334
1335 /* Test if we can use zero-fill */
1336
1337 if (huff_counts->max_zero_fill &&
1338 (huff_counts->field_type == FIELD_NORMAL ||
1339 huff_counts->field_type == FIELD_SKIP_ZERO))
1340 {
1341 huff_counts->counts[0]-=huff_counts->max_zero_fill*
1342 (huff_counts->field_type == FIELD_SKIP_ZERO ?
1343 records - huff_counts->zero_fields : records);
1344 huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1345 huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1346 }
1347
1348 /* Test if intervall-field is better */
1349
1350 if (huff_counts->tree_buff)
1351 {
1352 HUFF_TREE tree;
1353
1354 DBUG_EXECUTE_IF("forceintervall",
1355 huff_counts->bytes_packed= ~ (my_off_t) 0;);
1356 tree.element_buffer=0;
1357 if (!make_huff_tree(&tree,huff_counts) &&
1358 tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1359 {
1360 if (tree.elements == 1)
1361 huff_counts->field_type=FIELD_CONSTANT;
1362 else
1363 huff_counts->field_type=FIELD_INTERVALL;
1364 huff_counts->pack_type=0;
1365 }
1366 else
1367 {
1368 my_free(huff_counts->tree_buff);
1369 delete_tree(&huff_counts->int_tree);
1370 huff_counts->tree_buff=0;
1371 }
1372 if (tree.element_buffer)
1373 my_free(tree.element_buffer);
1374 }
1375 if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1376 space_fields++;
1377 if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1378 fill_zero_fields++;
1379 field_count[huff_counts->field_type]++;
1380 }
1381 DBUG_PRINT("info", ("normal: %3d empty-space: %3d "
1382 "empty-zero: %3d empty-fill: %3d",
1383 field_count[FIELD_NORMAL],space_fields,
1384 field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1385 DBUG_PRINT("info", ("pre-space: %3d end-space: %3d "
1386 "intervall-fields: %3d zero: %3d",
1387 field_count[FIELD_SKIP_PRESPACE],
1388 field_count[FIELD_SKIP_ENDSPACE],
1389 field_count[FIELD_INTERVALL],
1390 field_count[FIELD_ZERO]));
1391 if (verbose)
1392 printf("\nnormal: %3d empty-space: %3d "
1393 "empty-zero: %3d empty-fill: %3d\n"
1394 "pre-space: %3d end-space: %3d "
1395 "intervall-fields: %3d zero: %3d\n",
1396 field_count[FIELD_NORMAL],space_fields,
1397 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1398 field_count[FIELD_SKIP_PRESPACE],
1399 field_count[FIELD_SKIP_ENDSPACE],
1400 field_count[FIELD_INTERVALL],
1401 field_count[FIELD_ZERO]);
1402 DBUG_VOID_RETURN;
1403 }
1404
1405 /* Test if we can use space-compression and empty-field-compression */
1406
1407 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1408 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1409 uint max_space_length, my_off_t *space_counts,
1410 my_off_t tot_space_count, enum en_fieldtype field_type)
1411 {
1412 int min_pos;
1413 uint length_bits,i;
1414 my_off_t space_count,min_space_count,min_pack,new_length,skip;
1415
1416 length_bits=max_bit(max_space_length);
1417
1418 /* Default no end_space-packing */
1419 space_count=huff_counts->counts[(uint) ' '];
1420 min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1421 min_pack=calc_packed_length(huff_counts,0);
1422 min_pos= -2;
1423 huff_counts->counts[(uint) ' ']=space_count;
1424
1425 /* Test with allways space-count */
1426 new_length=huff_counts->bytes_packed+length_bits*records/8;
1427 if (new_length+1 < min_pack)
1428 {
1429 min_pos= -1;
1430 min_pack=new_length;
1431 min_space_count=space_count;
1432 }
1433 /* Test with length-flag */
1434 for (skip=0L, i=0 ; i < 8 ; i++)
1435 {
1436 if (space_counts[i])
1437 {
1438 if (i)
1439 huff_counts->counts[(uint) ' ']+=space_counts[i];
1440 skip+=huff_counts->pre_space[i];
1441 new_length=calc_packed_length(huff_counts,0)+
1442 (records+(records-skip)*(1+length_bits))/8;
1443 if (new_length < min_pack)
1444 {
1445 min_pos=(int) i;
1446 min_pack=new_length;
1447 min_space_count=huff_counts->counts[(uint) ' '];
1448 }
1449 }
1450 }
1451
1452 huff_counts->counts[(uint) ' ']=min_space_count;
1453 huff_counts->bytes_packed=min_pack;
1454 switch (min_pos) {
1455 case -2:
1456 return(0); /* No space-compress */
1457 case -1: /* Always space-count */
1458 huff_counts->field_type=field_type;
1459 huff_counts->min_space=0;
1460 huff_counts->length_bits=max_bit(max_space_length);
1461 break;
1462 default:
1463 huff_counts->field_type=field_type;
1464 huff_counts->min_space=(uint) min_pos;
1465 huff_counts->pack_type|=PACK_TYPE_SELECTED;
1466 huff_counts->length_bits=max_bit(max_space_length);
1467 break;
1468 }
1469 return(1); /* Using space-compress */
1470 }
1471
1472
1473 /* Make a huff_tree of each huff_count */
1474
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1475 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1476 {
1477 uint tree;
1478 HUFF_TREE *huff_tree;
1479 DBUG_ENTER("make_huff_trees");
1480
1481 if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1482 MYF(MY_WME | MY_ZEROFILL))))
1483 DBUG_RETURN(0);
1484
1485 for (tree=0 ; tree < trees ; tree++)
1486 {
1487 if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1488 {
1489 while (tree--)
1490 my_free(huff_tree[tree].element_buffer);
1491 my_free(huff_tree);
1492 DBUG_RETURN(0);
1493 }
1494 }
1495 DBUG_RETURN(huff_tree);
1496 }
1497
1498 /*
1499 Build a Huffman tree.
1500
1501 SYNOPSIS
1502 make_huff_tree()
1503 huff_tree The Huffman tree.
1504 huff_counts The counts.
1505
1506 DESCRIPTION
1507 Build a Huffman tree according to huff_counts->counts or
1508 huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1509 tree_buff_length of distinct column values. In that case, whole
1510 values can be Huffman encoded instead of single bytes.
1511
1512 RETURN
1513 0 OK
1514 != 0 Error
1515 */
1516
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1517 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1518 {
1519 uint i,found,bits_packed,first,last;
1520 my_off_t bytes_packed;
1521 HUFF_ELEMENT *a,*b,*new_huff_el;
1522
1523 first=last=0;
1524 if (huff_counts->tree_buff)
1525 {
1526 /* Calculate the number of distinct values in tree_buff. */
1527 found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1528 huff_counts->field_length;
1529 first=0; last=found-1;
1530 }
1531 else
1532 {
1533 /* Count the number of byte codes found in the column. */
1534 for (i=found=0 ; i < 256 ; i++)
1535 {
1536 if (huff_counts->counts[i])
1537 {
1538 if (! found++)
1539 first=i;
1540 last=i;
1541 }
1542 }
1543 if (found < 2)
1544 found=2;
1545 }
1546
1547 /* When using 'tree_buff' we can have more that 256 values. */
1548 if (queue.max_elements < found)
1549 {
1550 delete_queue(&queue);
1551 if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1552 return -1;
1553 }
1554
1555 /* Allocate or reallocate an element buffer for the Huffman tree. */
1556 if (!huff_tree->element_buffer)
1557 {
1558 if (!(huff_tree->element_buffer=
1559 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1560 return 1;
1561 }
1562 else
1563 {
1564 HUFF_ELEMENT *temp;
1565 if (!(temp=
1566 (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1567 found*2*sizeof(HUFF_ELEMENT),
1568 MYF(MY_WME))))
1569 return 1;
1570 huff_tree->element_buffer=temp;
1571 }
1572
1573 huff_counts->tree=huff_tree;
1574 huff_tree->counts=huff_counts;
1575 huff_tree->min_chr=first;
1576 huff_tree->max_chr=last;
1577 huff_tree->char_bits=max_bit(last-first);
1578 huff_tree->offset_bits=max_bit(found-1)+1;
1579
1580 if (huff_counts->tree_buff)
1581 {
1582 huff_tree->elements=0;
1583 huff_tree->tree_pack_length=(1+15+16+5+5+
1584 (huff_tree->char_bits+1)*found+
1585 (huff_tree->offset_bits+1)*
1586 (found-2)+7)/8 +
1587 (uint) (huff_tree->counts->tree_pos-
1588 huff_tree->counts->tree_buff);
1589 /*
1590 Put a HUFF_ELEMENT into the queue for every distinct column value.
1591
1592 tree_walk() calls save_counts_in_queue() for every element in
1593 'int_tree'. This takes elements from the target trees element
1594 buffer and places references to them into the buffer of the
1595 priority queue. We insert in column value order, but the order is
1596 in fact irrelevant here. We will establish the correct order
1597 later.
1598 */
1599 tree_walk(&huff_counts->int_tree,
1600 (int (*)(void*, element_count,void*)) save_counts_in_queue,
1601 (uchar*) huff_tree, left_root_right);
1602 }
1603 else
1604 {
1605 huff_tree->elements=found;
1606 huff_tree->tree_pack_length=(9+9+5+5+
1607 (huff_tree->char_bits+1)*found+
1608 (huff_tree->offset_bits+1)*
1609 (found-2)+7)/8;
1610 /*
1611 Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1612
1613 The elements are taken from the target trees element buffer.
1614 Instead of using queue_insert(), we just place references to the
1615 elements into the buffer of the priority queue. We insert in byte
1616 value order, but the order is in fact irrelevant here. We will
1617 establish the correct order later.
1618 */
1619 for (i=first, found=0 ; i <= last ; i++)
1620 {
1621 if (huff_counts->counts[i])
1622 {
1623 new_huff_el=huff_tree->element_buffer+(found++);
1624 new_huff_el->count=huff_counts->counts[i];
1625 new_huff_el->a.leaf.null=0;
1626 new_huff_el->a.leaf.element_nr=i;
1627 queue.root[found]=(uchar*) new_huff_el;
1628 }
1629 }
1630 /*
1631 If there is only a single byte value in this field in all records,
1632 add a second element with zero incidence. This is required to enter
1633 the loop, which builds the Huffman tree.
1634 */
1635 while (found < 2)
1636 {
1637 new_huff_el=huff_tree->element_buffer+(found++);
1638 new_huff_el->count=0;
1639 new_huff_el->a.leaf.null=0;
1640 if (last)
1641 new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1642 else
1643 new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1644 queue.root[found]=(uchar*) new_huff_el;
1645 }
1646 }
1647
1648 /* Make a queue from the queue buffer. */
1649 queue.elements=found;
1650
1651 /*
1652 Make a priority queue from the queue. Construct its index so that we
1653 have a partially ordered tree.
1654 */
1655 for (i=found/2 ; i > 0 ; i--)
1656 _downheap(&queue,i);
1657
1658 /* The Huffman algorithm. */
1659 bytes_packed=0; bits_packed=0;
1660 for (i=1 ; i < found ; i++)
1661 {
1662 /*
1663 Pop the top element from the queue (the one with the least incidence).
1664 Popping from a priority queue includes a re-ordering of the queue,
1665 to get the next least incidence element to the top.
1666 */
1667 a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1668 /*
1669 Copy the next least incidence element. The queue implementation
1670 reserves root[0] for temporary purposes. root[1] is the top.
1671 */
1672 b=(HUFF_ELEMENT*) queue.root[1];
1673 /* Get a new element from the element buffer. */
1674 new_huff_el=huff_tree->element_buffer+found+i;
1675 /* The new element gets the sum of the two least incidence elements. */
1676 new_huff_el->count=a->count+b->count;
1677 /*
1678 The Huffman algorithm assigns another bit to the code for a byte
1679 every time that bytes incidence is combined (directly or indirectly)
1680 to a new element as one of the two least incidence elements.
1681 This means that one more bit per incidence of that byte is required
1682 in the resulting file. So we add the new combined incidence as the
1683 number of bits by which the result grows.
1684 */
1685 bits_packed+=(uint) (new_huff_el->count & 7);
1686 bytes_packed+=new_huff_el->count/8;
1687 /* The new element points to its children, lesser in left. */
1688 new_huff_el->a.nod.left=a;
1689 new_huff_el->a.nod.right=b;
1690 /*
1691 Replace the copied top element by the new element and re-order the
1692 queue.
1693 */
1694 queue.root[1]=(uchar*) new_huff_el;
1695 queue_replaced(&queue);
1696 }
1697 huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1698 huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1699 return 0;
1700 }
1701
compare_tree(void * cmp_arg MY_ATTRIBUTE ((unused)),register const uchar * s,register const uchar * t)1702 static int compare_tree(void* cmp_arg MY_ATTRIBUTE((unused)),
1703 register const uchar *s, register const uchar *t)
1704 {
1705 uint length;
1706 for (length=global_count->field_length; length-- ;)
1707 if (*s++ != *t++)
1708 return (int) s[-1] - (int) t[-1];
1709 return 0;
1710 }
1711
1712 /*
1713 Organize distinct column values and their incidences into a priority queue.
1714
1715 SYNOPSIS
1716 save_counts_in_queue()
1717 key The column value.
1718 count The incidence of this value.
1719 tree The Huffman tree to be built later.
1720
1721 DESCRIPTION
1722 We use the element buffer of the targeted tree. The distinct column
1723 values are organized in a priority queue first. The Huffman
1724 algorithm will later organize the elements into a Huffman tree. For
1725 the time being, we just place references to the elements into the
1726 queue buffer. The buffer will later be organized into a priority
1727 queue.
1728
1729 RETURN
1730 0
1731 */
1732
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1733 static int save_counts_in_queue(uchar *key, element_count count,
1734 HUFF_TREE *tree)
1735 {
1736 HUFF_ELEMENT *new_huff_el;
1737
1738 new_huff_el=tree->element_buffer+(tree->elements++);
1739 new_huff_el->count=count;
1740 new_huff_el->a.leaf.null=0;
1741 new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1742 tree->counts->field_length;
1743 queue.root[tree->elements]=(uchar*) new_huff_el;
1744 return 0;
1745 }
1746
1747
1748 /*
1749 Calculate length of file if given counts should be used.
1750
1751 SYNOPSIS
1752 calc_packed_length()
1753 huff_counts The counts for a column of the table(s).
1754 add_tree_lenght If the decode tree length should be added.
1755
1756 DESCRIPTION
1757 We need to follow the Huffman algorithm until we know, how many bits
1758 are required for each byte code. But we do not need the resulting
1759 Huffman tree. Hence, we can leave out some steps which are essential
1760 in make_huff_tree().
1761
1762 RETURN
1763 Number of bytes required to compress this table column.
1764 */
1765
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1766 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1767 uint add_tree_lenght)
1768 {
1769 uint i,found,bits_packed,first,last;
1770 my_off_t bytes_packed;
1771 HUFF_ELEMENT element_buffer[256];
1772 DBUG_ENTER("calc_packed_length");
1773
1774 /*
1775 WARNING: We use a small hack for efficiency: Instead of placing
1776 references to HUFF_ELEMENTs into the queue, we just insert
1777 references to the counts of the byte codes which appeared in this
1778 table column. During the Huffman algorithm they are successively
1779 replaced by references to HUFF_ELEMENTs. This works, because
1780 HUFF_ELEMENTs have the incidence count at their beginning.
1781 Regardless, wether the queue array contains references to counts of
1782 type my_off_t or references to HUFF_ELEMENTs which have the count of
1783 type my_off_t at their beginning, it always points to a count of the
1784 same type.
1785
1786 Instead of using queue_insert(), we just copy the references into
1787 the buffer of the priority queue. We insert in byte value order, but
1788 the order is in fact irrelevant here. We will establish the correct
1789 order later.
1790 */
1791 first=last=0;
1792 for (i=found=0 ; i < 256 ; i++)
1793 {
1794 if (huff_counts->counts[i])
1795 {
1796 if (! found++)
1797 first=i;
1798 last=i;
1799 /* We start with root[1], which is the queues top element. */
1800 queue.root[found]=(uchar*) &huff_counts->counts[i];
1801 }
1802 }
1803 if (!found)
1804 DBUG_RETURN(0); /* Empty tree */
1805 /*
1806 If there is only a single byte value in this field in all records,
1807 add a second element with zero incidence. This is required to enter
1808 the loop, which follows the Huffman algorithm.
1809 */
1810 if (found < 2)
1811 queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1812
1813 /* Make a queue from the queue buffer. */
1814 queue.elements=found;
1815
1816 bytes_packed=0; bits_packed=0;
1817 /* Add the length of the coding table, which would become part of the file. */
1818 if (add_tree_lenght)
1819 bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1820 (max_bit(found-1)+1+1)*(found-2) +7)/8;
1821
1822 /*
1823 Make a priority queue from the queue. Construct its index so that we
1824 have a partially ordered tree.
1825 */
1826 for (i=(found+1)/2 ; i > 0 ; i--)
1827 _downheap(&queue,i);
1828
1829 /* The Huffman algorithm. */
1830 for (i=0 ; i < found-1 ; i++)
1831 {
1832 my_off_t *a;
1833 my_off_t *b;
1834 HUFF_ELEMENT *new_huff_el;
1835
1836 /*
1837 Pop the top element from the queue (the one with the least
1838 incidence). Popping from a priority queue includes a re-ordering
1839 of the queue, to get the next least incidence element to the top.
1840 */
1841 a= (my_off_t*) queue_remove(&queue, 0);
1842 /*
1843 Copy the next least incidence element. The queue implementation
1844 reserves root[0] for temporary purposes. root[1] is the top.
1845 */
1846 b= (my_off_t*) queue.root[1];
1847 /* Create a new element in a local (automatic) buffer. */
1848 new_huff_el= element_buffer + i;
1849 /* The new element gets the sum of the two least incidence elements. */
1850 new_huff_el->count= *a + *b;
1851 /*
1852 The Huffman algorithm assigns another bit to the code for a byte
1853 every time that bytes incidence is combined (directly or indirectly)
1854 to a new element as one of the two least incidence elements.
1855 This means that one more bit per incidence of that byte is required
1856 in the resulting file. So we add the new combined incidence as the
1857 number of bits by which the result grows.
1858 */
1859 bits_packed+=(uint) (new_huff_el->count & 7);
1860 bytes_packed+=new_huff_el->count/8;
1861 /*
1862 Replace the copied top element by the new element and re-order the
1863 queue. This successively replaces the references to counts by
1864 references to HUFF_ELEMENTs.
1865 */
1866 queue.root[1]=(uchar*) new_huff_el;
1867 queue_replaced(&queue);
1868 }
1869 DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1870 }
1871
1872
1873 /* Remove trees that don't give any compression */
1874
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1875 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1876 {
1877 uint k,tree_number;
1878 HUFF_COUNTS count,*i,*j,*last_count;
1879
1880 last_count=huff_counts+trees;
1881 for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1882 {
1883 if (!i->tree->tree_number)
1884 {
1885 i->tree->tree_number= ++tree_number;
1886 if (i->tree_buff)
1887 continue; /* Don't join intervall */
1888 for (j=i+1 ; j < last_count ; j++)
1889 {
1890 if (! j->tree->tree_number && ! j->tree_buff)
1891 {
1892 for (k=0 ; k < 256 ; k++)
1893 count.counts[k]=i->counts[k]+j->counts[k];
1894 if (calc_packed_length(&count,1) <=
1895 i->tree->bytes_packed + j->tree->bytes_packed+
1896 i->tree->tree_pack_length+j->tree->tree_pack_length+
1897 ALLOWED_JOIN_DIFF)
1898 {
1899 memcpy(i->counts, count.counts,
1900 sizeof(count.counts[0])*256);
1901 my_free(j->tree->element_buffer);
1902 j->tree->element_buffer=0;
1903 j->tree=i->tree;
1904 bmove((uchar*) i->counts,(uchar*) count.counts,
1905 sizeof(count.counts[0])*256);
1906 if (make_huff_tree(i->tree,i))
1907 return (uint) -1;
1908 }
1909 }
1910 }
1911 }
1912 }
1913 DBUG_PRINT("info", ("Original trees: %d After join: %d",
1914 trees, tree_number));
1915 if (verbose)
1916 printf("Original trees: %d After join: %d\n", trees, tree_number);
1917 return tree_number; /* Return trees left */
1918 }
1919
1920
1921 /*
1922 Fill in huff_tree encode tables.
1923
1924 SYNOPSIS
1925 make_huff_decode_table()
1926 huff_tree An array of HUFF_TREE which are to be encoded.
1927 trees The number of HUFF_TREE in the array.
1928
1929 RETURN
1930 0 success
1931 != 0 error
1932 */
1933
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1934 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1935 {
1936 uint elements;
1937 for ( ; trees-- ; huff_tree++)
1938 {
1939 if (huff_tree->tree_number > 0)
1940 {
1941 elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1942 if (!(huff_tree->code =
1943 (ulonglong*) my_malloc(elements*
1944 (sizeof(ulonglong) + sizeof(uchar)),
1945 MYF(MY_WME | MY_ZEROFILL))))
1946 return 1;
1947 huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1948 make_traverse_code_tree(huff_tree, huff_tree->root,
1949 8 * sizeof(ulonglong), LL(0));
1950 }
1951 }
1952 return 0;
1953 }
1954
1955
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)1956 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1957 HUFF_ELEMENT *element,
1958 uint size, ulonglong code)
1959 {
1960 uint chr;
1961 if (!element->a.leaf.null)
1962 {
1963 chr=element->a.leaf.element_nr;
1964 huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1965 huff_tree->code[chr]= (code >> size);
1966 if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1967 huff_tree->height= 8 * sizeof(ulonglong) - size;
1968 }
1969 else
1970 {
1971 size--;
1972 make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1973 make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1974 code + (((ulonglong) 1) << size));
1975 }
1976 return;
1977 }
1978
1979
1980 /*
1981 Convert a value into binary digits.
1982
1983 SYNOPSIS
1984 bindigits()
1985 value The value.
1986 length The number of low order bits to convert.
1987
1988 NOTE
1989 The result string is in static storage. It is reused on every call.
1990 So you cannot use it twice in one expression.
1991
1992 RETURN
1993 A pointer to a static NUL-terminated string.
1994 */
1995
bindigits(ulonglong value,uint bits)1996 static char *bindigits(ulonglong value, uint bits)
1997 {
1998 static char digits[72];
1999 char *ptr= digits;
2000 uint idx= bits;
2001
2002 DBUG_ASSERT(idx < sizeof(digits));
2003 while (idx)
2004 *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
2005 *ptr= '\0';
2006 return digits;
2007 }
2008
2009
2010 /*
2011 Convert a value into hexadecimal digits.
2012
2013 SYNOPSIS
2014 hexdigits()
2015 value The value.
2016
2017 NOTE
2018 The result string is in static storage. It is reused on every call.
2019 So you cannot use it twice in one expression.
2020
2021 RETURN
2022 A pointer to a static NUL-terminated string.
2023 */
2024
hexdigits(ulonglong value)2025 static char *hexdigits(ulonglong value)
2026 {
2027 static char digits[20];
2028 char *ptr= digits;
2029 uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2030
2031 DBUG_ASSERT(idx < sizeof(digits));
2032 while (idx)
2033 {
2034 if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2035 *(ptr - 1)+= 'a' - '9' - 1;
2036 }
2037 *ptr= '\0';
2038 return digits;
2039 }
2040
2041
2042 /* Write header to new packed data file */
2043
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2044 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2045 my_off_t tot_elements,my_off_t filelength)
2046 {
2047 uchar *buff= (uchar*) file_buffer.pos;
2048
2049 memset(buff, 0, HEAD_LENGTH);
2050 memcpy(buff,myisam_pack_file_magic,4);
2051 int4store(buff+4,head_length);
2052 int4store(buff+8, mrg->min_pack_length);
2053 int4store(buff+12,mrg->max_pack_length);
2054 int4store(buff+16,tot_elements);
2055 int4store(buff+20,intervall_length);
2056 int2store(buff+24,trees);
2057 buff[26]=(char) mrg->ref_length;
2058 /* Save record pointer length */
2059 buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2060 if (test_only)
2061 return 0;
2062 my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2063 return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2064 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2065 }
2066
2067 /* Write fieldinfo to new packed file */
2068
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2069 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2070 {
2071 reg1 uint i;
2072 uint huff_tree_bits;
2073 huff_tree_bits=max_bit(trees ? trees-1 : 0);
2074
2075 DBUG_PRINT("info", (" "));
2076 DBUG_PRINT("info", ("column types:"));
2077 DBUG_PRINT("info", ("FIELD_NORMAL 0"));
2078 DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE 1"));
2079 DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE 2"));
2080 DBUG_PRINT("info", ("FIELD_SKIP_ZERO 3"));
2081 DBUG_PRINT("info", ("FIELD_BLOB 4"));
2082 DBUG_PRINT("info", ("FIELD_CONSTANT 5"));
2083 DBUG_PRINT("info", ("FIELD_INTERVALL 6"));
2084 DBUG_PRINT("info", ("FIELD_ZERO 7"));
2085 DBUG_PRINT("info", ("FIELD_VARCHAR 8"));
2086 DBUG_PRINT("info", ("FIELD_CHECK 9"));
2087 DBUG_PRINT("info", (" "));
2088 DBUG_PRINT("info", ("pack type as a set of flags:"));
2089 DBUG_PRINT("info", ("PACK_TYPE_SELECTED 1"));
2090 DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS 2"));
2091 DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL 4"));
2092 DBUG_PRINT("info", (" "));
2093 if (verbose >= 2)
2094 {
2095 printf("\n");
2096 printf("column types:\n");
2097 printf("FIELD_NORMAL 0\n");
2098 printf("FIELD_SKIP_ENDSPACE 1\n");
2099 printf("FIELD_SKIP_PRESPACE 2\n");
2100 printf("FIELD_SKIP_ZERO 3\n");
2101 printf("FIELD_BLOB 4\n");
2102 printf("FIELD_CONSTANT 5\n");
2103 printf("FIELD_INTERVALL 6\n");
2104 printf("FIELD_ZERO 7\n");
2105 printf("FIELD_VARCHAR 8\n");
2106 printf("FIELD_CHECK 9\n");
2107 printf("\n");
2108 printf("pack type as a set of flags:\n");
2109 printf("PACK_TYPE_SELECTED 1\n");
2110 printf("PACK_TYPE_SPACE_FIELDS 2\n");
2111 printf("PACK_TYPE_ZERO_FILL 4\n");
2112 printf("\n");
2113 }
2114 for (i=0 ; i++ < fields ; counts++)
2115 {
2116 write_bits((ulonglong) (int) counts->field_type, 5);
2117 write_bits(counts->pack_type,6);
2118 if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2119 write_bits(counts->max_zero_fill,5);
2120 else
2121 write_bits(counts->length_bits,5);
2122 write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2123 DBUG_PRINT("info", ("column: %3u type: %2u pack: %2u zero: %4u "
2124 "lbits: %2u tree: %2u length: %4u",
2125 i , counts->field_type, counts->pack_type,
2126 counts->max_zero_fill, counts->length_bits,
2127 counts->tree->tree_number, counts->field_length));
2128 if (verbose >= 2)
2129 printf("column: %3u type: %2u pack: %2u zero: %4u lbits: %2u "
2130 "tree: %2u length: %4u\n", i , counts->field_type,
2131 counts->pack_type, counts->max_zero_fill, counts->length_bits,
2132 counts->tree->tree_number, counts->field_length);
2133 }
2134 flush_bits();
2135 return;
2136 }
2137
2138 /* Write all huff_trees to new datafile. Return tot count of
2139 elements in all trees
2140 Returns 0 on error */
2141
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2142 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2143 {
2144 uint i,int_length;
2145 uint tree_no;
2146 uint codes;
2147 uint errors= 0;
2148 uint *packed_tree,*offset,length;
2149 my_off_t elements;
2150
2151 /* Find the highest number of elements in the trees. */
2152 for (i=length=0 ; i < trees ; i++)
2153 if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2154 length=huff_tree[i].elements;
2155 /*
2156 Allocate a buffer for packing a decode tree. Two numbers per element
2157 (left child and right child).
2158 */
2159 if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2160 {
2161 my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_FATALERROR),
2162 sizeof(uint)*length*2);
2163 return 0;
2164 }
2165
2166 DBUG_PRINT("info", (" "));
2167 if (verbose >= 2)
2168 printf("\n");
2169 tree_no= 0;
2170 intervall_length=0;
2171 for (elements=0; trees-- ; huff_tree++)
2172 {
2173 /* Skip columns that have been joined with other columns. */
2174 if (huff_tree->tree_number == 0)
2175 continue; /* Deleted tree */
2176 tree_no++;
2177 DBUG_PRINT("info", (" "));
2178 if (verbose >= 3)
2179 printf("\n");
2180 /* Count the total number of elements (byte codes or column values). */
2181 elements+=huff_tree->elements;
2182 huff_tree->max_offset=2;
2183 /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2184 if (huff_tree->elements <= 1)
2185 offset=packed_tree;
2186 else
2187 offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2188
2189 /* This should be the same as 'length' above. */
2190 huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2191
2192 /*
2193 Since we check this during collecting the distinct column values,
2194 this should never happen.
2195 */
2196 if (huff_tree->max_offset >= IS_OFFSET)
2197 { /* This should be impossible */
2198 (void) fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2199 huff_tree->max_offset);
2200 my_afree((uchar*) packed_tree);
2201 return 0;
2202 }
2203
2204 DBUG_PRINT("info", ("pos: %lu elements: %u tree-elements: %lu "
2205 "char_bits: %u\n",
2206 (ulong) (file_buffer.pos - file_buffer.buffer),
2207 huff_tree->elements, (ulong) (offset - packed_tree),
2208 huff_tree->char_bits));
2209 if (!huff_tree->counts->tree_buff)
2210 {
2211 /* We do a byte compression on this column. Mark with bit 0. */
2212 write_bits(0,1);
2213 write_bits(huff_tree->min_chr,8);
2214 write_bits(huff_tree->elements,9);
2215 write_bits(huff_tree->char_bits,5);
2216 write_bits(huff_tree->offset_bits,5);
2217 int_length=0;
2218 }
2219 else
2220 {
2221 int_length=(uint) (huff_tree->counts->tree_pos -
2222 huff_tree->counts->tree_buff);
2223 /* We have distinct column values for this column. Mark with bit 1. */
2224 write_bits(1,1);
2225 write_bits(huff_tree->elements,15);
2226 write_bits(int_length,16);
2227 write_bits(huff_tree->char_bits,5);
2228 write_bits(huff_tree->offset_bits,5);
2229 intervall_length+=int_length;
2230 }
2231 DBUG_PRINT("info", ("tree: %2u elements: %4u char_bits: %2u "
2232 "offset_bits: %2u %s: %5u codelen: %2u",
2233 tree_no, huff_tree->elements, huff_tree->char_bits,
2234 huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2235 "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2236 int_length : huff_tree->min_chr, huff_tree->height));
2237 if (verbose >= 2)
2238 printf("tree: %2u elements: %4u char_bits: %2u offset_bits: %2u "
2239 "%s: %5u codelen: %2u\n", tree_no, huff_tree->elements,
2240 huff_tree->char_bits, huff_tree->offset_bits,
2241 huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2242 huff_tree->counts->tree_buff ? int_length :
2243 huff_tree->min_chr, huff_tree->height);
2244
2245 /* Check that the code tree length matches the element count. */
2246 length=(uint) (offset-packed_tree);
2247 if (length != huff_tree->elements*2-2)
2248 {
2249 (void) fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2250 length, huff_tree->elements * 2 - 2);
2251 errors++;
2252 break;
2253 }
2254
2255 for (i=0 ; i < length ; i++)
2256 {
2257 if (packed_tree[i] & IS_OFFSET)
2258 write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2259 huff_tree->offset_bits+1);
2260 else
2261 write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2262 DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2263 i, (packed_tree[i] & IS_OFFSET) ?
2264 " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2265 packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2266 if (verbose >= 3)
2267 printf("tree[0x%04x]: %s0x%04x\n",
2268 i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2269 (packed_tree[i] & IS_OFFSET) ?
2270 packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2271 }
2272 flush_bits();
2273
2274 /*
2275 Display coding tables and check their correctness.
2276 */
2277 codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2278 for (i= 0; i < codes; i++)
2279 {
2280 ulonglong code;
2281 uint bits;
2282 uint len;
2283 uint idx;
2284
2285 if (! (len= huff_tree->code_len[i]))
2286 continue;
2287 DBUG_PRINT("info", ("code[0x%04x]: 0x%s bits: %2u bin: %s", i,
2288 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2289 bindigits(huff_tree->code[i],
2290 huff_tree->code_len[i])));
2291 if (verbose >= 3)
2292 printf("code[0x%04x]: 0x%s bits: %2u bin: %s\n", i,
2293 hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2294 bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2295
2296 /* Check that the encode table decodes correctly. */
2297 code= 0;
2298 bits= 0;
2299 idx= 0;
2300 DBUG_EXECUTE_IF("forcechkerr1", len--;);
2301 DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2302 DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2303 for (;;)
2304 {
2305 if (! len)
2306 {
2307 (void) fflush(stdout);
2308 (void) fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2309 hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2310 errors++;
2311 break;
2312 }
2313 code<<= 1;
2314 code|= (huff_tree->code[i] >> (--len)) & 1;
2315 bits++;
2316 if (bits > 8 * sizeof(code))
2317 {
2318 (void) fflush(stdout);
2319 (void) fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2320 bits, (uint) (8 * sizeof(code)));
2321 errors++;
2322 break;
2323 }
2324 idx+= (uint) code & 1;
2325 if (idx >= length)
2326 {
2327 (void) fflush(stdout);
2328 (void) fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2329 idx, length);
2330 errors++;
2331 break;
2332 }
2333 if (packed_tree[idx] & IS_OFFSET)
2334 idx+= packed_tree[idx] & ~IS_OFFSET;
2335 else
2336 break; /* Hit a leaf. This contains the result value. */
2337 }
2338 if (errors)
2339 break;
2340
2341 DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2342 if (packed_tree[idx] != i)
2343 {
2344 (void) fflush(stdout);
2345 (void) fprintf(stderr, "error: decoded value 0x%04x should be: 0x%04x\n",
2346 packed_tree[idx], i);
2347 errors++;
2348 break;
2349 }
2350 } /*end for (codes)*/
2351 if (errors)
2352 break;
2353
2354 /* Write column values in case of distinct column value compression. */
2355 if (huff_tree->counts->tree_buff)
2356 {
2357 for (i=0 ; i < int_length ; i++)
2358 {
2359 write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2360 DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2361 i, (uchar) huff_tree->counts->tree_buff[i]));
2362 if (verbose >= 3)
2363 printf("column_values[0x%04x]: 0x%02x\n",
2364 i, (uchar) huff_tree->counts->tree_buff[i]);
2365 }
2366 }
2367 flush_bits();
2368 }
2369 DBUG_PRINT("info", (" "));
2370 if (verbose >= 2)
2371 printf("\n");
2372 my_afree((uchar*) packed_tree);
2373 if (errors)
2374 {
2375 (void) fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2376 return 0;
2377 }
2378 return elements;
2379 }
2380
2381
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2382 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2383 uint *offset)
2384 {
2385 uint *prev_offset;
2386
2387 prev_offset= offset;
2388 /*
2389 'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2390 then there is no left child and, hence no right child either. This
2391 is a property of a binary tree. An element is either a node with two
2392 childs, or a leaf without childs.
2393
2394 The current element is always a node with two childs. Go left first.
2395 */
2396 if (!element->a.nod.left->a.leaf.null)
2397 {
2398 /* Store the byte code or the index of the column value. */
2399 prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2400 offset+=2;
2401 }
2402 else
2403 {
2404 /*
2405 Recursively traverse the tree to the left. Mark it as an offset to
2406 another tree node (in contrast to a byte code or column value index).
2407 */
2408 prev_offset[0]= IS_OFFSET+2;
2409 offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2410 }
2411
2412 /* Now, check the right child. */
2413 if (!element->a.nod.right->a.leaf.null)
2414 {
2415 /* Store the byte code or the index of the column value. */
2416 prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2417 return offset;
2418 }
2419 else
2420 {
2421 /*
2422 Recursively traverse the tree to the right. Mark it as an offset to
2423 another tree node (in contrast to a byte code or column value index).
2424 */
2425 uint temp=(uint) (offset-prev_offset-1);
2426 prev_offset[1]= IS_OFFSET+ temp;
2427 if (huff_tree->max_offset < temp)
2428 huff_tree->max_offset = temp;
2429 return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2430 }
2431 }
2432
2433 /* Get number of bits neaded to represent value */
2434
max_bit(register uint value)2435 static uint max_bit(register uint value)
2436 {
2437 reg2 uint power=1;
2438
2439 while ((value>>=1))
2440 power++;
2441 return (power);
2442 }
2443
2444
compress_isam_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2445 static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2446 {
2447 int error;
2448 uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2449 intervall,field_length,max_pack_length,pack_blob_length;
2450 my_off_t record_count;
2451 char llbuf[32];
2452 ulong length,pack_length;
2453 uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2454 HUFF_COUNTS *count,*end_count;
2455 HUFF_TREE *tree;
2456 MI_INFO *isam_file=mrg->file[0];
2457 uint pack_version= (uint) isam_file->s->pack.version;
2458 DBUG_ENTER("compress_isam_file");
2459
2460 /* Allocate a buffer for the records (excluding blobs). */
2461 if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2462 return -1;
2463
2464 end_count=huff_counts+isam_file->s->base.fields;
2465 min_record_length= (uint) ~0;
2466 max_record_length=0;
2467
2468 /*
2469 Calculate the maximum number of bits required to pack the records.
2470 Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2471 The tree height determines the maximum number of bits per value.
2472 Some fields skip leading or trailing spaces or zeroes. The skipped
2473 number of bytes is encoded by 'length_bits' bits.
2474 Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2475 and varchar get a leading 0 bit.
2476 */
2477 for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2478 {
2479 if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2480 huff_counts[i].max_zero_fill=0;
2481 if (huff_counts[i].field_type == FIELD_CONSTANT ||
2482 huff_counts[i].field_type == FIELD_ZERO ||
2483 huff_counts[i].field_type == FIELD_CHECK)
2484 continue;
2485 if (huff_counts[i].field_type == FIELD_INTERVALL)
2486 max_calc_length+=huff_counts[i].tree->height;
2487 else if (huff_counts[i].field_type == FIELD_BLOB ||
2488 huff_counts[i].field_type == FIELD_VARCHAR)
2489 max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2490 else
2491 max_calc_length+=
2492 (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2493 huff_counts[i].tree->height+huff_counts[i].length_bits;
2494 }
2495 max_calc_length= (max_calc_length + 7) / 8;
2496 pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2497 record_count=0;
2498 /* 'max_blob_length' is the max length of all blobs of a record. */
2499 pack_blob_length= isam_file->s->base.blobs ?
2500 calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2501 max_pack_length=pack_ref_length+pack_blob_length;
2502
2503 DBUG_PRINT("fields", ("==="));
2504 mrg_reset(mrg);
2505 while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2506 {
2507 ulong tot_blob_length=0;
2508 if (! error)
2509 {
2510 if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2511 break;
2512 record_pos= (uchar*) file_buffer.pos;
2513 file_buffer.pos+=max_pack_length;
2514 for (start_pos=record, count= huff_counts; count < end_count ; count++)
2515 {
2516 end_pos=start_pos+(field_length=count->field_length);
2517 tree=count->tree;
2518
2519 DBUG_PRINT("fields", ("column: %3lu type: %2u pack: %2u zero: %4u "
2520 "lbits: %2u tree: %2u length: %4u",
2521 (ulong) (count - huff_counts + 1),
2522 count->field_type,
2523 count->pack_type, count->max_zero_fill,
2524 count->length_bits, count->tree->tree_number,
2525 count->field_length));
2526
2527 /* Check if the column contains spaces only. */
2528 if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2529 {
2530 for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2531 if (pos == end_pos)
2532 {
2533 DBUG_PRINT("fields",
2534 ("PACK_TYPE_SPACE_FIELDS spaces only, bits: 1"));
2535 DBUG_PRINT("fields", ("---"));
2536 write_bits(1,1);
2537 start_pos=end_pos;
2538 continue;
2539 }
2540 DBUG_PRINT("fields",
2541 ("PACK_TYPE_SPACE_FIELDS not only spaces, bits: 1"));
2542 write_bits(0,1);
2543 }
2544 end_pos-=count->max_zero_fill;
2545 field_length-=count->max_zero_fill;
2546
2547 switch (count->field_type) {
2548 case FIELD_SKIP_ZERO:
2549 if (!memcmp((uchar*) start_pos,zero_string,field_length))
2550 {
2551 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits: 1"));
2552 write_bits(1,1);
2553 start_pos=end_pos;
2554 break;
2555 }
2556 DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits: 1"));
2557 write_bits(0,1);
2558 /* Fall through */
2559 case FIELD_NORMAL:
2560 DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2561 (ulong) (end_pos - start_pos)));
2562 for ( ; start_pos < end_pos ; start_pos++)
2563 {
2564 DBUG_PRINT("fields",
2565 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2566 (uchar) *start_pos,
2567 hexdigits(tree->code[(uchar) *start_pos]),
2568 (uint) tree->code_len[(uchar) *start_pos],
2569 bindigits(tree->code[(uchar) *start_pos],
2570 (uint) tree->code_len[(uchar) *start_pos])));
2571 write_bits(tree->code[(uchar) *start_pos],
2572 (uint) tree->code_len[(uchar) *start_pos]);
2573 }
2574 break;
2575 case FIELD_SKIP_ENDSPACE:
2576 for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2577 length= (ulong) (end_pos - pos);
2578 if (count->pack_type & PACK_TYPE_SELECTED)
2579 {
2580 if (length > count->min_space)
2581 {
2582 DBUG_PRINT("fields",
2583 ("FIELD_SKIP_ENDSPACE more than min_space, bits: 1"));
2584 DBUG_PRINT("fields",
2585 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2586 length, field_length, count->length_bits));
2587 write_bits(1,1);
2588 write_bits(length,count->length_bits);
2589 }
2590 else
2591 {
2592 DBUG_PRINT("fields",
2593 ("FIELD_SKIP_ENDSPACE not more than min_space, "
2594 "bits: 1"));
2595 write_bits(0,1);
2596 pos=end_pos;
2597 }
2598 }
2599 else
2600 {
2601 DBUG_PRINT("fields",
2602 ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2603 length, field_length, count->length_bits));
2604 write_bits(length,count->length_bits);
2605 }
2606 /* Encode all significant bytes. */
2607 DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2608 (ulong) (pos - start_pos)));
2609 for ( ; start_pos < pos ; start_pos++)
2610 {
2611 DBUG_PRINT("fields",
2612 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2613 (uchar) *start_pos,
2614 hexdigits(tree->code[(uchar) *start_pos]),
2615 (uint) tree->code_len[(uchar) *start_pos],
2616 bindigits(tree->code[(uchar) *start_pos],
2617 (uint) tree->code_len[(uchar) *start_pos])));
2618 write_bits(tree->code[(uchar) *start_pos],
2619 (uint) tree->code_len[(uchar) *start_pos]);
2620 }
2621 start_pos=end_pos;
2622 break;
2623 case FIELD_SKIP_PRESPACE:
2624 for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2625 length= (ulong) (pos - start_pos);
2626 if (count->pack_type & PACK_TYPE_SELECTED)
2627 {
2628 if (length > count->min_space)
2629 {
2630 DBUG_PRINT("fields",
2631 ("FIELD_SKIP_PRESPACE more than min_space, bits: 1"));
2632 DBUG_PRINT("fields",
2633 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2634 length, field_length, count->length_bits));
2635 write_bits(1,1);
2636 write_bits(length,count->length_bits);
2637 }
2638 else
2639 {
2640 DBUG_PRINT("fields",
2641 ("FIELD_SKIP_PRESPACE not more than min_space, "
2642 "bits: 1"));
2643 pos=start_pos;
2644 write_bits(0,1);
2645 }
2646 }
2647 else
2648 {
2649 DBUG_PRINT("fields",
2650 ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2651 length, field_length, count->length_bits));
2652 write_bits(length,count->length_bits);
2653 }
2654 /* Encode all significant bytes. */
2655 DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2656 (ulong) (end_pos - start_pos)));
2657 for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2658 {
2659 DBUG_PRINT("fields",
2660 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2661 (uchar) *start_pos,
2662 hexdigits(tree->code[(uchar) *start_pos]),
2663 (uint) tree->code_len[(uchar) *start_pos],
2664 bindigits(tree->code[(uchar) *start_pos],
2665 (uint) tree->code_len[(uchar) *start_pos])));
2666 write_bits(tree->code[(uchar) *start_pos],
2667 (uint) tree->code_len[(uchar) *start_pos]);
2668 }
2669 break;
2670 case FIELD_CONSTANT:
2671 case FIELD_ZERO:
2672 case FIELD_CHECK:
2673 DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2674 start_pos=end_pos;
2675 break;
2676 case FIELD_INTERVALL:
2677 global_count=count;
2678 pos=(uchar*) tree_search(&count->int_tree, start_pos,
2679 count->int_tree.custom_arg);
2680 intervall=(uint) (pos - count->tree_buff)/field_length;
2681 DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2682 DBUG_PRINT("fields", ("index: %4u code: 0x%s bits: %2u",
2683 intervall, hexdigits(tree->code[intervall]),
2684 (uint) tree->code_len[intervall]));
2685 write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2686 start_pos=end_pos;
2687 break;
2688 case FIELD_BLOB:
2689 {
2690 ulong blob_length=_mi_calc_blob_length(field_length-
2691 portable_sizeof_char_ptr,
2692 start_pos);
2693 /* Empty blobs are encoded with a single 1 bit. */
2694 if (!blob_length)
2695 {
2696 DBUG_PRINT("fields", ("FIELD_BLOB empty, bits: 1"));
2697 write_bits(1,1);
2698 }
2699 else
2700 {
2701 uchar *blob,*blob_end;
2702 DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits: 1"));
2703 write_bits(0,1);
2704 /* Write the blob length. */
2705 DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2706 blob_length, count->length_bits));
2707 write_bits(blob_length,count->length_bits);
2708 memcpy(&blob, end_pos-portable_sizeof_char_ptr, sizeof(char*));
2709 blob_end=blob+blob_length;
2710 /* Encode the blob bytes. */
2711 for ( ; blob < blob_end ; blob++)
2712 {
2713 DBUG_PRINT("fields",
2714 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2715 (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2716 (uint) tree->code_len[(uchar) *blob],
2717 bindigits(tree->code[(uchar) *start_pos],
2718 (uint)tree->code_len[(uchar) *start_pos])));
2719 write_bits(tree->code[(uchar) *blob],
2720 (uint) tree->code_len[(uchar) *blob]);
2721 }
2722 tot_blob_length+=blob_length;
2723 }
2724 start_pos= end_pos;
2725 break;
2726 }
2727 case FIELD_VARCHAR:
2728 {
2729 uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2730 ulong col_length= (var_pack_length == 1 ?
2731 (uint) *(uchar*) start_pos :
2732 uint2korr(start_pos));
2733 /* Empty varchar are encoded with a single 1 bit. */
2734 if (!col_length)
2735 {
2736 DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits: 1"));
2737 write_bits(1,1); /* Empty varchar */
2738 }
2739 else
2740 {
2741 uchar *end= start_pos + var_pack_length + col_length;
2742 DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits: 1"));
2743 write_bits(0,1);
2744 /* Write the varchar length. */
2745 DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2746 col_length, count->length_bits));
2747 write_bits(col_length,count->length_bits);
2748 /* Encode the varchar bytes. */
2749 for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2750 {
2751 DBUG_PRINT("fields",
2752 ("value: 0x%02x code: 0x%s bits: %2u bin: %s",
2753 (uchar) *start_pos,
2754 hexdigits(tree->code[(uchar) *start_pos]),
2755 (uint) tree->code_len[(uchar) *start_pos],
2756 bindigits(tree->code[(uchar) *start_pos],
2757 (uint)tree->code_len[(uchar) *start_pos])));
2758 write_bits(tree->code[(uchar) *start_pos],
2759 (uint) tree->code_len[(uchar) *start_pos]);
2760 }
2761 }
2762 start_pos= end_pos;
2763 break;
2764 }
2765 case FIELD_LAST:
2766 case FIELD_enum_val_count:
2767 abort(); /* Impossible */
2768 }
2769 start_pos+=count->max_zero_fill;
2770 DBUG_PRINT("fields", ("---"));
2771 }
2772 flush_bits();
2773 length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2774 pack_length= save_pack_length(pack_version, record_pos, length);
2775 if (pack_blob_length)
2776 pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2777 tot_blob_length);
2778 DBUG_PRINT("fields", ("record: %lu length: %lu blob-length: %lu "
2779 "length-bytes: %lu", (ulong) record_count, length,
2780 tot_blob_length, pack_length));
2781 DBUG_PRINT("fields", ("==="));
2782
2783 /* Correct file buffer if the header was smaller */
2784 if (pack_length != max_pack_length)
2785 {
2786 bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2787 file_buffer.pos-= (max_pack_length-pack_length);
2788 }
2789 if (length < (ulong) min_record_length)
2790 min_record_length=(uint) length;
2791 if (length > (ulong) max_record_length)
2792 max_record_length=(uint) length;
2793 record_count++;
2794 if (write_loop && record_count % WRITE_COUNT == 0)
2795 {
2796 printf("%lu\r", (ulong) record_count);
2797 (void) fflush(stdout);
2798 }
2799 }
2800 else if (error != HA_ERR_RECORD_DELETED)
2801 break;
2802 }
2803 if (error == HA_ERR_END_OF_FILE)
2804 error=0;
2805 else
2806 {
2807 (void) fprintf(stderr, "%s: Got error %d reading records\n",
2808 my_progname, error);
2809 }
2810 if (verbose >= 2)
2811 printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2812
2813 my_afree((uchar*) record);
2814 mrg->ref_length=max_pack_length;
2815 mrg->min_pack_length=max_record_length ? min_record_length : 0;
2816 mrg->max_pack_length=max_record_length;
2817 DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2818 }
2819
2820
make_new_name(char * new_name,char * old_name)2821 static char *make_new_name(char *new_name, char *old_name)
2822 {
2823 return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2824 }
2825
make_old_name(char * new_name,char * old_name)2826 static char *make_old_name(char *new_name, char *old_name)
2827 {
2828 return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2829 }
2830
2831 /* rutines for bit writing buffer */
2832
init_file_buffer(File file,pbool read_buffer)2833 static void init_file_buffer(File file, pbool read_buffer)
2834 {
2835 file_buffer.file=file;
2836 file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2837 MYF(MY_WME));
2838 file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2839 file_buffer.pos_in_file=0;
2840 error_on_write=0;
2841 if (read_buffer)
2842 {
2843
2844 file_buffer.pos=file_buffer.end;
2845 file_buffer.bits=0;
2846 }
2847 else
2848 {
2849 file_buffer.pos=file_buffer.buffer;
2850 file_buffer.bits=BITS_SAVED;
2851 }
2852 file_buffer.bitbucket= 0;
2853 }
2854
2855
flush_buffer(ulong neaded_length)2856 static int flush_buffer(ulong neaded_length)
2857 {
2858 ulong length;
2859
2860 /*
2861 file_buffer.end is 8 bytes lower than the real end of the buffer.
2862 This is done so that the end-of-buffer condition does not need to be
2863 checked for every byte (see write_bits()). Consequently,
2864 file_buffer.pos can become greater than file_buffer.end. The
2865 algorithms in the other functions ensure that there will never be
2866 more than 8 bytes written to the buffer without an end-of-buffer
2867 check. So the buffer cannot be overrun. But we need to check for the
2868 near-to-buffer-end condition to avoid a negative result, which is
2869 casted to unsigned and thus becomes giant.
2870 */
2871 if ((file_buffer.pos < file_buffer.end) &&
2872 ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2873 return 0;
2874 length=(ulong) (file_buffer.pos-file_buffer.buffer);
2875 file_buffer.pos=file_buffer.buffer;
2876 file_buffer.pos_in_file+=length;
2877 if (test_only)
2878 return 0;
2879 if (error_on_write|| my_write(file_buffer.file,
2880 (const uchar*) file_buffer.buffer,
2881 length,
2882 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2883 {
2884 error_on_write=1;
2885 return 1;
2886 }
2887
2888 if (neaded_length != ~(ulong) 0 &&
2889 (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2890 {
2891 char *tmp;
2892 neaded_length+=256; /* some margin */
2893 tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2894 if (!tmp)
2895 return 1;
2896 file_buffer.pos= ((uchar*) tmp +
2897 (ulong) (file_buffer.pos - file_buffer.buffer));
2898 file_buffer.buffer= (uchar*) tmp;
2899 file_buffer.end= (uchar*) (tmp+neaded_length-8);
2900 }
2901 return 0;
2902 }
2903
2904
end_file_buffer(void)2905 static void end_file_buffer(void)
2906 {
2907 my_free(file_buffer.buffer);
2908 }
2909
2910 /* output `bits` low bits of `value' */
2911
write_bits(register ulonglong value,register uint bits)2912 static void write_bits(register ulonglong value, register uint bits)
2913 {
2914 DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2915 (bits == 8 * sizeof(value)));
2916
2917 if ((file_buffer.bits-= (int) bits) >= 0)
2918 {
2919 file_buffer.bitbucket|= value << file_buffer.bits;
2920 }
2921 else
2922 {
2923 reg3 ulonglong bit_buffer;
2924 bits= (uint) -file_buffer.bits;
2925 bit_buffer= (file_buffer.bitbucket |
2926 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2927 #if BITS_SAVED == 64
2928 *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2929 *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2930 *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2931 *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2932 #endif
2933 *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2934 *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2935 *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2936 *file_buffer.pos++= (uchar) (bit_buffer);
2937
2938 if (bits != 8 * sizeof(value))
2939 value&= (((ulonglong) 1) << bits) - 1;
2940 if (file_buffer.pos >= file_buffer.end)
2941 (void) flush_buffer(~ (ulong) 0);
2942 file_buffer.bits=(int) (BITS_SAVED - bits);
2943 file_buffer.bitbucket= value << (BITS_SAVED - bits);
2944 }
2945 return;
2946 }
2947
2948 /* Flush bits in bit_buffer to buffer */
2949
flush_bits(void)2950 static void flush_bits(void)
2951 {
2952 int bits;
2953 ulonglong bit_buffer;
2954
2955 bits= file_buffer.bits & ~7;
2956 bit_buffer= file_buffer.bitbucket >> bits;
2957 bits= BITS_SAVED - bits;
2958 while (bits > 0)
2959 {
2960 bits-= 8;
2961 *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2962 }
2963 if (file_buffer.pos >= file_buffer.end)
2964 (void) flush_buffer(~ (ulong) 0);
2965 file_buffer.bits= BITS_SAVED;
2966 file_buffer.bitbucket= 0;
2967 }
2968
2969
2970 /****************************************************************************
2971 ** functions to handle the joined files
2972 ****************************************************************************/
2973
save_state(MI_INFO * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)2974 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2975 ha_checksum crc)
2976 {
2977 MYISAM_SHARE *share=isam_file->s;
2978 uint options=mi_uint2korr(share->state.header.options);
2979 uint key;
2980 DBUG_ENTER("save_state");
2981
2982 options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2983 mi_int2store(share->state.header.options,options);
2984
2985 share->state.state.data_file_length=new_length;
2986 share->state.state.del=0;
2987 share->state.state.empty=0;
2988 share->state.dellink= HA_OFFSET_ERROR;
2989 share->state.split=(ha_rows) mrg->records;
2990 share->state.version=(ulong) time((time_t*) 0);
2991 if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2992 {
2993 /*
2994 Some indexes are disabled, cannot use current key_file_length value
2995 as an estimate of upper bound of index file size. Use packed data file
2996 size instead.
2997 */
2998 share->state.state.key_file_length= new_length;
2999 }
3000 /*
3001 If there are no disabled indexes, keep key_file_length value from
3002 original file so "myisamchk -rq" can use this value (this is necessary
3003 because index size cannot be easily calculated for fulltext keys)
3004 */
3005 mi_clear_all_keys_active(share->state.key_map);
3006 for (key=0 ; key < share->base.keys ; key++)
3007 share->state.key_root[key]= HA_OFFSET_ERROR;
3008 for (key=0 ; key < share->state.header.max_block_size_index ; key++)
3009 share->state.key_del[key]= HA_OFFSET_ERROR;
3010 isam_file->state->checksum=crc; /* Save crc here */
3011 share->changed=1; /* Force write of header */
3012 share->state.open_count=0;
3013 share->global_changed=0;
3014 (void) my_chsize(share->kfile, share->base.keystart, 0, MYF(0));
3015 if (share->base.keys)
3016 isamchk_neaded=1;
3017 DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
3018 }
3019
3020
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3021 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3022 ha_checksum crc)
3023 {
3024 MI_STATE_INFO state;
3025 MI_INFO *isam_file=mrg->file[0];
3026 uint options;
3027 DBUG_ENTER("save_state_mrg");
3028
3029 state= isam_file->s->state;
3030 options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3031 HA_OPTION_READ_ONLY_DATA);
3032 mi_int2store(state.header.options,options);
3033 state.state.data_file_length=new_length;
3034 state.state.del=0;
3035 state.state.empty=0;
3036 state.state.records=state.split=(ha_rows) mrg->records;
3037 /* See comment above in save_state about key_file_length handling. */
3038 if (mrg->src_file_has_indexes_disabled)
3039 {
3040 isam_file->s->state.state.key_file_length=
3041 MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3042 }
3043 state.dellink= HA_OFFSET_ERROR;
3044 state.version=(ulong) time((time_t*) 0);
3045 mi_clear_all_keys_active(state.key_map);
3046 state.state.checksum=crc;
3047 if (isam_file->s->base.keys)
3048 isamchk_neaded=1;
3049 state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3050 DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3051 }
3052
3053
3054 /* reset for mrg_rrnd */
3055
mrg_reset(PACK_MRG_INFO * mrg)3056 static void mrg_reset(PACK_MRG_INFO *mrg)
3057 {
3058 if (mrg->current)
3059 {
3060 mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3061 mrg->current=0;
3062 }
3063 }
3064
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3065 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3066 {
3067 int error;
3068 MI_INFO *isam_info;
3069 my_off_t filepos;
3070
3071 if (!info->current)
3072 {
3073 isam_info= *(info->current=info->file);
3074 info->end=info->current+info->count;
3075 mi_reset(isam_info);
3076 mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3077 filepos=isam_info->s->pack.header_length;
3078 }
3079 else
3080 {
3081 isam_info= *info->current;
3082 filepos= isam_info->nextpos;
3083 }
3084
3085 for (;;)
3086 {
3087 isam_info->update&= HA_STATE_CHANGED;
3088 if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3089 filepos, 1)) ||
3090 error != HA_ERR_END_OF_FILE)
3091 return (error);
3092 mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3093 if (info->current+1 == info->end)
3094 return(HA_ERR_END_OF_FILE);
3095 info->current++;
3096 isam_info= *info->current;
3097 filepos=isam_info->s->pack.header_length;
3098 mi_reset(isam_info);
3099 mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3100 }
3101 }
3102
3103
mrg_close(PACK_MRG_INFO * mrg)3104 static int mrg_close(PACK_MRG_INFO *mrg)
3105 {
3106 uint i;
3107 int error=0;
3108 for (i=0 ; i < mrg->count ; i++)
3109 error|=mi_close(mrg->file[i]);
3110 if (mrg->free_file)
3111 my_free(mrg->file);
3112 return error;
3113 }
3114
3115
3116 #if !defined(DBUG_OFF)
3117 /*
3118 Fake the counts to get big Huffman codes.
3119
3120 SYNOPSIS
3121 fakebigcodes()
3122 huff_counts A pointer to the counts array.
3123 end_count A pointer past the counts array.
3124
3125 DESCRIPTION
3126
3127 Huffman coding works by removing the two least frequent values from
3128 the list of values and add a new value with the sum of their
3129 incidences in a loop until only one value is left. Every time a
3130 value is reused for a new value, it gets one more bit for its
3131 encoding. Hence, the least frequent values get the longest codes.
3132
3133 To get a maximum code length for a value, two of the values must
3134 have an incidence of 1. As their sum is 2, the next infrequent value
3135 must have at least an incidence of 2, then 4, 8, 16 and so on. This
3136 means that one needs 2**n bytes (values) for a code length of n
3137 bits. However, using more distinct values forces the use of longer
3138 codes, or reaching the code length with less total bytes (values).
3139
3140 To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3141 I assign counts of 1 to the two most frequent values, a count of 2
3142 for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3143 the remaining values get 1. That way every possible byte has an
3144 assigned code, though not all codes are used if not all byte values
3145 are present in the column.
3146
3147 This strategy would work with distinct column values too, but
3148 requires that at least 64(32) values are present. To make things
3149 easier here, I cancel all distinct column values and force byte
3150 compression for all columns.
3151
3152 RETURN
3153 void
3154 */
3155
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3156 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3157 {
3158 HUFF_COUNTS *count;
3159 my_off_t *cur_count_p;
3160 my_off_t *end_count_p;
3161 my_off_t **cur_sort_p;
3162 my_off_t **end_sort_p;
3163 my_off_t *sort_counts[256];
3164 my_off_t total;
3165 DBUG_ENTER("fakebigcodes");
3166
3167 for (count= huff_counts; count < end_count; count++)
3168 {
3169 /*
3170 Remove distinct column values.
3171 */
3172 if (huff_counts->tree_buff)
3173 {
3174 my_free(huff_counts->tree_buff);
3175 delete_tree(&huff_counts->int_tree);
3176 huff_counts->tree_buff= NULL;
3177 DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3178 }
3179
3180 /*
3181 Sort counts by decreasing incidence.
3182 */
3183 cur_count_p= count->counts;
3184 end_count_p= cur_count_p + 256;
3185 cur_sort_p= sort_counts;
3186 while (cur_count_p < end_count_p)
3187 *(cur_sort_p++)= cur_count_p++;
3188 (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3189
3190 /*
3191 Assign faked counts.
3192 */
3193 cur_sort_p= sort_counts;
3194 #if SIZEOF_LONG_LONG > 4
3195 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3196 #else
3197 end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3198 #endif
3199 /* Most frequent value gets a faked count of 1. */
3200 **(cur_sort_p++)= 1;
3201 total= 1;
3202 while (cur_sort_p < end_sort_p)
3203 {
3204 **(cur_sort_p++)= total;
3205 total<<= 1;
3206 }
3207 /* Set the last value. */
3208 **(cur_sort_p++)= --total;
3209 /*
3210 Set the remaining counts.
3211 */
3212 end_sort_p= sort_counts + 256;
3213 while (cur_sort_p < end_sort_p)
3214 **(cur_sort_p++)= 1;
3215 }
3216 DBUG_VOID_RETURN;
3217 }
3218
3219
3220 /*
3221 Compare two counts for reverse sorting.
3222
3223 SYNOPSIS
3224 fakecmp()
3225 count1 One count.
3226 count2 Another count.
3227
3228 RETURN
3229 1 count1 < count2
3230 0 count1 == count2
3231 -1 count1 > count2
3232 */
3233
fakecmp(my_off_t ** count1,my_off_t ** count2)3234 static int fakecmp(my_off_t **count1, my_off_t **count2)
3235 {
3236 return ((**count1 < **count2) ? 1 :
3237 (**count1 > **count2) ? -1 : 0);
3238 }
3239 #endif
3240
3241 #include "mi_extrafunc.h"
3242