1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /* Pack MARIA file */
17 
18 #ifndef USE_MY_FUNC
19 #define USE_MY_FUNC			/* We need at least my_malloc */
20 #endif
21 
22 #include "maria_def.h"
23 #include "trnman_public.h"
24 #include "trnman.h"
25 #include <queues.h>
26 #include <my_tree.h>
27 #include "mysys_err.h"
28 #ifdef MSDOS
29 #include <io.h>
30 #endif
31 #ifndef __GNU_LIBRARY__
32 #define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
33 #endif
34 #include <my_getopt.h>
35 #include <my_handler_errors.h>
36 
37 #if SIZEOF_LONG_LONG > 4
38 #define BITS_SAVED 64
39 #else
40 #define BITS_SAVED 32
41 #endif
42 #ifndef MAX_INTERNAL_TRID
43 #define MAX_INTERNAL_TRID  0xffffffffffffLL
44 #endif
45 
46 #define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
47 #define HEAD_LENGTH	32
48 #define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
49 
50 #define DATA_TMP_EXT		".TMD"
51 #define OLD_EXT			".OLD"
52 #define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
53 
54 struct st_file_buffer {
55   File file;
56   uchar *buffer,*pos,*end;
57   my_off_t pos_in_file;
58   int bits;
59   ulonglong bitbucket;
60 };
61 
62 struct st_huff_tree;
63 struct st_huff_element;
64 
65 typedef struct st_huff_counts {
66   uint	field_length,max_zero_fill;
67   uint	pack_type;
68   uint	max_end_space,max_pre_space,length_bits,min_space;
69   ulong max_length;
70   enum en_fieldtype field_type;
71   struct st_huff_tree *tree;		/* Tree for field */
72   my_off_t counts[256];
73   my_off_t end_space[8];
74   my_off_t pre_space[8];
75   my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
76   TREE int_tree;        /* Tree for detecting distinct column values. */
77   uchar *tree_buff;      /* Column values, 'field_length' each. */
78   uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
79 } HUFF_COUNTS;
80 
81 typedef struct st_huff_element HUFF_ELEMENT;
82 
83 /*
84   WARNING: It is crucial for the optimizations in calc_packed_length()
85   that 'count' is the first element of 'HUFF_ELEMENT'.
86 */
87 struct st_huff_element {
88   my_off_t count;
89   union un_element {
90     struct st_nod {
91       HUFF_ELEMENT *left,*right;
92     } nod;
93     struct st_leaf {
94       HUFF_ELEMENT *null;
95       uint	element_nr;		/* Number of element */
96     } leaf;
97   } a;
98 };
99 
100 
101 typedef struct st_huff_tree {
102   HUFF_ELEMENT *root,*element_buffer;
103   HUFF_COUNTS *counts;
104   uint tree_number;
105   uint elements;
106   my_off_t bytes_packed;
107   uint tree_pack_length;
108   uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
109   ulonglong *code;
110   uchar *code_len;
111 } HUFF_TREE;
112 
113 
114 typedef struct st_isam_mrg {
115   MARIA_HA **file,**current,**end;
116   uint free_file;
117   uint count;
118   uint	min_pack_length;		/* Theese is used by packed data */
119   uint	max_pack_length;
120   uint	ref_length;
121   uint	max_blob_length;
122   my_off_t records;
123   /* true if at least one source file has at least one disabled index */
124   my_bool src_file_has_indexes_disabled;
125 } PACK_MRG_INFO;
126 
127 
128 extern int main(int argc,char * *argv);
129 static void get_options(int *argc,char ***argv);
130 static MARIA_HA *open_maria_file(char *name,int mode);
131 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
132 static int compress(PACK_MRG_INFO *file,char *join_name);
133 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
134 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
135 					   uint trees,
136 					   HUFF_COUNTS *huff_counts,
137 					   uint fields);
138 static int compare_tree(void* cmp_arg __attribute__((unused)),
139 			const uchar *s,const uchar *t);
140 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
141 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
142 			 my_off_t records);
143 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
144 			       uint max_space_length,my_off_t *space_counts,
145 			       my_off_t tot_space_count,
146 			       enum en_fieldtype field_type);
147 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
148 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
149 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
150 static int save_counts_in_queue(uchar *key,element_count count,
151 				    HUFF_TREE *tree);
152 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
153 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
154 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
155 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
156 				    HUFF_ELEMENT *element,uint size,
157 				    ulonglong code);
158 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
159 			my_off_t tot_elements,my_off_t filelength);
160 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
161 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
162 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
163 				       HUFF_ELEMENT *element,
164 				       uint *offset);
165 static uint max_bit(uint value);
166 static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
167 static char *make_new_name(char *new_name,char *old_name);
168 static char *make_old_name(char *new_name,char *old_name);
169 static void init_file_buffer(File file,pbool read_buffer);
170 static int flush_buffer(ulong neaded_length);
171 static void end_file_buffer(void);
172 static void write_bits(ulonglong value, uint bits);
173 static void flush_bits(void);
174 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
175                       my_off_t new_length, ha_checksum crc);
176 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
177                           my_off_t new_length, ha_checksum crc);
178 static int mrg_close(PACK_MRG_INFO *mrg);
179 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
180 static void mrg_reset(PACK_MRG_INFO *mrg);
181 #if !defined(DBUG_OFF)
182 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
183 static int fakecmp(my_off_t **count1, my_off_t **count2);
184 #endif
185 
186 /*
187   tree_buff_length is somewhat arbitrary. The bigger it is the better
188   the chance to win in terms of compression factor. On the other hand,
189   this table becomes part of the compressed file header. And its length
190   is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191 */
192 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193 
194 static int error_on_write=0,test_only=0,verbose=0,silent=0,
195 	   write_loop=0,force_pack=0, isamchk_neaded=0;
196 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
197 static my_bool backup, opt_wait;
198 static my_bool opt_ignore_control_file, opt_require_control_file;
199 static char tmp_dir[FN_REFLEN]={0},*join_table;
200 static my_off_t intervall_length;
201 static ha_checksum glob_crc;
202 static struct st_file_buffer file_buffer;
203 static QUEUE queue;
204 static HUFF_COUNTS *global_count;
205 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
206 static const char *load_default_groups[]= { "ariapack",0 };
207 
208 /*
209   Register handler error messages for usage with my_error()
210 
211   NOTES
212     This is safe to call multiple times as my_error_register()
213     will ignore calls to register already registered error numbers.
214 */
215 
get_handler_error_messages(int e)216 static const char **get_handler_error_messages(int e __attribute__((unused)))
217 {
218   return handler_error_messages;
219 }
220 
221 
222 /* The main program */
223 
main(int argc,char ** argv)224 int main(int argc, char **argv)
225 {
226   int error,ok;
227   PACK_MRG_INFO merge;
228   char **default_argv;
229   my_bool no_control_file= 0;
230   MY_INIT(argv[0]);
231 
232   maria_data_root= (char *)".";
233   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
234   default_argv= argv;
235   get_options(&argc,&argv);
236   my_error_register(get_handler_error_messages, HA_ERR_FIRST,
237                     HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
238 
239   if (!opt_ignore_control_file &&
240       (no_control_file= ma_control_file_open(FALSE,
241                                              (opt_require_control_file ||
242                                               !silent))) &&
243        opt_require_control_file)
244   {
245     error= 1;
246     goto end;
247   }
248   maria_init();
249   if (no_control_file || force_pack)
250   {
251     /* Assume that all rows exists */
252     trnman_init(MAX_INTERNAL_TRID-16);
253   }
254 
255   error=ok=isamchk_neaded=0;
256   if (join_table)
257   {						/* Join files into one */
258     if (open_maria_files(&merge,argv,(uint) argc) ||
259 	compress(&merge,join_table))
260       error=1;
261   }
262   else while (argc--)
263   {
264     MARIA_HA *isam_file;
265     if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
266       error=1;
267     else
268     {
269       merge.file= &isam_file;
270       merge.current=0;
271       merge.free_file=0;
272       merge.count=1;
273       if (compress(&merge,0))
274 	error=1;
275       else
276 	ok=1;
277     }
278   }
279   if (ok && isamchk_neaded && !silent)
280     puts("Remember to run aria_chk -rq on compressed tables");
281 
282 end:
283   fflush(stdout);
284   fflush(stderr);
285   free_defaults(default_argv);
286   my_error_unregister(HA_ERR_FIRST,
287                       HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
288   maria_end();
289   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
290   exit(error ? 2 : 0);
291 #ifndef _lint
292   return 0;					/* No compiler warning */
293 #endif
294 }
295 
296 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
297 
298 static struct my_option my_long_options[] =
299 {
300 #ifdef __NETWARE__
301   {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
302    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
303 #endif
304   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
305    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
306   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
307    "Directory where character sets are.", (char**) &charsets_dir,
308    (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
309   {"datadir", 'h',
310    "Path for control file (and logs if --logdir not used).",
311    &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
312    0, 0, 0, 0, 0, 0},
313   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
314    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
315   {"force", 'f',
316    "Force packing of table even if it gets bigger or if tempfile exists.",
317    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
318   { "ignore-control-file", 0,
319     "Ignore the control file",
320     (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG,
321     0, 0, 0, 0, 0, 0},
322   {"join", 'j',
323    "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
324    &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
325    0, 0, 0},
326   {"help", '?', "Display this help and exit.",
327    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
328   { "require-control-file", 0,
329     "Abort if cannot find control file",
330     (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
331     0, 0, 0, 0, 0, 0},
332   {"silent", 's', "Be more silent.",
333    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
334   {"tmpdir", 'T', "Use temporary directory to store temporary table.",
335    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
336   {"test", 't', "Don't pack table, only test packing it.",
337    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
338   {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
339    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
340   {"version", 'V', "Output version information and exit.",
341    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
342   {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
343    &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
344   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
345 };
346 
347 
print_version(void)348 static void print_version(void)
349 {
350   printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
351 }
352 
353 
usage(void)354 static void usage(void)
355 {
356   print_version();
357   puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
358   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
359   puts("and you are welcome to modify and redistribute it under the GPL license\n");
360 
361   puts("Pack a Aria-table to take much less space.");
362   puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
363   puts("afterwards to update the keys.");
364   puts("You should give the .MAI file as the filename argument.");
365   puts("To unpack a packed table, run aria_chk -u on the table");
366 
367   printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
368   my_print_help(my_long_options);
369   print_defaults("my", load_default_groups);
370   my_print_variables(my_long_options);
371 }
372 
373 
374 static my_bool
get_one_option(int optid,const struct my_option * opt,char * argument)375 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
376 	       char *argument)
377 {
378   uint length;
379 
380   switch(optid) {
381 #ifdef __NETWARE__
382   case OPT_AUTO_CLOSE:
383     setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
384     break;
385 #endif
386   case 'f':
387     force_pack= 1;
388     tmpfile_createflag= O_RDWR | O_TRUNC;
389     break;
390   case 's':
391     write_loop= verbose= 0;
392     silent= 1;
393     break;
394   case 't':
395     test_only= 1;
396     /* Avoid to reset 'verbose' if it was already set > 1. */
397     if (! verbose)
398       verbose= 1;
399     break;
400   case 'T':
401     length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
402     if (length != dirname_length(tmp_dir))
403     {
404       tmp_dir[length]=FN_LIBCHAR;
405       tmp_dir[length+1]=0;
406     }
407     break;
408   case 'v':
409     verbose++; /* Allow for selecting the level of verbosity. */
410     silent= 0;
411     break;
412   case '#':
413     DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
414     break;
415   case 'V':
416     print_version();
417     exit(0);
418   case 'I':
419   case '?':
420     usage();
421     exit(0);
422   }
423   return 0;
424 }
425 
426 	/* reads options */
427 	/* Initiates DEBUG - but no debugging here ! */
428 
get_options(int * argc,char *** argv)429 static void get_options(int *argc,char ***argv)
430 {
431   int ho_error;
432 
433   my_progname= argv[0][0];
434   if (isatty(fileno(stdout)))
435     write_loop=1;
436 
437   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
438     exit(ho_error);
439 
440   if (!*argc)
441   {
442     usage();
443     exit(1);
444   }
445   if (join_table)
446   {
447     backup=0;					/* Not needed */
448     tmp_dir[0]=0;
449   }
450   return;
451 }
452 
453 
print_error(int error,const char * filename)454 static void print_error(int error, const char *filename)
455 {
456   switch (error) {
457   case HA_ERR_CRASHED:
458     fprintf(stderr, "'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
459     break;
460   case HA_ERR_NOT_A_TABLE:
461     fprintf(stderr, "'%s' is not a Aria table",filename);
462     break;
463   case HA_ERR_CRASHED_ON_USAGE:
464     fprintf(stderr, "'%s' is marked as crashed",filename);
465     break;
466   case HA_ERR_CRASHED_ON_REPAIR:
467     fprintf(stderr, "'%s' is marked as crashed after last repair",filename);
468     break;
469   case HA_ERR_OLD_FILE:
470     fprintf(stderr, "'%s' has transactions newer than registered in control file. If this is ok, please re-run with --ignore-control-file", filename);
471     break;
472   case HA_ERR_NEW_FILE:
473     fprintf(stderr, "'%s' uses new features not supported by this version of the Aria library", filename);
474     break;
475   case HA_ERR_END_OF_FILE:
476     fprintf(stderr, "Couldn't read complete header from '%s'", filename);
477     break;
478   case EAGAIN:
479     fprintf(stderr, "'%s' is locked. Use -w to wait until unlocked",filename);
480     break;
481   case ENOENT:
482     fprintf(stderr, "File '%s' doesn't exist",filename);
483     break;
484   case EACCES:
485     fprintf(stderr, "You don't have permission to use '%s'", filename);
486     break;
487   default:
488     fprintf(stderr, "%d when opening Aria table '%s'", error, filename);
489     break;
490   }
491   fputc('\n',stderr);
492 }
493 
494 
open_maria_file(char * name,int mode)495 static MARIA_HA *open_maria_file(char *name,int mode)
496 {
497   MARIA_HA *isam_file;
498   MARIA_SHARE *share;
499   DBUG_ENTER("open_maria_file");
500 
501   if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
502                              (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
503                               HA_OPEN_ABORT_IF_LOCKED))))
504   {
505     print_error(my_errno, name);
506     DBUG_RETURN(0);
507   }
508   share=isam_file->s;
509   if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
510   {
511     if (!force_pack)
512     {
513       fprintf(stderr, "%s is already compressed\n", name);
514       maria_close(isam_file);
515       DBUG_RETURN(0);
516     }
517     if (verbose)
518       puts("Recompressing already compressed table");
519     share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
520   }
521   if (! force_pack && share->state.state.records != 0 &&
522       (share->state.state.records <= 1 ||
523        share->state.state.data_file_length < 1024))
524   {
525     fprintf(stderr, "%s is too small to compress\n", name);
526     maria_close(isam_file);
527     DBUG_RETURN(0);
528   }
529   maria_lock_database(isam_file,F_WRLCK);
530   maria_ignore_trids(isam_file);
531   DBUG_RETURN(isam_file);
532 }
533 
534 
open_maria_files(PACK_MRG_INFO * mrg,char ** names,uint count)535 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
536 {
537   uint i,j;
538   mrg->count=0;
539   mrg->current=0;
540   mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
541   mrg->free_file=1;
542   mrg->src_file_has_indexes_disabled= 0;
543   for (i=0; i < count ; i++)
544   {
545     if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
546       goto error;
547 
548     mrg->src_file_has_indexes_disabled|=
549       ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
550                               mrg->file[i]->s->base.keys);
551   }
552   /* Check that files are identical */
553   for (j=0 ; j < count-1 ; j++)
554   {
555     MARIA_COLUMNDEF *m1,*m2,*end;
556     if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
557 	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
558       goto diff_file;
559     m1=mrg->file[j]->s->columndef;
560     end=m1+mrg->file[j]->s->base.fields;
561     m2=mrg->file[j+1]->s->columndef;
562     for ( ; m1 != end ; m1++,m2++)
563     {
564       if (m1->type != m2->type || m1->length != m2->length)
565 	goto diff_file;
566     }
567   }
568   mrg->count=count;
569   return 0;
570 
571  diff_file:
572   fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
573                my_progname, names[j], names[j+1]);
574  error:
575   while (i--)
576     maria_close(mrg->file[i]);
577   my_free(mrg->file);
578   return 1;
579 }
580 
581 
compress(PACK_MRG_INFO * mrg,char * result_table)582 static int compress(PACK_MRG_INFO *mrg,char *result_table)
583 {
584   int error;
585   File new_file,join_maria_file;
586   MARIA_HA *isam_file;
587   MARIA_SHARE *share;
588   char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
589   uint i,header_length,fields,trees,used_trees;
590   my_off_t old_length,new_length,tot_elements;
591   HUFF_COUNTS *huff_counts;
592   HUFF_TREE *huff_trees;
593   DBUG_ENTER("compress");
594 
595   isam_file=mrg->file[0];			/* Take this as an example */
596   share=isam_file->s;
597   new_file=join_maria_file= -1;
598   trees=fields=0;
599   huff_trees=0;
600   huff_counts=0;
601   maria_block_size= isam_file->s->block_size;
602 
603   /* Create temporary or join file */
604   if (backup)
605     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
606   else
607     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);
608 
609   if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
610                      maria_block_size, 0, MY_WME) == 0)
611   {
612     fprintf(stderr, "Can't initialize page cache\n");
613     goto err;
614   }
615 
616   if (!test_only && result_table)
617   {
618     /* Make a new indexfile based on first file in list */
619     uint length;
620     uchar *buff;
621     strmov(org_name,result_table);		/* Fix error messages */
622     fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
623     if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
624 	< 0)
625       goto err;
626     length=(uint) share->base.keystart;
627     if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
628       goto err;
629     if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
630 	my_write(join_maria_file,buff,length,
631 		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
632     {
633       my_free(buff);
634       goto err;
635     }
636     my_free(buff);
637     fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
638   }
639   else if (!tmp_dir[0])
640     make_new_name(new_name,org_name);
641   else
642     fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
643   if (!test_only &&
644       (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
645     goto err;
646 
647   /* Start calculating statistics */
648 
649   mrg->records=0;
650   for (i=0 ; i < mrg->count ; i++)
651     mrg->records+=mrg->file[i]->s->state.state.records;
652 
653   DBUG_PRINT("info", ("Compressing %s: (%lu records)",
654                       result_table ? new_name : org_name,
655                       (ulong) mrg->records));
656   if (write_loop || verbose)
657   {
658     printf("Compressing %s: (%lu records)\n",
659                 result_table ? new_name : org_name, (ulong) mrg->records);
660   }
661   trees=fields=share->base.fields;
662   huff_counts=init_huff_count(isam_file,mrg->records);
663 
664   /*
665     Read the whole data file(s) for statistics.
666   */
667   DBUG_PRINT("info", ("- Calculating statistics"));
668   if (write_loop || verbose)
669     printf("- Calculating statistics\n");
670   if (get_statistic(mrg,huff_counts))
671     goto err;
672 
673   old_length=0;
674   for (i=0; i < mrg->count ; i++)
675     old_length+= (mrg->file[i]->s->state.state.data_file_length -
676 		  mrg->file[i]->s->state.state.empty);
677 
678   /*
679     Create a global priority queue in preparation for making
680     temporary Huffman trees.
681   */
682   if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
683     goto err;
684 
685   /*
686     Check each column if we should use pre-space-compress, end-space-
687     compress, empty-field-compress or zero-field-compress.
688   */
689   check_counts(huff_counts,fields,mrg->records);
690 
691   /*
692     Build a Huffman tree for each column.
693   */
694   huff_trees=make_huff_trees(huff_counts,trees);
695 
696   /*
697     If the packed lengths of combined columns is less then the sum of
698     the non-combined columns, then create common Huffman trees for them.
699     We do this only for uchar compressed columns, not for distinct values
700     compressed columns.
701   */
702   if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
703     goto err;
704 
705   /*
706     Assign codes to all uchar or column values.
707   */
708   if (make_huff_decode_table(huff_trees,fields))
709     goto err;
710 
711   /* Prepare a file buffer. */
712   init_file_buffer(new_file,0);
713 
714   /*
715     Reserve space in the target file for the fixed compressed file header.
716   */
717   file_buffer.pos_in_file=HEAD_LENGTH;
718   if (! test_only)
719     my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
720 
721   /*
722     Write field infos: field type, pack type, length bits, tree number.
723   */
724   write_field_info(huff_counts,fields,used_trees);
725 
726   /*
727     Write decode trees.
728   */
729   if (!(tot_elements=write_huff_tree(huff_trees,trees)))
730     goto err;
731 
732   /*
733     Calculate the total length of the compression info header.
734     This includes the fixed compressed file header, the column compression
735     type descriptions, and the decode trees.
736   */
737   header_length=(uint) file_buffer.pos_in_file+
738     (uint) (file_buffer.pos-file_buffer.buffer);
739 
740   /*
741     Compress the source file into the target file.
742   */
743   DBUG_PRINT("info", ("- Compressing file"));
744   if (write_loop || verbose)
745     printf("- Compressing file\n");
746   error=compress_maria_file(mrg,huff_counts);
747   new_length=file_buffer.pos_in_file;
748   if (!error && !test_only)
749   {
750     uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
751     bzero(buff,sizeof(buff));
752     error=my_write(file_buffer.file,buff,sizeof(buff),
753 		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
754   }
755 
756   /*
757     Write the fixed compressed file header.
758   */
759   if (!error)
760     error=write_header(mrg,header_length,used_trees,tot_elements,
761 		       new_length);
762 
763   /* Flush the file buffer. */
764   end_file_buffer();
765 
766   /* Display statistics. */
767   DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
768                       "Mean total length: %6ld",
769                       mrg->min_pack_length, mrg->max_pack_length,
770                       (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
771   if (verbose && mrg->records)
772     printf("Min record length: %6d   Max length: %6d   "
773                 "Mean total length: %6ld\n", mrg->min_pack_length,
774                 mrg->max_pack_length, (ulong) (new_length/mrg->records));
775 
776   /* Close source and target file. */
777   if (!test_only)
778   {
779     error|=my_close(new_file,MYF(MY_WME));
780     if (!result_table)
781     {
782       (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
783                                     FLUSH_RELEASE);
784       error|=my_close(isam_file->dfile.file, MYF(MY_WME));
785       isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
786       isam_file->s->bitmap.file.file= -1;
787     }
788   }
789 
790   /* Cleanup. */
791   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
792   if (! test_only && ! error)
793   {
794     if (result_table)
795     {
796       error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
797     }
798     else
799     {
800       if (backup)
801       {
802 	if (my_rename(org_name,make_old_name(temp_name,
803                                              isam_file->s->open_file_name.str),
804 		      MYF(MY_WME)))
805 	  error=1;
806 	else
807 	{
808 	  if (tmp_dir[0])
809 	    error=my_copy(new_name,org_name,MYF(MY_WME));
810 	  else
811 	    error=my_rename(new_name,org_name,MYF(MY_WME));
812 	  if (!error)
813           {
814 	    my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
815             if (tmp_dir[0])
816               my_delete(new_name,MYF(MY_WME));
817           }
818 	}
819       }
820       else
821       {
822 	if (tmp_dir[0])
823         {
824 	  error=my_copy(new_name,org_name,
825 			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
826           if (!error)
827             my_delete(new_name,MYF(MY_WME));
828         }
829 	else
830 	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
831       }
832       if (! error)
833 	error=save_state(isam_file,mrg,new_length,glob_crc);
834     }
835   }
836   error|=mrg_close(mrg);
837   if (join_maria_file >= 0)
838     error|=my_close(join_maria_file,MYF(MY_WME));
839   if (error)
840   {
841     fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
842     my_delete(new_name,MYF(MY_WME));
843     DBUG_RETURN(-1);
844   }
845   if (write_loop || verbose)
846   {
847     if (old_length)
848       printf("%.4g%%     \n",
849                   (((longlong) (old_length - new_length)) * 100.0 /
850                    (longlong) old_length));
851     else
852       puts("Empty file saved in compressed format");
853   }
854   DBUG_RETURN(0);
855 
856  err:
857   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
858   if (new_file >= 0)
859     my_close(new_file,MYF(0));
860   if (join_maria_file >= 0)
861     my_close(join_maria_file,MYF(0));
862   mrg_close(mrg);
863   end_pagecache(maria_pagecache, 1);
864   fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
865   DBUG_RETURN(-1);
866 }
867 
868 	/* Init a huff_count-struct for each field and init it */
869 
init_huff_count(MARIA_HA * info,my_off_t records)870 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
871 {
872   reg2 uint i;
873   reg1 HUFF_COUNTS *count;
874   if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
875 					sizeof(HUFF_COUNTS),
876 					MYF(MY_ZEROFILL | MY_WME))))
877   {
878     for (i=0 ; i < info->s->base.fields ; i++)
879     {
880       enum en_fieldtype type;
881       uint col_nr = info->s->columndef[i].column_nr;
882       count[col_nr].field_length=info->s->columndef[i].length;
883       type= count[col_nr].field_type=
884         (enum en_fieldtype) info->s->columndef[i].type;
885       if (type == FIELD_INTERVALL ||
886 	  type == FIELD_CONSTANT ||
887 	  type == FIELD_ZERO)
888 	type = FIELD_NORMAL;
889       if (count[col_nr].field_length <= 8 &&
890 	  (type == FIELD_NORMAL ||
891 	   type == FIELD_SKIP_ZERO))
892 	count[col_nr].max_zero_fill= count[col_nr].field_length;
893       /*
894         For every column initialize a tree, which is used to detect distinct
895         column values. 'int_tree' works together with 'tree_buff' and
896         'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
897         This is accomplished by '-1' as the element size.
898       */
899       init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
900 		NULL, MYF(0));
901       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
902 	count[col_nr].tree_pos=count[col_nr].tree_buff =
903 	  my_malloc(count[col_nr].field_length > 1 ? tree_buff_length : 2,
904 		    MYF(MY_WME));
905     }
906   }
907   return count;
908 }
909 
910 
911 	/* Free memory used by counts and trees */
912 
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)913 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
914 					   HUFF_COUNTS *huff_counts,
915 					   uint fields)
916 {
917   register uint i;
918 
919   if (huff_trees)
920   {
921     for (i=0 ; i < trees ; i++)
922     {
923       if (huff_trees[i].element_buffer)
924 	my_free(huff_trees[i].element_buffer);
925       if (huff_trees[i].code)
926 	my_free(huff_trees[i].code);
927     }
928     my_free(huff_trees);
929   }
930   if (huff_counts)
931   {
932     for (i=0 ; i < fields ; i++)
933     {
934       if (huff_counts[i].tree_buff)
935       {
936 	my_free(huff_counts[i].tree_buff);
937 	delete_tree(&huff_counts[i].int_tree, 0);
938       }
939     }
940     my_free(huff_counts);
941   }
942   delete_queue(&queue);		/* This is safe to free */
943   return;
944 }
945 
946 	/* Read through old file and gather some statistics */
947 
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)948 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
949 {
950   int error;
951   uint length, null_bytes;
952   ulong reclength,max_blob_length;
953   uchar *record,*pos,*next_pos,*end_pos,*start_pos;
954   ha_rows record_count;
955   HUFF_COUNTS *count,*end_count;
956   TREE_ELEMENT *element;
957   ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
958   DBUG_ENTER("get_statistic");
959 
960   reclength=  mrg->file[0]->s->base.reclength;
961   null_bytes= mrg->file[0]->s->base.null_bytes;
962   record=(uchar*) my_safe_alloca(reclength);
963   end_count=huff_counts+mrg->file[0]->s->base.fields;
964   record_count=0; glob_crc=0;
965   max_blob_length=0;
966 
967   /* Check how to calculate checksum */
968   if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
969     calc_checksum= _ma_static_checksum;
970   else
971     calc_checksum= _ma_checksum;
972 
973   mrg_reset(mrg);
974   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
975   {
976     ulong tot_blob_length=0;
977     if (! error)
978     {
979       /* glob_crc is a checksum over all bytes of all records. */
980       glob_crc+= (*calc_checksum)(mrg->file[0],record);
981 
982       /* Count the incidence of values separately for every column. */
983       for (pos=record + null_bytes, count=huff_counts ;
984 	   count < end_count ;
985 	   count++,
986 	   pos=next_pos)
987       {
988 	next_pos=end_pos=(start_pos=pos)+count->field_length;
989 
990 	/*
991           Put the whole column value in a tree if there is room for it.
992           'int_tree' is used to quickly check for duplicate values.
993           'tree_buff' collects as many distinct column values as
994           possible. If the field length is > 1, it is tree_buff_length,
995           else 2 bytes. Each value is 'field_length' bytes big. If there
996           are more distinct column values than fit into the buffer, we
997           give up with this tree. BLOBs and VARCHARs do not have a
998           tree_buff as it can only be used with fixed length columns.
999           For the special case of field length == 1, we handle only the
1000           case that there is only one distinct value in the table(s).
1001           Otherwise, we can have a maximum of 256 distinct values. This
1002           is then handled by the normal Huffman tree build.
1003 
1004           Another limit for collecting distinct column values is the
1005           number of values itself. Since we would need to build a
1006           Huffman tree for the values, we are limited by the 'IS_OFFSET'
1007           constant. This constant expresses a bit which is used to
1008           determine if a tree element holds a final value or an offset
1009           to a child element. Hence, all values and offsets need to be
1010           smaller than 'IS_OFFSET'. A tree element is implemented with
1011           two integer values, one for the left branch and one for the
1012           right branch. For the extreme case that the first element
1013           points to the last element, the number of integers in the tree
1014           must be less or equal to IS_OFFSET. So the number of elements
1015           must be less or equal to IS_OFFSET / 2.
1016 
1017           WARNING: At first, we insert a pointer into the record buffer
1018           as the key for the tree. If we got a new distinct value, which
1019           is really inserted into the tree, instead of being counted
1020           only, we will copy the column value from the record buffer to
1021           'tree_buff' and adjust the key pointer of the tree accordingly.
1022         */
1023 	if (count->tree_buff)
1024 	{
1025 	  global_count=count;
1026 	  if (!(element=tree_insert(&count->int_tree,pos, 0,
1027 				    count->int_tree.custom_arg)) ||
1028 	      (element->count == 1 &&
1029 	       (count->tree_buff + tree_buff_length <
1030                 count->tree_pos + count->field_length)) ||
1031               (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
1032 	      (count->field_length == 1 &&
1033 	       count->int_tree.elements_in_tree > 1))
1034 	  {
1035 	    delete_tree(&count->int_tree, 0);
1036 	    my_free(count->tree_buff);
1037 	    count->tree_buff=0;
1038 	  }
1039 	  else
1040 	  {
1041             /*
1042               If tree_insert() succeeds, it either creates a new element
1043               or increments the counter of an existing element.
1044             */
1045 	    if (element->count == 1)
1046 	    {
1047               /* Copy the new column value into 'tree_buff'. */
1048 	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
1049               /* Adjust the key pointer in the tree. */
1050 	      tree_set_pointer(element,count->tree_pos);
1051               /* Point behind the last column value so far. */
1052 	      count->tree_pos+=count->field_length;
1053 	    }
1054 	  }
1055 	}
1056 
1057 	/* Save character counters and space-counts and zero-field-counts */
1058 	if (count->field_type == FIELD_NORMAL ||
1059 	    count->field_type == FIELD_SKIP_ENDSPACE)
1060 	{
1061           /* Ignore trailing space. */
1062 	  for ( ; end_pos > pos ; end_pos--)
1063 	    if (end_pos[-1] != ' ')
1064 	      break;
1065           /* Empty fields are just counted. Go to the next record. */
1066 	  if (end_pos == pos)
1067 	  {
1068 	    count->empty_fields++;
1069 	    count->max_zero_fill=0;
1070 	    continue;
1071 	  }
1072           /*
1073             Count the total of all trailing spaces and the number of
1074             short trailing spaces. Remember the longest trailing space.
1075           */
1076 	  length= (uint) (next_pos-end_pos);
1077 	  count->tot_end_space+=length;
1078 	  if (length < 8)
1079 	    count->end_space[length]++;
1080 	  if (count->max_end_space < length)
1081 	    count->max_end_space = length;
1082 	}
1083 
1084 	if (count->field_type == FIELD_NORMAL ||
1085 	    count->field_type == FIELD_SKIP_PRESPACE)
1086 	{
1087           /* Ignore leading space. */
1088 	  for (pos=start_pos; pos < end_pos ; pos++)
1089 	    if (pos[0] != ' ')
1090 	      break;
1091           /* Empty fields are just counted. Go to the next record. */
1092 	  if (end_pos == pos)
1093 	  {
1094 	    count->empty_fields++;
1095 	    count->max_zero_fill=0;
1096 	    continue;
1097 	  }
1098           /*
1099             Count the total of all leading spaces and the number of
1100             short leading spaces. Remember the longest leading space.
1101           */
1102 	  length= (uint) (pos-start_pos);
1103 	  count->tot_pre_space+=length;
1104 	  if (length < 8)
1105 	    count->pre_space[length]++;
1106 	  if (count->max_pre_space < length)
1107 	    count->max_pre_space = length;
1108 	}
1109 
1110         /* Calculate pos, end_pos, and max_length for variable length fields. */
1111 	if (count->field_type == FIELD_BLOB)
1112 	{
1113 	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1114 	  ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1115 	  memcpy(&pos,  start_pos+field_length,sizeof(char*));
1116 	  end_pos=pos+blob_length;
1117 	  tot_blob_length+=blob_length;
1118 	  set_if_bigger(count->max_length,blob_length);
1119 	}
1120 	else if (count->field_type == FIELD_VARCHAR)
1121 	{
1122           uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1123 	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1124                    uint2korr(start_pos));
1125 	  pos= start_pos+pack_length;
1126 	  end_pos= pos+length;
1127 	  set_if_bigger(count->max_length,length);
1128 	}
1129 
1130         /* Evaluate 'max_zero_fill' for short fields. */
1131 	if (count->field_length <= 8 &&
1132 	    (count->field_type == FIELD_NORMAL ||
1133 	     count->field_type == FIELD_SKIP_ZERO))
1134 	{
1135 	  uint i;
1136           /* Zero fields are just counted. Go to the next record. */
1137 	  if (!memcmp(start_pos, zero_string, count->field_length))
1138 	  {
1139 	    count->zero_fields++;
1140 	    continue;
1141 	  }
1142           /*
1143             max_zero_fill starts with field_length. It is decreased every
1144             time a shorter "zero trailer" is found. It is set to zero when
1145             an empty field is found (see above). This suggests that the
1146             variable should be called 'min_zero_fill'.
1147           */
1148 	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1149 	       i++) ;
1150 	  if (i < count->max_zero_fill)
1151 	    count->max_zero_fill=i;
1152 	}
1153 
1154         /* Ignore zero fields and check fields. */
1155 	if (count->field_type == FIELD_ZERO ||
1156 	    count->field_type == FIELD_CHECK)
1157 	  continue;
1158 
1159         /*
1160           Count the incidence of every uchar value in the
1161           significant field value.
1162         */
1163 	for ( ; pos < end_pos ; pos++)
1164 	  count->counts[(uchar) *pos]++;
1165 
1166         /* Step to next field. */
1167       }
1168 
1169       if (tot_blob_length > max_blob_length)
1170 	max_blob_length=tot_blob_length;
1171       record_count++;
1172       if (write_loop && record_count % WRITE_COUNT == 0)
1173       {
1174 	printf("%lu\r", (ulong) record_count);
1175         fflush(stdout);
1176       }
1177     }
1178     else if (error != HA_ERR_RECORD_DELETED)
1179     {
1180       fprintf(stderr, "Got error %d while reading rows\n", error);
1181       break;
1182     }
1183 
1184     /* Step to next record. */
1185   }
1186   if (write_loop)
1187   {
1188     printf("            \r");
1189     fflush(stdout);
1190   }
1191 
1192   /*
1193     If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1194     codes.
1195   */
1196   DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1197 
1198   DBUG_PRINT("info", ("Found the following number of incidents "
1199                       "of the uchar codes:"));
1200   if (verbose >= 2)
1201     printf("Found the following number of incidents "
1202                 "of the uchar codes:\n");
1203   for (count= huff_counts ; count < end_count; count++)
1204   {
1205     uint      idx;
1206     my_off_t  total_count;
1207     char      llbuf[32];
1208 
1209     DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1210     if (verbose >= 2)
1211       printf("column: %3u\n", (uint) (count - huff_counts + 1));
1212     if (count->tree_buff)
1213     {
1214       DBUG_PRINT("info", ("number of distinct values: %u",
1215                           (uint) ((count->tree_pos - count->tree_buff) /
1216                                   count->field_length)));
1217       if (verbose >= 2)
1218         printf("number of distinct values: %u\n",
1219                     (uint) ((count->tree_pos - count->tree_buff) /
1220                             count->field_length));
1221     }
1222     total_count= 0;
1223     for (idx= 0; idx < 256; idx++)
1224     {
1225       if (count->counts[idx])
1226       {
1227         total_count+= count->counts[idx];
1228         DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1229                             llstr((longlong) count->counts[idx], llbuf)));
1230         if (verbose >= 2)
1231           printf("counts[0x%02x]: %12s\n", idx,
1232                       llstr((longlong) count->counts[idx], llbuf));
1233       }
1234     }
1235     DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1236                                                     llbuf)));
1237     if ((verbose >= 2) && total_count)
1238     {
1239       printf("total:        %12s\n",
1240                   llstr((longlong) total_count, llbuf));
1241     }
1242   }
1243 
1244   mrg->records=record_count;
1245   mrg->max_blob_length=max_blob_length;
1246   my_safe_afree(record, reclength);
1247   DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1248 }
1249 
compare_huff_elements(void * not_used,uchar * a,uchar * b)1250 static int compare_huff_elements(void *not_used __attribute__((unused)),
1251 				 uchar *a, uchar *b)
1252 {
1253   return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1254     (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1255 }
1256 
1257 	/* Check each tree if we should use pre-space-compress, end-space-
1258 	   compress, empty-field-compress or zero-field-compress */
1259 
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1260 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1261 			 my_off_t records)
1262 {
1263   uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1264   my_off_t old_length,new_length,length;
1265   DBUG_ENTER("check_counts");
1266 
1267   bzero((uchar*) field_count,sizeof(field_count));
1268   space_fields=fill_zero_fields=0;
1269 
1270   for (; trees-- ; huff_counts++)
1271   {
1272     if (huff_counts->field_type == FIELD_BLOB)
1273     {
1274       huff_counts->length_bits=max_bit(huff_counts->max_length);
1275       goto found_pack;
1276     }
1277     else if (huff_counts->field_type == FIELD_VARCHAR)
1278     {
1279       huff_counts->length_bits=max_bit(huff_counts->max_length);
1280       goto found_pack;
1281     }
1282     else if (huff_counts->field_type == FIELD_CHECK)
1283     {
1284       huff_counts->bytes_packed=0;
1285       huff_counts->counts[0]=0;
1286       goto found_pack;
1287     }
1288 
1289     huff_counts->field_type=FIELD_NORMAL;
1290     huff_counts->pack_type=0;
1291 
1292     /* Check for zero-filled records (in this column), or zero records. */
1293     if (huff_counts->zero_fields || ! records)
1294     {
1295       my_off_t old_space_count;
1296       /*
1297         If there are only zero filled records (in this column),
1298         or no records at all, we are done.
1299       */
1300       if (huff_counts->zero_fields == records)
1301       {
1302 	huff_counts->field_type= FIELD_ZERO;
1303 	huff_counts->bytes_packed=0;
1304 	huff_counts->counts[0]=0;
1305 	goto found_pack;
1306       }
1307       /* Remeber the number of significant spaces. */
1308       old_space_count=huff_counts->counts[' '];
1309       /* Add all leading and trailing spaces. */
1310       huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1311                                   huff_counts->tot_pre_space +
1312                                   huff_counts->empty_fields *
1313                                   huff_counts->field_length);
1314       /* Check, what the compressed length of this would be. */
1315       old_length=calc_packed_length(huff_counts,0)+records/8;
1316       /* Get the number of zero bytes. */
1317       length=huff_counts->zero_fields*huff_counts->field_length;
1318       /* Add it to the counts. */
1319       huff_counts->counts[0]+=length;
1320       /* Check, what the compressed length of this would be. */
1321       new_length=calc_packed_length(huff_counts,0);
1322       /* If the compression without the zeroes would be shorter, we are done. */
1323       if (old_length < new_length && huff_counts->field_length > 1)
1324       {
1325 	huff_counts->field_type=FIELD_SKIP_ZERO;
1326 	huff_counts->counts[0]-=length;
1327 	huff_counts->bytes_packed=old_length- records/8;
1328 	goto found_pack;
1329       }
1330       /* Remove the insignificant spaces, but keep the zeroes. */
1331       huff_counts->counts[' ']=old_space_count;
1332     }
1333     /* Check, what the compressed length of this column would be. */
1334     huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1335 
1336     /*
1337       If there are enough empty records (in this column),
1338       treating them specially may pay off.
1339     */
1340     if (huff_counts->empty_fields)
1341     {
1342       if (huff_counts->field_length > 2 &&
1343 	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1344 	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1345 			 huff_counts->max_end_space))) <
1346 	  records * max_bit(huff_counts->field_length))
1347       {
1348 	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1349       }
1350       else
1351       {
1352 	length=huff_counts->empty_fields*huff_counts->field_length;
1353 	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1354 	{
1355 	  huff_counts->tot_end_space+=length;
1356 	  huff_counts->max_end_space=huff_counts->field_length;
1357 	  if (huff_counts->field_length < 8)
1358 	    huff_counts->end_space[huff_counts->field_length]+=
1359 	      huff_counts->empty_fields;
1360 	}
1361 	if (huff_counts->tot_pre_space)
1362 	{
1363 	  huff_counts->tot_pre_space+=length;
1364 	  huff_counts->max_pre_space=huff_counts->field_length;
1365 	  if (huff_counts->field_length < 8)
1366 	    huff_counts->pre_space[huff_counts->field_length]+=
1367 	      huff_counts->empty_fields;
1368 	}
1369       }
1370     }
1371 
1372     /*
1373       If there are enough trailing spaces (in this column),
1374       treating them specially may pay off.
1375     */
1376     if (huff_counts->tot_end_space)
1377     {
1378       huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1379       if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1380 			      huff_counts->end_space,
1381 			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1382 	goto found_pack;
1383       huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1384     }
1385 
1386     /*
1387       If there are enough leading spaces (in this column),
1388       treating them specially may pay off.
1389     */
1390     if (huff_counts->tot_pre_space)
1391     {
1392       if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1393 			      huff_counts->pre_space,
1394 			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1395 	goto found_pack;
1396     }
1397 
1398   found_pack:			/* Found field-packing */
1399 
1400     /* Test if we can use zero-fill */
1401 
1402     if (huff_counts->max_zero_fill &&
1403 	(huff_counts->field_type == FIELD_NORMAL ||
1404 	 huff_counts->field_type == FIELD_SKIP_ZERO))
1405     {
1406       huff_counts->counts[0]-=huff_counts->max_zero_fill*
1407 	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1408 	 records - huff_counts->zero_fields : records);
1409       huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1410       huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1411     }
1412 
1413     /* Test if intervall-field is better */
1414 
1415     if (huff_counts->tree_buff)
1416     {
1417       HUFF_TREE tree;
1418 
1419       DBUG_EXECUTE_IF("forceintervall",
1420                       huff_counts->bytes_packed= ~ (my_off_t) 0;);
1421       tree.element_buffer=0;
1422       if (!make_huff_tree(&tree,huff_counts) &&
1423 	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1424       {
1425 	if (tree.elements == 1)
1426 	  huff_counts->field_type=FIELD_CONSTANT;
1427 	else
1428 	  huff_counts->field_type=FIELD_INTERVALL;
1429 	huff_counts->pack_type=0;
1430       }
1431       else
1432       {
1433 	my_free(huff_counts->tree_buff);
1434 	delete_tree(&huff_counts->int_tree, 0);
1435 	huff_counts->tree_buff=0;
1436       }
1437       if (tree.element_buffer)
1438 	my_free(tree.element_buffer);
1439     }
1440     if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1441       space_fields++;
1442     if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1443       fill_zero_fields++;
1444     field_count[huff_counts->field_type]++;
1445   }
1446   DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1447                       "empty-zero:       %3d  empty-fill: %3d",
1448                       field_count[FIELD_NORMAL],space_fields,
1449                       field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1450   DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1451                       "intervall-fields: %3d  zero:       %3d",
1452                       field_count[FIELD_SKIP_PRESPACE],
1453                       field_count[FIELD_SKIP_ENDSPACE],
1454                       field_count[FIELD_INTERVALL],
1455                       field_count[FIELD_ZERO]));
1456   if (verbose)
1457     printf("\nnormal:    %3d  empty-space:     %3d  "
1458                 "empty-zero:       %3d  empty-fill: %3d\n"
1459                 "pre-space: %3d  end-space:       %3d  "
1460                 "intervall-fields: %3d  zero:       %3d\n",
1461                 field_count[FIELD_NORMAL],space_fields,
1462                 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1463                 field_count[FIELD_SKIP_PRESPACE],
1464                 field_count[FIELD_SKIP_ENDSPACE],
1465                 field_count[FIELD_INTERVALL],
1466                 field_count[FIELD_ZERO]);
1467   DBUG_VOID_RETURN;
1468 }
1469 
1470 
1471 /* Test if we can use space-compression and empty-field-compression */
1472 
1473 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1474 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1475 		    uint max_space_length, my_off_t *space_counts,
1476 		    my_off_t tot_space_count, enum en_fieldtype field_type)
1477 {
1478   int min_pos;
1479   uint length_bits,i;
1480   my_off_t space_count,min_space_count,min_pack,new_length,skip;
1481 
1482   length_bits=max_bit(max_space_length);
1483 
1484 		/* Default no end_space-packing */
1485   space_count=huff_counts->counts[(uint) ' '];
1486   min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1487   min_pack=calc_packed_length(huff_counts,0);
1488   min_pos= -2;
1489   huff_counts->counts[(uint) ' ']=space_count;
1490 
1491 	/* Test with allways space-count */
1492   new_length=huff_counts->bytes_packed+length_bits*records/8;
1493   if (new_length+1 < min_pack)
1494   {
1495     min_pos= -1;
1496     min_pack=new_length;
1497     min_space_count=space_count;
1498   }
1499 	/* Test with length-flag */
1500   for (skip=0L, i=0 ; i < 8 ; i++)
1501   {
1502     if (space_counts[i])
1503     {
1504       if (i)
1505 	huff_counts->counts[(uint) ' ']+=space_counts[i];
1506       skip+=huff_counts->pre_space[i];
1507       new_length=calc_packed_length(huff_counts,0)+
1508 	(records+(records-skip)*(1+length_bits))/8;
1509       if (new_length < min_pack)
1510       {
1511 	min_pos=(int) i;
1512 	min_pack=new_length;
1513 	min_space_count=huff_counts->counts[(uint) ' '];
1514       }
1515     }
1516   }
1517 
1518   huff_counts->counts[(uint) ' ']=min_space_count;
1519   huff_counts->bytes_packed=min_pack;
1520   switch (min_pos) {
1521   case -2:
1522     return(0);				/* No space-compress */
1523   case -1:				/* Always space-count */
1524     huff_counts->field_type=field_type;
1525     huff_counts->min_space=0;
1526     huff_counts->length_bits=max_bit(max_space_length);
1527     break;
1528   default:
1529     huff_counts->field_type=field_type;
1530     huff_counts->min_space=(uint) min_pos;
1531     huff_counts->pack_type|=PACK_TYPE_SELECTED;
1532     huff_counts->length_bits=max_bit(max_space_length);
1533     break;
1534   }
1535   return(1);				/* Using space-compress */
1536 }
1537 
1538 
1539 	/* Make a huff_tree of each huff_count */
1540 
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1541 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1542 {
1543   uint tree;
1544   HUFF_TREE *huff_tree;
1545   DBUG_ENTER("make_huff_trees");
1546 
1547   if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1548 					 MYF(MY_WME | MY_ZEROFILL))))
1549     DBUG_RETURN(0);
1550 
1551   for (tree=0 ; tree < trees ; tree++)
1552   {
1553     if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1554     {
1555       while (tree--)
1556 	my_free(huff_tree[tree].element_buffer);
1557       my_free(huff_tree);
1558       DBUG_RETURN(0);
1559     }
1560   }
1561   DBUG_RETURN(huff_tree);
1562 }
1563 
1564 /*
1565   Build a Huffman tree.
1566 
1567   SYNOPSIS
1568     make_huff_tree()
1569     huff_tree                   The Huffman tree.
1570     huff_counts                 The counts.
1571 
1572   DESCRIPTION
1573     Build a Huffman tree according to huff_counts->counts or
1574     huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1575     tree_buff_length of distinct column values. In that case, whole
1576     values can be Huffman encoded instead of single bytes.
1577 
1578   RETURN
1579     0           OK
1580     != 0        Error
1581 */
1582 
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1583 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1584 {
1585   uint i,found,bits_packed,first,last;
1586   my_off_t bytes_packed;
1587   HUFF_ELEMENT *a,*b,*new_huff_el;
1588 
1589   first=last=0;
1590   if (huff_counts->tree_buff)
1591   {
1592     /* Calculate the number of distinct values in tree_buff. */
1593     found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1594       huff_counts->field_length;
1595     first=0; last=found-1;
1596   }
1597   else
1598   {
1599     /* Count the number of uchar codes found in the column. */
1600     for (i=found=0 ; i < 256 ; i++)
1601     {
1602       if (huff_counts->counts[i])
1603       {
1604 	if (! found++)
1605 	  first=i;
1606 	last=i;
1607       }
1608     }
1609     if (found < 2)
1610       found=2;
1611   }
1612 
1613   /* When using 'tree_buff' we can have more that 256 values. */
1614   if (queue.max_elements < found)
1615   {
1616     delete_queue(&queue);
1617     if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
1618       return -1;
1619   }
1620 
1621   /* Allocate or reallocate an element buffer for the Huffman tree. */
1622   if (!huff_tree->element_buffer)
1623   {
1624     if (!(huff_tree->element_buffer=
1625 	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1626       return 1;
1627   }
1628   else
1629   {
1630     HUFF_ELEMENT *temp;
1631     if (!(temp=
1632 	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1633 				     found*2*sizeof(HUFF_ELEMENT),
1634 				     MYF(MY_WME))))
1635       return 1;
1636     huff_tree->element_buffer=temp;
1637   }
1638 
1639   huff_counts->tree=huff_tree;
1640   huff_tree->counts=huff_counts;
1641   huff_tree->min_chr=first;
1642   huff_tree->max_chr=last;
1643   huff_tree->char_bits=max_bit(last-first);
1644   huff_tree->offset_bits=max_bit(found-1)+1;
1645 
1646   if (huff_counts->tree_buff)
1647   {
1648     huff_tree->elements=0;
1649     huff_tree->tree_pack_length=(1+15+16+5+5+
1650 				 (huff_tree->char_bits+1)*found+
1651 				 (huff_tree->offset_bits+1)*
1652 				 (found-2)+7)/8 +
1653 				   (uint) (huff_tree->counts->tree_pos-
1654 					   huff_tree->counts->tree_buff);
1655     /*
1656       Put a HUFF_ELEMENT into the queue for every distinct column value.
1657 
1658       tree_walk() calls save_counts_in_queue() for every element in
1659       'int_tree'. This takes elements from the target trees element
1660       buffer and places references to them into the buffer of the
1661       priority queue. We insert in column value order, but the order is
1662       in fact irrelevant here. We will establish the correct order
1663       later.
1664     */
1665     tree_walk(&huff_counts->int_tree,
1666 	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1667 	      (uchar*) huff_tree, left_root_right);
1668   }
1669   else
1670   {
1671     huff_tree->elements=found;
1672     huff_tree->tree_pack_length=(9+9+5+5+
1673 				 (huff_tree->char_bits+1)*found+
1674 				 (huff_tree->offset_bits+1)*
1675 				 (found-2)+7)/8;
1676     /*
1677       Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
1678 
1679       The elements are taken from the target trees element buffer.
1680       Instead of using queue_insert(), we just place references to the
1681       elements into the buffer of the priority queue. We insert in byte
1682       value order, but the order is in fact irrelevant here. We will
1683       establish the correct order later.
1684     */
1685     for (i=first, found=0 ; i <= last ; i++)
1686     {
1687       if (huff_counts->counts[i])
1688       {
1689 	new_huff_el=huff_tree->element_buffer+(found++);
1690 	new_huff_el->count=huff_counts->counts[i];
1691 	new_huff_el->a.leaf.null=0;
1692 	new_huff_el->a.leaf.element_nr=i;
1693 	queue.root[found]=(uchar*) new_huff_el;
1694       }
1695     }
1696     /*
1697       If there is only a single uchar value in this field in all records,
1698       add a second element with zero incidence. This is required to enter
1699       the loop, which builds the Huffman tree.
1700     */
1701     while (found < 2)
1702     {
1703       new_huff_el=huff_tree->element_buffer+(found++);
1704       new_huff_el->count=0;
1705       new_huff_el->a.leaf.null=0;
1706       if (last)
1707 	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1708       else
1709 	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1710       queue.root[found]=(uchar*) new_huff_el;
1711     }
1712   }
1713 
1714   /* Make a queue from the queue buffer. */
1715   queue.elements=found;
1716 
1717   /*
1718     Make a priority queue from the queue. Construct its index so that we
1719     have a partially ordered tree.
1720   */
1721   queue_fix(&queue);
1722 
1723   /* The Huffman algorithm. */
1724   bytes_packed=0; bits_packed=0;
1725   for (i=1 ; i < found ; i++)
1726   {
1727     /*
1728       Pop the top element from the queue (the one with the least incidence).
1729       Popping from a priority queue includes a re-ordering of the queue,
1730       to get the next least incidence element to the top.
1731     */
1732     a=(HUFF_ELEMENT*) queue_remove_top(&queue);
1733     /* Copy the next least incidence element */
1734     b=(HUFF_ELEMENT*) queue_top(&queue);
1735     /* Get a new element from the element buffer. */
1736     new_huff_el=huff_tree->element_buffer+found+i;
1737     /* The new element gets the sum of the two least incidence elements. */
1738     new_huff_el->count=a->count+b->count;
1739     /*
1740       The Huffman algorithm assigns another bit to the code for a byte
1741       every time that bytes incidence is combined (directly or indirectly)
1742       to a new element as one of the two least incidence elements.
1743       This means that one more bit per incidence of that uchar is required
1744       in the resulting file. So we add the new combined incidence as the
1745       number of bits by which the result grows.
1746     */
1747     bits_packed+=(uint) (new_huff_el->count & 7);
1748     bytes_packed+=new_huff_el->count/8;
1749     /* The new element points to its children, lesser in left.  */
1750     new_huff_el->a.nod.left=a;
1751     new_huff_el->a.nod.right=b;
1752     /*
1753       Replace the copied top element by the new element and re-order the
1754       queue.
1755     */
1756     queue_top(&queue)= (uchar*) new_huff_el;
1757     queue_replace_top(&queue);
1758   }
1759   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1760   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1761   return 0;
1762 }
1763 
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1764 static int compare_tree(void* cmp_arg __attribute__((unused)),
1765 			register const uchar *s, register const uchar *t)
1766 {
1767   uint length;
1768   for (length=global_count->field_length; length-- ;)
1769     if (*s++ != *t++)
1770       return (int) s[-1] - (int) t[-1];
1771   return 0;
1772 }
1773 
1774 /*
1775   Organize distinct column values and their incidences into a priority queue.
1776 
1777   SYNOPSIS
1778     save_counts_in_queue()
1779     key                         The column value.
1780     count                       The incidence of this value.
1781     tree                        The Huffman tree to be built later.
1782 
1783   DESCRIPTION
1784     We use the element buffer of the targeted tree. The distinct column
1785     values are organized in a priority queue first. The Huffman
1786     algorithm will later organize the elements into a Huffman tree. For
1787     the time being, we just place references to the elements into the
1788     queue buffer. The buffer will later be organized into a priority
1789     queue.
1790 
1791   RETURN
1792     0
1793  */
1794 
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1795 static int save_counts_in_queue(uchar *key, element_count count,
1796 				HUFF_TREE *tree)
1797 {
1798   HUFF_ELEMENT *new_huff_el;
1799 
1800   new_huff_el=tree->element_buffer+(tree->elements++);
1801   new_huff_el->count=count;
1802   new_huff_el->a.leaf.null=0;
1803   new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1804     tree->counts->field_length;
1805   queue.root[tree->elements]=(uchar*) new_huff_el;
1806   return 0;
1807 }
1808 
1809 
1810 /*
1811   Calculate length of file if given counts should be used.
1812 
1813   SYNOPSIS
1814     calc_packed_length()
1815     huff_counts                 The counts for a column of the table(s).
1816     add_tree_lenght             If the decode tree length should be added.
1817 
1818   DESCRIPTION
1819     We need to follow the Huffman algorithm until we know, how many bits
1820     are required for each uchar code. But we do not need the resulting
1821     Huffman tree. Hence, we can leave out some steps which are essential
1822     in make_huff_tree().
1823 
1824   RETURN
1825     Number of bytes required to compress this table column.
1826 */
1827 
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1828 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1829 				   uint add_tree_lenght)
1830 {
1831   uint i,found,bits_packed,first,last;
1832   my_off_t bytes_packed;
1833   HUFF_ELEMENT element_buffer[256];
1834   DBUG_ENTER("calc_packed_length");
1835 
1836   /*
1837     WARNING: We use a small hack for efficiency: Instead of placing
1838     references to HUFF_ELEMENTs into the queue, we just insert
1839     references to the counts of the uchar codes which appeared in this
1840     table column. During the Huffman algorithm they are successively
1841     replaced by references to HUFF_ELEMENTs. This works, because
1842     HUFF_ELEMENTs have the incidence count at their beginning.
1843     Regardless, wether the queue array contains references to counts of
1844     type my_off_t or references to HUFF_ELEMENTs which have the count of
1845     type my_off_t at their beginning, it always points to a count of the
1846     same type.
1847 
1848     Instead of using queue_insert(), we just copy the references into
1849     the buffer of the priority queue. We insert in uchar value order, but
1850     the order is in fact irrelevant here. We will establish the correct
1851     order later.
1852   */
1853   first=last=0;
1854   for (i=found=0 ; i < 256 ; i++)
1855   {
1856     if (huff_counts->counts[i])
1857     {
1858       if (! found++)
1859 	first=i;
1860       last=i;
1861       /* We start with root[1], which is the queues top element. */
1862       queue.root[found]=(uchar*) &huff_counts->counts[i];
1863     }
1864   }
1865   if (!found)
1866     DBUG_RETURN(0);			/* Empty tree */
1867   /*
1868     If there is only a single uchar value in this field in all records,
1869     add a second element with zero incidence. This is required to enter
1870     the loop, which follows the Huffman algorithm.
1871   */
1872   if (found < 2)
1873     queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1874 
1875   /* Make a queue from the queue buffer. */
1876   queue.elements=found;
1877 
1878   bytes_packed=0; bits_packed=0;
1879   /* Add the length of the coding table, which would become part of the file. */
1880   if (add_tree_lenght)
1881     bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1882 		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1883 
1884   /*
1885     Make a priority queue from the queue. Construct its index so that we
1886     have a partially ordered tree.
1887   */
1888   queue_fix(&queue);
1889 
1890   /* The Huffman algorithm. */
1891   for (i=0 ; i < found-1 ; i++)
1892   {
1893     my_off_t        *a;
1894     my_off_t        *b;
1895     HUFF_ELEMENT    *new_huff_el;
1896 
1897     /*
1898       Pop the top element from the queue (the one with the least
1899       incidence). Popping from a priority queue includes a re-ordering
1900       of the queue, to get the next least incidence element to the top.
1901     */
1902     a= (my_off_t*) queue_remove_top(&queue);
1903     /* Copy the next least incidence element. */
1904     b= (my_off_t*) queue_top(&queue);
1905     /* Create a new element in a local (automatic) buffer. */
1906     new_huff_el= element_buffer + i;
1907     /* The new element gets the sum of the two least incidence elements. */
1908     new_huff_el->count= *a + *b;
1909     /*
1910       The Huffman algorithm assigns another bit to the code for a byte
1911       every time that bytes incidence is combined (directly or indirectly)
1912       to a new element as one of the two least incidence elements.
1913       This means that one more bit per incidence of that uchar is required
1914       in the resulting file. So we add the new combined incidence as the
1915       number of bits by which the result grows.
1916     */
1917     bits_packed+=(uint) (new_huff_el->count & 7);
1918     bytes_packed+=new_huff_el->count/8;
1919     /*
1920       Replace the copied top element by the new element and re-order the
1921       queue. This successively replaces the references to counts by
1922       references to HUFF_ELEMENTs.
1923     */
1924     queue_top(&queue)= (uchar*) new_huff_el;
1925     queue_replace_top(&queue);
1926   }
1927   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1928 }
1929 
1930 
1931 	/* Remove trees that don't give any compression */
1932 
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1933 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1934 {
1935   uint k,tree_number;
1936   HUFF_COUNTS count,*i,*j,*last_count;
1937 
1938   last_count=huff_counts+trees;
1939   for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1940   {
1941     if (!i->tree->tree_number)
1942     {
1943       i->tree->tree_number= ++tree_number;
1944       if (i->tree_buff)
1945 	continue;			/* Don't join intervall */
1946       for (j=i+1 ; j < last_count ; j++)
1947       {
1948 	if (! j->tree->tree_number && ! j->tree_buff)
1949 	{
1950 	  for (k=0 ; k < 256 ; k++)
1951 	    count.counts[k]=i->counts[k]+j->counts[k];
1952 	  if (calc_packed_length(&count,1) <=
1953 	      i->tree->bytes_packed + j->tree->bytes_packed+
1954 	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1955 	      ALLOWED_JOIN_DIFF)
1956 	  {
1957 	    memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1958 	    my_free(j->tree->element_buffer);
1959 	    j->tree->element_buffer=0;
1960 	    j->tree=i->tree;
1961 	    bmove((uchar*) i->counts,(uchar*) count.counts,
1962 		  sizeof(count.counts[0])*256);
1963 	    if (make_huff_tree(i->tree,i))
1964 	      return (uint) -1;
1965 	  }
1966 	}
1967       }
1968     }
1969   }
1970   DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1971                       trees, tree_number));
1972   if (verbose)
1973     printf("Original trees:  %d  After join: %d\n", trees, tree_number);
1974   return tree_number;			/* Return trees left */
1975 }
1976 
1977 
1978 /*
1979   Fill in huff_tree encode tables.
1980 
1981   SYNOPSIS
1982     make_huff_decode_table()
1983     huff_tree               An array of HUFF_TREE which are to be encoded.
1984     trees                   The number of HUFF_TREE in the array.
1985 
1986   RETURN
1987     0           success
1988     != 0        error
1989 */
1990 
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1991 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1992 {
1993   uint elements;
1994   for ( ; trees-- ; huff_tree++)
1995   {
1996     if (huff_tree->tree_number > 0)
1997     {
1998       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1999       if (!(huff_tree->code =
2000             (ulonglong*) my_malloc(elements*
2001                                    (sizeof(ulonglong) + sizeof(uchar)),
2002                                    MYF(MY_WME | MY_ZEROFILL))))
2003 	return 1;
2004       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
2005       make_traverse_code_tree(huff_tree, huff_tree->root,
2006                               8 * sizeof(ulonglong), 0);
2007     }
2008   }
2009   return 0;
2010 }
2011 
2012 
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)2013 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
2014 				    HUFF_ELEMENT *element,
2015 				    uint size, ulonglong code)
2016 {
2017   uint chr;
2018   if (!element->a.leaf.null)
2019   {
2020     chr=element->a.leaf.element_nr;
2021     huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
2022     huff_tree->code[chr]= (code >> size);
2023     if (huff_tree->height < 8 * sizeof(ulonglong) - size)
2024         huff_tree->height= 8 * sizeof(ulonglong) - size;
2025   }
2026   else
2027   {
2028     size--;
2029     make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
2030     make_traverse_code_tree(huff_tree, element->a.nod.right, size,
2031 			    code + (((ulonglong) 1) << size));
2032   }
2033   return;
2034 }
2035 
2036 
2037 /*
2038   Convert a value into binary digits.
2039 
2040   SYNOPSIS
2041     bindigits()
2042     value                       The value.
2043     length                      The number of low order bits to convert.
2044 
2045   NOTE
2046     The result string is in static storage. It is reused on every call.
2047     So you cannot use it twice in one expression.
2048 
2049   RETURN
2050     A pointer to a static NUL-terminated string.
2051  */
2052 
bindigits(ulonglong value,uint bits)2053 static char *bindigits(ulonglong value, uint bits)
2054 {
2055   static char digits[72];
2056   char *ptr= digits;
2057   uint idx= bits;
2058 
2059   DBUG_ASSERT(idx < sizeof(digits));
2060   while (idx)
2061     *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
2062   *ptr= '\0';
2063   return digits;
2064 }
2065 
2066 
2067 /*
2068   Convert a value into hexadecimal digits.
2069 
2070   SYNOPSIS
2071     hexdigits()
2072     value                       The value.
2073 
2074   NOTE
2075     The result string is in static storage. It is reused on every call.
2076     So you cannot use it twice in one expression.
2077 
2078   RETURN
2079     A pointer to a static NUL-terminated string.
2080  */
2081 
hexdigits(ulonglong value)2082 static char *hexdigits(ulonglong value)
2083 {
2084   static char digits[20];
2085   char *ptr= digits;
2086   uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2087 
2088   DBUG_ASSERT(idx < sizeof(digits));
2089   while (idx)
2090   {
2091     if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2092       *(ptr - 1)+= 'a' - '9' - 1;
2093   }
2094   *ptr= '\0';
2095   return digits;
2096 }
2097 
2098 
2099 	/* Write header to new packed data file */
2100 
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2101 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2102 			my_off_t tot_elements,my_off_t filelength)
2103 {
2104   uchar *buff= (uchar*) file_buffer.pos;
2105 
2106   bzero(buff,HEAD_LENGTH);
2107   memcpy(buff,maria_pack_file_magic,4);
2108   int4store(buff+4,head_length);
2109   int4store(buff+8, mrg->min_pack_length);
2110   int4store(buff+12,mrg->max_pack_length);
2111   int4store(buff+16,tot_elements);
2112   int4store(buff+20,intervall_length);
2113   int2store(buff+24,trees);
2114   buff[26]=(char) mrg->ref_length;
2115 	/* Save record pointer length */
2116   buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
2117   if (test_only)
2118     return 0;
2119   my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2120   return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2121 		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2122 }
2123 
2124 	/* Write fieldinfo to new packed file */
2125 
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2126 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2127 {
2128   reg1 uint i;
2129   uint huff_tree_bits;
2130   huff_tree_bits=max_bit(trees ? trees-1 : 0);
2131 
2132   DBUG_PRINT("info", (" "));
2133   DBUG_PRINT("info", ("column types:"));
2134   DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2135   DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2136   DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2137   DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2138   DBUG_PRINT("info", ("FIELD_BLOB            4"));
2139   DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2140   DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2141   DBUG_PRINT("info", ("FIELD_ZERO            7"));
2142   DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2143   DBUG_PRINT("info", ("FIELD_CHECK           9"));
2144   DBUG_PRINT("info", (" "));
2145   DBUG_PRINT("info", ("pack type as a set of flags:"));
2146   DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2147   DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2148   DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2149   DBUG_PRINT("info", (" "));
2150   if (verbose >= 2)
2151   {
2152     printf("\n");
2153     printf("column types:\n");
2154     printf("FIELD_NORMAL          0\n");
2155     printf("FIELD_SKIP_ENDSPACE   1\n");
2156     printf("FIELD_SKIP_PRESPACE   2\n");
2157     printf("FIELD_SKIP_ZERO       3\n");
2158     printf("FIELD_BLOB            4\n");
2159     printf("FIELD_CONSTANT        5\n");
2160     printf("FIELD_INTERVALL       6\n");
2161     printf("FIELD_ZERO            7\n");
2162     printf("FIELD_VARCHAR         8\n");
2163     printf("FIELD_CHECK           9\n");
2164     printf("\n");
2165     printf("pack type as a set of flags:\n");
2166     printf("PACK_TYPE_SELECTED      1\n");
2167     printf("PACK_TYPE_SPACE_FIELDS  2\n");
2168     printf("PACK_TYPE_ZERO_FILL     4\n");
2169     printf("\n");
2170   }
2171   for (i=0 ; i++ < fields ; counts++)
2172   {
2173     write_bits((ulonglong) (int) counts->field_type, 5);
2174     write_bits(counts->pack_type,6);
2175     if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2176       write_bits(counts->max_zero_fill,5);
2177     else
2178       write_bits(counts->length_bits,5);
2179     write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2180     DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2181                         "lbits: %2u  tree: %2u  length: %4u",
2182                         i , counts->field_type, counts->pack_type,
2183                         counts->max_zero_fill, counts->length_bits,
2184                         counts->tree->tree_number, counts->field_length));
2185     if (verbose >= 2)
2186       printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2187                   "tree: %2u  length: %4u\n", i , counts->field_type,
2188                   counts->pack_type, counts->max_zero_fill, counts->length_bits,
2189                   counts->tree->tree_number, counts->field_length);
2190   }
2191   flush_bits();
2192   return;
2193 }
2194 
2195 	/* Write all huff_trees to new datafile. Return tot count of
2196 	   elements in all trees
2197 	   Returns 0 on error */
2198 
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2199 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2200 {
2201   uint i,int_length;
2202   uint tree_no;
2203   uint codes;
2204   uint errors= 0;
2205   uint *packed_tree,*offset,length;
2206   my_off_t elements;
2207 
2208   /* Find the highest number of elements in the trees. */
2209   for (i=length=0 ; i < trees ; i++)
2210     if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2211       length=huff_tree[i].elements;
2212   /*
2213     Allocate a buffer for packing a decode tree. Two numbers per element
2214     (left child and right child).
2215   */
2216   if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2217   {
2218     my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2219     return 0;
2220   }
2221 
2222   DBUG_PRINT("info", (" "));
2223   if (verbose >= 2)
2224     printf("\n");
2225   tree_no= 0;
2226   intervall_length=0;
2227   for (elements=0; trees-- ; huff_tree++)
2228   {
2229     /* Skip columns that have been joined with other columns. */
2230     if (huff_tree->tree_number == 0)
2231       continue;				/* Deleted tree */
2232     tree_no++;
2233     DBUG_PRINT("info", (" "));
2234     if (verbose >= 3)
2235       printf("\n");
2236     /* Count the total number of elements (byte codes or column values). */
2237     elements+=huff_tree->elements;
2238     huff_tree->max_offset=2;
2239     /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2240     if (huff_tree->elements <= 1)
2241       offset=packed_tree;
2242     else
2243       offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2244 
2245     /* This should be the same as 'length' above. */
2246     huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2247 
2248     /*
2249       Since we check this during collecting the distinct column values,
2250       this should never happen.
2251     */
2252     if (huff_tree->max_offset >= IS_OFFSET)
2253     {				/* This should be impossible */
2254       fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2255                    huff_tree->max_offset);
2256       my_afree(packed_tree);
2257       return 0;
2258     }
2259 
2260     DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2261                         "char_bits: %u\n",
2262                         (ulong) (file_buffer.pos - file_buffer.buffer),
2263                         huff_tree->elements, (ulong) (offset - packed_tree),
2264                         huff_tree->char_bits));
2265     if (!huff_tree->counts->tree_buff)
2266     {
2267       /* We do a uchar compression on this column. Mark with bit 0. */
2268       write_bits(0,1);
2269       write_bits(huff_tree->min_chr,8);
2270       write_bits(huff_tree->elements,9);
2271       write_bits(huff_tree->char_bits,5);
2272       write_bits(huff_tree->offset_bits,5);
2273       int_length=0;
2274     }
2275     else
2276     {
2277       int_length=(uint) (huff_tree->counts->tree_pos -
2278 			 huff_tree->counts->tree_buff);
2279       /* We have distinct column values for this column. Mark with bit 1. */
2280       write_bits(1,1);
2281       write_bits(huff_tree->elements,15);
2282       write_bits(int_length,16);
2283       write_bits(huff_tree->char_bits,5);
2284       write_bits(huff_tree->offset_bits,5);
2285       intervall_length+=int_length;
2286     }
2287     DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2288                         "offset_bits: %2u  %s: %5u  codelen: %2u",
2289                         tree_no, huff_tree->elements, huff_tree->char_bits,
2290                         huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2291                         "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2292                         int_length : huff_tree->min_chr, huff_tree->height));
2293     if (verbose >= 2)
2294       printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2295                   "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2296                   huff_tree->char_bits, huff_tree->offset_bits,
2297                   huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2298                   huff_tree->counts->tree_buff ? int_length :
2299                   huff_tree->min_chr, huff_tree->height);
2300 
2301     /* Check that the code tree length matches the element count. */
2302     length=(uint) (offset-packed_tree);
2303     if (length != huff_tree->elements*2-2)
2304     {
2305       fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2306                    length, huff_tree->elements * 2 - 2);
2307       errors++;
2308       break;
2309     }
2310 
2311     for (i=0 ; i < length ; i++)
2312     {
2313       if (packed_tree[i] & IS_OFFSET)
2314 	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2315 		   huff_tree->offset_bits+1);
2316       else
2317 	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2318       DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2319                           i, (packed_tree[i] & IS_OFFSET) ?
2320                           " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2321                           packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2322       if (verbose >= 3)
2323         printf("tree[0x%04x]: %s0x%04x\n",
2324                     i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2325                     (packed_tree[i] & IS_OFFSET) ?
2326                     packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2327     }
2328     flush_bits();
2329 
2330     /*
2331       Display coding tables and check their correctness.
2332     */
2333     codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2334     for (i= 0; i < codes; i++)
2335     {
2336       ulonglong code;
2337       uint bits;
2338       uint len;
2339       uint idx;
2340 
2341       if (! (len= huff_tree->code_len[i]))
2342         continue;
2343       DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2344                           hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2345                           bindigits(huff_tree->code[i],
2346                                     huff_tree->code_len[i])));
2347       if (verbose >= 3)
2348         printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2349                     hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2350                     bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2351 
2352       /* Check that the encode table decodes correctly. */
2353       code= 0;
2354       bits= 0;
2355       idx= 0;
2356       DBUG_EXECUTE_IF("forcechkerr1", len--;);
2357       DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2358       DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2359       for (;;)
2360       {
2361         if (! len)
2362         {
2363           fflush(stdout);
2364           fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2365                        hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2366           errors++;
2367           break;
2368         }
2369         code<<= 1;
2370         code|= (huff_tree->code[i] >> (--len)) & 1;
2371         bits++;
2372         if (bits > 8 * sizeof(code))
2373         {
2374           fflush(stdout);
2375           fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2376                        bits, (uint) (8 * sizeof(code)));
2377           errors++;
2378           break;
2379         }
2380         idx+= (uint) code & 1;
2381         if (idx >= length)
2382         {
2383           fflush(stdout);
2384           fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
2385           errors++;
2386           break;
2387         }
2388         if (packed_tree[idx] & IS_OFFSET)
2389           idx+= packed_tree[idx] & ~IS_OFFSET;
2390         else
2391           break; /* Hit a leaf. This contains the result value. */
2392       }
2393       if (errors)
2394         break;
2395 
2396       DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2397       if (packed_tree[idx] != i)
2398       {
2399         fflush(stdout);
2400         fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2401                      packed_tree[idx], i);
2402         errors++;
2403         break;
2404       }
2405     } /*end for (codes)*/
2406     if (errors)
2407       break;
2408 
2409     /* Write column values in case of distinct column value compression. */
2410     if (huff_tree->counts->tree_buff)
2411     {
2412       for (i=0 ; i < int_length ; i++)
2413       {
2414  	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2415         DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2416                             i, (uchar) huff_tree->counts->tree_buff[i]));
2417         if (verbose >= 3)
2418           printf("column_values[0x%04x]: 0x%02x\n",
2419                       i, (uchar) huff_tree->counts->tree_buff[i]);
2420       }
2421     }
2422     flush_bits();
2423   }
2424   DBUG_PRINT("info", (" "));
2425   if (verbose >= 2)
2426     printf("\n");
2427   my_afree(packed_tree);
2428   if (errors)
2429   {
2430     fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2431     return 0;
2432   }
2433   return elements;
2434 }
2435 
2436 
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2437 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2438 				   uint *offset)
2439 {
2440   uint *prev_offset;
2441 
2442   prev_offset= offset;
2443   /*
2444     'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2445     then there is no left child and, hence no right child either. This
2446     is a property of a binary tree. An element is either a node with two
2447     childs, or a leaf without childs.
2448 
2449     The current element is always a node with two childs. Go left first.
2450   */
2451   if (!element->a.nod.left->a.leaf.null)
2452   {
2453     /* Store the uchar code or the index of the column value. */
2454     prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2455     offset+=2;
2456   }
2457   else
2458   {
2459     /*
2460       Recursively traverse the tree to the left. Mark it as an offset to
2461       another tree node (in contrast to a uchar code or column value index).
2462     */
2463     prev_offset[0]= IS_OFFSET+2;
2464     offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2465   }
2466 
2467   /* Now, check the right child. */
2468   if (!element->a.nod.right->a.leaf.null)
2469   {
2470     /* Store the uchar code or the index of the column value. */
2471     prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2472     return offset;
2473   }
2474   else
2475   {
2476     /*
2477       Recursively traverse the tree to the right. Mark it as an offset to
2478       another tree node (in contrast to a uchar code or column value index).
2479     */
2480     uint temp=(uint) (offset-prev_offset-1);
2481     prev_offset[1]= IS_OFFSET+ temp;
2482     if (huff_tree->max_offset < temp)
2483       huff_tree->max_offset = temp;
2484     return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2485   }
2486 }
2487 
2488 	/* Get number of bits neaded to represent value */
2489 
max_bit(register uint value)2490 static uint max_bit(register uint value)
2491 {
2492   reg2 uint power=1;
2493 
2494   while ((value>>=1))
2495     power++;
2496   return (power);
2497 }
2498 
2499 
compress_maria_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2500 static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2501 {
2502   int error;
2503   uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
2504   uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
2505   my_off_t record_count;
2506   char llbuf[32];
2507   ulong length,pack_length;
2508   uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2509   HUFF_COUNTS *count,*end_count;
2510   HUFF_TREE *tree;
2511   MARIA_HA *isam_file=mrg->file[0];
2512   uint pack_version= (uint) isam_file->s->pack.version;
2513   DBUG_ENTER("compress_maria_file");
2514 
2515   /* Allocate a buffer for the records (excluding blobs). */
2516   if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
2517     return -1;
2518 
2519   end_count=huff_counts+isam_file->s->base.fields;
2520   min_record_length= (uint) ~0;
2521   max_record_length=0;
2522   null_bytes= isam_file->s->base.null_bytes;
2523 
2524   /*
2525     Calculate the maximum number of bits required to pack the records.
2526     Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2527     The tree height determines the maximum number of bits per value.
2528     Some fields skip leading or trailing spaces or zeroes. The skipped
2529     number of bytes is encoded by 'length_bits' bits.
2530     Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2531     and varchar get a leading 0 bit.
2532   */
2533   max_calc_length= null_bytes;
2534   for (i= 0 ; i < isam_file->s->base.fields ; i++)
2535   {
2536     if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2537       huff_counts[i].max_zero_fill=0;
2538     if (huff_counts[i].field_type == FIELD_CONSTANT ||
2539 	huff_counts[i].field_type == FIELD_ZERO ||
2540 	huff_counts[i].field_type == FIELD_CHECK)
2541       continue;
2542     if (huff_counts[i].field_type == FIELD_INTERVALL)
2543       max_calc_length+=huff_counts[i].tree->height;
2544     else if (huff_counts[i].field_type == FIELD_BLOB ||
2545 	     huff_counts[i].field_type == FIELD_VARCHAR)
2546       max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2547     else
2548       max_calc_length+=
2549 	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2550 	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2551   }
2552   max_calc_length= (max_calc_length + 7) / 8;
2553   pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
2554   record_count=0;
2555   /* 'max_blob_length' is the max length of all blobs of a record. */
2556   pack_blob_length= isam_file->s->base.blobs ?
2557                     _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2558   max_pack_length=pack_ref_length+pack_blob_length;
2559 
2560   DBUG_PRINT("fields", ("==="));
2561   mrg_reset(mrg);
2562   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2563   {
2564     ulong tot_blob_length=0;
2565     if (! error)
2566     {
2567       if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
2568                        null_bytes))
2569 	break;
2570       record_pos= file_buffer.pos;
2571       file_buffer.pos+= max_pack_length;
2572       if (null_bytes)
2573       {
2574         /* Copy null bits 'as is' */
2575         memcpy(file_buffer.pos, record, null_bytes);
2576         file_buffer.pos+= null_bytes;
2577       }
2578       for (start_pos=record+null_bytes, count= huff_counts;
2579            count < end_count ;
2580            count++)
2581       {
2582 	end_pos=start_pos+(field_length=count->field_length);
2583 	tree=count->tree;
2584 
2585         DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2586                               "lbits: %2u  tree: %2u  length: %4u",
2587                               (ulong) (count - huff_counts + 1),
2588                               count->field_type,
2589                               count->pack_type, count->max_zero_fill,
2590                               count->length_bits, count->tree->tree_number,
2591                               count->field_length));
2592 
2593         /* Check if the column contains spaces only. */
2594 	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2595 	{
2596 	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2597 	  if (pos == end_pos)
2598 	  {
2599             DBUG_PRINT("fields",
2600                        ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2601             DBUG_PRINT("fields", ("---"));
2602 	    write_bits(1,1);
2603 	    start_pos=end_pos;
2604 	    continue;
2605 	  }
2606           DBUG_PRINT("fields",
2607                      ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2608 	  write_bits(0,1);
2609 	}
2610 	end_pos-=count->max_zero_fill;
2611 	field_length-=count->max_zero_fill;
2612 
2613 	switch (count->field_type) {
2614 	case FIELD_SKIP_ZERO:
2615 	  if (!memcmp(start_pos, zero_string, field_length))
2616 	  {
2617             DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2618 	    write_bits(1,1);
2619 	    start_pos=end_pos;
2620 	    break;
2621 	  }
2622           DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2623 	  write_bits(0,1);
2624 	  /* Fall through */
2625 	case FIELD_NORMAL:
2626           DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2627                                 (ulong) (end_pos - start_pos)));
2628 	  for ( ; start_pos < end_pos ; start_pos++)
2629           {
2630             DBUG_PRINT("fields",
2631                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2632                         (uchar) *start_pos,
2633                         hexdigits(tree->code[(uchar) *start_pos]),
2634                         (uint) tree->code_len[(uchar) *start_pos],
2635                         bindigits(tree->code[(uchar) *start_pos],
2636                                   (uint) tree->code_len[(uchar) *start_pos])));
2637 	    write_bits(tree->code[(uchar) *start_pos],
2638 		       (uint) tree->code_len[(uchar) *start_pos]);
2639           }
2640 	  break;
2641 	case FIELD_SKIP_ENDSPACE:
2642 	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2643 	  length= (ulong) (end_pos - pos);
2644 	  if (count->pack_type & PACK_TYPE_SELECTED)
2645 	  {
2646 	    if (length > count->min_space)
2647 	    {
2648               DBUG_PRINT("fields",
2649                          ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2650               DBUG_PRINT("fields",
2651                          ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2652                           length, field_length, count->length_bits));
2653 	      write_bits(1,1);
2654 	      write_bits(length,count->length_bits);
2655 	    }
2656 	    else
2657 	    {
2658               DBUG_PRINT("fields",
2659                          ("FIELD_SKIP_ENDSPACE not more than min_space, "
2660                           "bits:  1"));
2661 	      write_bits(0,1);
2662 	      pos=end_pos;
2663 	    }
2664 	  }
2665 	  else
2666           {
2667             DBUG_PRINT("fields",
2668                        ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2669                         length, field_length, count->length_bits));
2670 	    write_bits(length,count->length_bits);
2671           }
2672           /* Encode all significant bytes. */
2673           DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2674                                 (ulong) (pos - start_pos)));
2675 	  for ( ; start_pos < pos ; start_pos++)
2676           {
2677             DBUG_PRINT("fields",
2678                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2679                         (uchar) *start_pos,
2680                         hexdigits(tree->code[(uchar) *start_pos]),
2681                         (uint) tree->code_len[(uchar) *start_pos],
2682                         bindigits(tree->code[(uchar) *start_pos],
2683                                   (uint) tree->code_len[(uchar) *start_pos])));
2684 	    write_bits(tree->code[(uchar) *start_pos],
2685 		       (uint) tree->code_len[(uchar) *start_pos]);
2686           }
2687 	  start_pos=end_pos;
2688 	  break;
2689 	case FIELD_SKIP_PRESPACE:
2690 	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2691           length= (ulong) (pos - start_pos);
2692 	  if (count->pack_type & PACK_TYPE_SELECTED)
2693 	  {
2694 	    if (length > count->min_space)
2695 	    {
2696               DBUG_PRINT("fields",
2697                          ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2698               DBUG_PRINT("fields",
2699                          ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2700                           length, field_length, count->length_bits));
2701 	      write_bits(1,1);
2702 	      write_bits(length,count->length_bits);
2703 	    }
2704 	    else
2705 	    {
2706               DBUG_PRINT("fields",
2707                          ("FIELD_SKIP_PRESPACE not more than min_space, "
2708                           "bits:  1"));
2709 	      pos=start_pos;
2710 	      write_bits(0,1);
2711 	    }
2712 	  }
2713 	  else
2714           {
2715             DBUG_PRINT("fields",
2716                        ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2717                         length, field_length, count->length_bits));
2718 	    write_bits(length,count->length_bits);
2719           }
2720           /* Encode all significant bytes. */
2721           DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2722                                 (ulong) (end_pos - start_pos)));
2723 	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2724           {
2725             DBUG_PRINT("fields",
2726                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2727                         (uchar) *start_pos,
2728                         hexdigits(tree->code[(uchar) *start_pos]),
2729                         (uint) tree->code_len[(uchar) *start_pos],
2730                         bindigits(tree->code[(uchar) *start_pos],
2731                                   (uint) tree->code_len[(uchar) *start_pos])));
2732 	    write_bits(tree->code[(uchar) *start_pos],
2733 		       (uint) tree->code_len[(uchar) *start_pos]);
2734           }
2735 	  break;
2736 	case FIELD_CONSTANT:
2737 	case FIELD_ZERO:
2738 	case FIELD_CHECK:
2739           DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2740 	  start_pos=end_pos;
2741 	  break;
2742 	case FIELD_INTERVALL:
2743 	  global_count=count;
2744 	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2745 				  count->int_tree.custom_arg);
2746 	  intervall=(uint) (pos - count->tree_buff)/field_length;
2747           DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2748           DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2749                                 intervall, hexdigits(tree->code[intervall]),
2750                                 (uint) tree->code_len[intervall]));
2751 	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2752 	  start_pos=end_pos;
2753 	  break;
2754 	case FIELD_BLOB:
2755 	{
2756 	  ulong blob_length= _ma_calc_blob_length(field_length-
2757 						 portable_sizeof_char_ptr,
2758 						 start_pos);
2759           /* Empty blobs are encoded with a single 1 bit. */
2760 	  if (!blob_length)
2761 	  {
2762             DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2763             write_bits(1,1);
2764 	  }
2765 	  else
2766 	  {
2767 	    uchar *blob,*blob_end;
2768             DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2769 	    write_bits(0,1);
2770             /* Write the blob length. */
2771             DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2772                                   blob_length, count->length_bits));
2773 	    write_bits(blob_length,count->length_bits);
2774 	    memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
2775 	    blob_end=blob+blob_length;
2776             /* Encode the blob bytes. */
2777 	    for ( ; blob < blob_end ; blob++)
2778             {
2779               DBUG_PRINT("fields",
2780                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2781                           (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2782                           (uint) tree->code_len[(uchar) *blob],
2783                           bindigits(tree->code[(uchar) *start_pos],
2784                                     (uint)tree->code_len[(uchar) *start_pos])));
2785 	      write_bits(tree->code[(uchar) *blob],
2786 			 (uint) tree->code_len[(uchar) *blob]);
2787             }
2788 	    tot_blob_length+=blob_length;
2789 	  }
2790 	  start_pos= end_pos;
2791 	  break;
2792 	}
2793 	case FIELD_VARCHAR:
2794 	{
2795           uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2796 	  ulong col_length= (var_pack_length == 1 ?
2797                              (uint) *(uchar*) start_pos :
2798                              uint2korr(start_pos));
2799           /* Empty varchar are encoded with a single 1 bit. */
2800 	  if (!col_length)
2801 	  {
2802             DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2803 	    write_bits(1,1);			/* Empty varchar */
2804 	  }
2805 	  else
2806 	  {
2807 	    uchar *end= start_pos + var_pack_length + col_length;
2808             DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2809 	    write_bits(0,1);
2810             /* Write the varchar length. */
2811             DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2812                                   col_length, count->length_bits));
2813 	    write_bits(col_length,count->length_bits);
2814             /* Encode the varchar bytes. */
2815 	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2816             {
2817               DBUG_PRINT("fields",
2818                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2819                           (uchar) *start_pos,
2820                           hexdigits(tree->code[(uchar) *start_pos]),
2821                           (uint) tree->code_len[(uchar) *start_pos],
2822                           bindigits(tree->code[(uchar) *start_pos],
2823                                     (uint)tree->code_len[(uchar) *start_pos])));
2824 	      write_bits(tree->code[(uchar) *start_pos],
2825 			 (uint) tree->code_len[(uchar) *start_pos]);
2826             }
2827 	  }
2828 	  start_pos= end_pos;
2829 	  break;
2830 	}
2831 	case FIELD_LAST:
2832         case FIELD_enum_val_count:
2833 	  abort();				/* Impossible */
2834 	}
2835 	start_pos+=count->max_zero_fill;
2836         DBUG_PRINT("fields", ("---"));
2837       }
2838       flush_bits();
2839       length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
2840       pack_length= _ma_save_pack_length(pack_version, record_pos, length);
2841       if (pack_blob_length)
2842 	pack_length+= _ma_save_pack_length(pack_version,
2843                                            record_pos + pack_length,
2844                                            tot_blob_length);
2845       DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2846                             "length-bytes: %lu", (ulong) record_count, length,
2847                             tot_blob_length, pack_length));
2848       DBUG_PRINT("fields", ("==="));
2849 
2850       /* Correct file buffer if the header was smaller */
2851       if (pack_length != max_pack_length)
2852       {
2853 	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2854 	file_buffer.pos-= (max_pack_length-pack_length);
2855       }
2856       if (length < (ulong) min_record_length)
2857 	min_record_length=(uint) length;
2858       if (length > (ulong) max_record_length)
2859 	max_record_length=(uint) length;
2860       record_count++;
2861       if (write_loop && record_count % WRITE_COUNT == 0)
2862       {
2863 	printf("%lu\r", (ulong) record_count);
2864         fflush(stdout);
2865       }
2866     }
2867     else if (error != HA_ERR_RECORD_DELETED)
2868       break;
2869   }
2870   if (error == HA_ERR_END_OF_FILE)
2871     error=0;
2872   else
2873   {
2874     fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
2875   }
2876   if (verbose >= 2)
2877     printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2878 
2879   my_safe_afree(record, isam_file->s->base.reclength);
2880   mrg->ref_length=max_pack_length;
2881   mrg->min_pack_length=max_record_length ? min_record_length : 0;
2882   mrg->max_pack_length=max_record_length;
2883   DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2884 }
2885 
2886 
make_new_name(char * new_name,char * old_name)2887 static char *make_new_name(char *new_name, char *old_name)
2888 {
2889   return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2890 }
2891 
make_old_name(char * new_name,char * old_name)2892 static char *make_old_name(char *new_name, char *old_name)
2893 {
2894   return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2895 }
2896 
2897 	/* rutines for bit writing buffer */
2898 
init_file_buffer(File file,pbool read_buffer)2899 static void init_file_buffer(File file, pbool read_buffer)
2900 {
2901   file_buffer.file=file;
2902   file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2903 					 MYF(MY_WME));
2904   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2905   file_buffer.pos_in_file=0;
2906   error_on_write=0;
2907   if (read_buffer)
2908   {
2909 
2910     file_buffer.pos=file_buffer.end;
2911     file_buffer.bits=0;
2912   }
2913   else
2914   {
2915     file_buffer.pos=file_buffer.buffer;
2916     file_buffer.bits=BITS_SAVED;
2917   }
2918   file_buffer.bitbucket= 0;
2919 }
2920 
2921 
flush_buffer(ulong neaded_length)2922 static int flush_buffer(ulong neaded_length)
2923 {
2924   ulong length;
2925 
2926   /*
2927     file_buffer.end is 8 bytes lower than the real end of the buffer.
2928     This is done so that the end-of-buffer condition does not need to be
2929     checked for every uchar (see write_bits()). Consequently,
2930     file_buffer.pos can become greater than file_buffer.end. The
2931     algorithms in the other functions ensure that there will never be
2932     more than 8 bytes written to the buffer without an end-of-buffer
2933     check. So the buffer cannot be overrun. But we need to check for the
2934     near-to-buffer-end condition to avoid a negative result, which is
2935     casted to unsigned and thus becomes giant.
2936   */
2937   if ((file_buffer.pos < file_buffer.end) &&
2938       ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2939     return 0;
2940   length=(ulong) (file_buffer.pos-file_buffer.buffer);
2941   file_buffer.pos=file_buffer.buffer;
2942   file_buffer.pos_in_file+=length;
2943   if (test_only)
2944     return 0;
2945   if (error_on_write|| my_write(file_buffer.file,
2946 				(const uchar*) file_buffer.buffer,
2947 				length,
2948 				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2949   {
2950     error_on_write=1;
2951     return 1;
2952   }
2953 
2954   if (neaded_length != ~(ulong) 0 &&
2955       (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2956   {
2957     uchar *tmp;
2958     neaded_length+=256;				/* some margin */
2959     tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
2960     if (!tmp)
2961       return 1;
2962     file_buffer.pos=    (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
2963     file_buffer.buffer= tmp;
2964     file_buffer.end=    (tmp+neaded_length-8);
2965   }
2966   return 0;
2967 }
2968 
2969 
end_file_buffer(void)2970 static void end_file_buffer(void)
2971 {
2972   my_free(file_buffer.buffer);
2973 }
2974 
2975 	/* output `bits` low bits of `value' */
2976 
write_bits(register ulonglong value,register uint bits)2977 static void write_bits(register ulonglong value, register uint bits)
2978 {
2979   DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2980               (bits == 8 * sizeof(value)));
2981 
2982   if ((file_buffer.bits-= (int) bits) >= 0)
2983   {
2984     file_buffer.bitbucket|= value << file_buffer.bits;
2985   }
2986   else
2987   {
2988     reg3 ulonglong bit_buffer;
2989     bits= (uint) -file_buffer.bits;
2990     bit_buffer= (file_buffer.bitbucket |
2991                  ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2992 #if BITS_SAVED == 64
2993     *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2994     *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2995     *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2996     *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2997 #endif
2998     *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2999     *file_buffer.pos++= (uchar) (bit_buffer >> 16);
3000     *file_buffer.pos++= (uchar) (bit_buffer >> 8);
3001     *file_buffer.pos++= (uchar) (bit_buffer);
3002 
3003     if (bits != 8 * sizeof(value))
3004       value&= (((ulonglong) 1) << bits) - 1;
3005     if (file_buffer.pos >= file_buffer.end)
3006       flush_buffer(~ (ulong) 0);
3007     file_buffer.bits=(int) (BITS_SAVED - bits);
3008     file_buffer.bitbucket= value << (BITS_SAVED - bits);
3009   }
3010   return;
3011 }
3012 
3013 	/* Flush bits in bit_buffer to buffer */
3014 
flush_bits(void)3015 static void flush_bits(void)
3016 {
3017   int bits;
3018   ulonglong bit_buffer;
3019 
3020   bits= file_buffer.bits & ~7;
3021   bit_buffer= file_buffer.bitbucket >> bits;
3022   bits= BITS_SAVED - bits;
3023   while (bits > 0)
3024   {
3025     bits-= 8;
3026     *file_buffer.pos++= (uchar) (bit_buffer >> bits);
3027   }
3028   if (file_buffer.pos >= file_buffer.end)
3029     flush_buffer(~ (ulong) 0);
3030   file_buffer.bits= BITS_SAVED;
3031   file_buffer.bitbucket= 0;
3032 }
3033 
3034 
3035 /****************************************************************************
3036 ** functions to handle the joined files
3037 ****************************************************************************/
3038 
save_state(MARIA_HA * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3039 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
3040                       my_off_t new_length,
3041 		      ha_checksum crc)
3042 {
3043   MARIA_SHARE *share=isam_file->s;
3044   uint options=mi_uint2korr(share->state.header.options);
3045   uint key;
3046   DBUG_ENTER("save_state");
3047 
3048   options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
3049   mi_int2store(share->state.header.options,options);
3050   /* Save the original file type of we have to undo the packing later */
3051   share->state.header.org_data_file_type= share->state.header.data_file_type;
3052   share->state.header.data_file_type= COMPRESSED_RECORD;
3053 
3054   share->state.state.data_file_length=new_length;
3055   share->state.state.del=0;
3056   share->state.state.empty=0;
3057   share->state.dellink= HA_OFFSET_ERROR;
3058   share->state.split=(ha_rows) mrg->records;
3059   share->state.version=(ulong) time((time_t*) 0);
3060   if (share->base.born_transactional)
3061     share->state.create_rename_lsn= share->state.is_of_horizon=
3062       share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
3063   if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
3064   {
3065     /*
3066       Some indexes are disabled, cannot use current key_file_length value
3067       as an estimate of upper bound of index file size. Use packed data file
3068       size instead.
3069     */
3070     share->state.state.key_file_length= new_length;
3071   }
3072   /*
3073     If there are no disabled indexes, keep key_file_length value from
3074     original file so "aria_chk -rq" can use this value (this is necessary
3075     because index size cannot be easily calculated for fulltext keys)
3076   */
3077   maria_clear_all_keys_active(share->state.key_map);
3078   for (key=0 ; key < share->base.keys ; key++)
3079     share->state.key_root[key]= HA_OFFSET_ERROR;
3080   share->state.key_del= HA_OFFSET_ERROR;
3081   share->state.state.checksum= crc;     /* Save crc in file */
3082   share->changed=1;			/* Force write of header */
3083   share->state.open_count=0;
3084   share->global_changed=0;
3085   my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
3086   if (share->base.keys)
3087     isamchk_neaded=1;
3088   DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
3089                                        &share->state,
3090                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3091                                        MA_STATE_INFO_WRITE_FULL_INFO));
3092 }
3093 
3094 
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3095 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3096 			  ha_checksum crc)
3097 {
3098   MARIA_STATE_INFO state;
3099   MARIA_HA *isam_file=mrg->file[0];
3100   uint options;
3101   DBUG_ENTER("save_state_mrg");
3102 
3103   state= isam_file->s->state;
3104   options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3105 	    HA_OPTION_READ_ONLY_DATA);
3106   mi_int2store(state.header.options,options);
3107   /* Save the original file type of we have to undo the packing later */
3108   state.header.org_data_file_type= state.header.data_file_type;
3109   state.header.data_file_type= COMPRESSED_RECORD;
3110 
3111   state.state.data_file_length=new_length;
3112   state.state.del=0;
3113   state.state.empty=0;
3114   state.state.records=state.split=(ha_rows) mrg->records;
3115   state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
3116     LSN_NEEDS_NEW_STATE_LSNS;
3117 
3118   /* See comment above in save_state about key_file_length handling. */
3119   if (mrg->src_file_has_indexes_disabled)
3120   {
3121     isam_file->s->state.state.key_file_length=
3122       MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3123   }
3124   state.dellink= HA_OFFSET_ERROR;
3125   state.version=(ulong) time((time_t*) 0);
3126   maria_clear_all_keys_active(state.key_map);
3127   state.state.checksum=crc;
3128   if (isam_file->s->base.keys)
3129     isamchk_neaded=1;
3130   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3131   DBUG_RETURN (_ma_state_info_write_sub(file, &state,
3132                                         MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3133                                         MA_STATE_INFO_WRITE_FULL_INFO));
3134 }
3135 
3136 
3137 /* reset for mrg_rrnd */
3138 
mrg_reset(PACK_MRG_INFO * mrg)3139 static void mrg_reset(PACK_MRG_INFO *mrg)
3140 {
3141   if (mrg->current)
3142   {
3143     maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3144     mrg->current=0;
3145   }
3146 }
3147 
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3148 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3149 {
3150   int error;
3151   MARIA_HA *isam_info;
3152 
3153   if (!info->current)
3154   {
3155     isam_info= *(info->current=info->file);
3156     info->end=info->current+info->count;
3157     maria_reset(isam_info);
3158     maria_extra(isam_info, HA_EXTRA_CACHE, 0);
3159     if ((error= maria_scan_init(isam_info)))
3160       return(error);
3161   }
3162   else
3163     isam_info= *info->current;
3164 
3165   for (;;)
3166   {
3167     if (!(error= maria_scan(isam_info, buf)) ||
3168 	error != HA_ERR_END_OF_FILE)
3169       return (error);
3170     maria_scan_end(isam_info);
3171     maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3172     if (info->current+1 == info->end)
3173       return(HA_ERR_END_OF_FILE);
3174     info->current++;
3175     isam_info= *info->current;
3176     maria_reset(isam_info);
3177     maria_extra(isam_info,HA_EXTRA_CACHE, 0);
3178     if ((error= maria_scan_init(isam_info)))
3179       return(error);
3180   }
3181 }
3182 
3183 
mrg_close(PACK_MRG_INFO * mrg)3184 static int mrg_close(PACK_MRG_INFO *mrg)
3185 {
3186   uint i;
3187   int error=0;
3188   DBUG_ENTER("mrg_close");
3189 
3190   for (i=0 ; i < mrg->count ; i++)
3191     error|=maria_close(mrg->file[i]);
3192   if (mrg->free_file)
3193     my_free(mrg->file);
3194   DBUG_RETURN(error);
3195 }
3196 
3197 
3198 #if !defined(DBUG_OFF)
3199 /*
3200   Fake the counts to get big Huffman codes.
3201 
3202   SYNOPSIS
3203     fakebigcodes()
3204     huff_counts                 A pointer to the counts array.
3205     end_count                   A pointer past the counts array.
3206 
3207   DESCRIPTION
3208 
3209     Huffman coding works by removing the two least frequent values from
3210     the list of values and add a new value with the sum of their
3211     incidences in a loop until only one value is left. Every time a
3212     value is reused for a new value, it gets one more bit for its
3213     encoding. Hence, the least frequent values get the longest codes.
3214 
3215     To get a maximum code length for a value, two of the values must
3216     have an incidence of 1. As their sum is 2, the next infrequent value
3217     must have at least an incidence of 2, then 4, 8, 16 and so on. This
3218     means that one needs 2**n bytes (values) for a code length of n
3219     bits. However, using more distinct values forces the use of longer
3220     codes, or reaching the code length with less total bytes (values).
3221 
3222     To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3223     I assign counts of 1 to the two most frequent values, a count of 2
3224     for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3225     the remaining values get 1. That way every possible uchar has an
3226     assigned code, though not all codes are used if not all uchar values
3227     are present in the column.
3228 
3229     This strategy would work with distinct column values too, but
3230     requires that at least 64(32) values are present. To make things
3231     easier here, I cancel all distinct column values and force byte
3232     compression for all columns.
3233 
3234   RETURN
3235     void
3236 */
3237 
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3238 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3239 {
3240   HUFF_COUNTS   *count;
3241   my_off_t      *cur_count_p;
3242   my_off_t      *end_count_p;
3243   my_off_t      **cur_sort_p;
3244   my_off_t      **end_sort_p;
3245   my_off_t      *sort_counts[256];
3246   my_off_t      total;
3247   DBUG_ENTER("fakebigcodes");
3248 
3249   for (count= huff_counts; count < end_count; count++)
3250   {
3251     /*
3252       Remove distinct column values.
3253     */
3254     if (huff_counts->tree_buff)
3255     {
3256       my_free(huff_counts->tree_buff);
3257       delete_tree(&huff_counts->int_tree, 0);
3258       huff_counts->tree_buff= NULL;
3259       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3260     }
3261 
3262     /*
3263       Sort counts by decreasing incidence.
3264     */
3265     cur_count_p= count->counts;
3266     end_count_p= cur_count_p + 256;
3267     cur_sort_p= sort_counts;
3268     while (cur_count_p < end_count_p)
3269       *(cur_sort_p++)= cur_count_p++;
3270     (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3271 
3272     /*
3273       Assign faked counts.
3274     */
3275     cur_sort_p= sort_counts;
3276 #if SIZEOF_LONG_LONG > 4
3277     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3278 #else
3279     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3280 #endif
3281     /* Most frequent value gets a faked count of 1. */
3282     **(cur_sort_p++)= 1;
3283     total= 1;
3284     while (cur_sort_p < end_sort_p)
3285     {
3286       **(cur_sort_p++)= total;
3287       total<<= 1;
3288     }
3289     /* Set the last value. */
3290     **(cur_sort_p++)= --total;
3291     /*
3292       Set the remaining counts.
3293     */
3294     end_sort_p= sort_counts + 256;
3295     while (cur_sort_p < end_sort_p)
3296       **(cur_sort_p++)= 1;
3297   }
3298   DBUG_VOID_RETURN;
3299 }
3300 
3301 
3302 /*
3303   Compare two counts for reverse sorting.
3304 
3305   SYNOPSIS
3306     fakecmp()
3307     count1              One count.
3308     count2              Another count.
3309 
3310   RETURN
3311     1                   count1  < count2
3312     0                   count1 == count2
3313     -1                  count1 >  count2
3314 */
3315 
fakecmp(my_off_t ** count1,my_off_t ** count2)3316 static int fakecmp(my_off_t **count1, my_off_t **count2)
3317 {
3318   return ((**count1 < **count2) ? 1 :
3319           (**count1 > **count2) ? -1 : 0);
3320 }
3321 #endif
3322 
3323 #include "ma_check_standalone.h"
3324 
3325