1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /* Pack MARIA file */
17 
18 #ifndef USE_MY_FUNC
19 #define USE_MY_FUNC			/* We need at least my_malloc */
20 #endif
21 
22 #include "maria_def.h"
23 #include "trnman_public.h"
24 #include "trnman.h"
25 #include <queues.h>
26 #include <my_tree.h>
27 #include "mysys_err.h"
28 #ifdef MSDOS
29 #include <io.h>
30 #endif
31 #ifndef __GNU_LIBRARY__
32 #define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
33 #endif
34 #include <my_getopt.h>
35 #include <my_handler_errors.h>
36 
37 #if SIZEOF_LONG_LONG > 4
38 #define BITS_SAVED 64
39 #else
40 #define BITS_SAVED 32
41 #endif
42 #ifndef MAX_INTERNAL_TRID
43 #define MAX_INTERNAL_TRID  0xffffffffffffLL
44 #endif
45 
46 #define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
47 #define HEAD_LENGTH	32
48 #define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
49 
50 #define DATA_TMP_EXT		".TMD"
51 #define OLD_EXT			".OLD"
52 #define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
53 
54 struct st_file_buffer {
55   File file;
56   uchar *buffer,*pos,*end;
57   my_off_t pos_in_file;
58   int bits;
59   ulonglong bitbucket;
60 };
61 
62 struct st_huff_tree;
63 struct st_huff_element;
64 
65 typedef struct st_huff_counts {
66   uint	field_length,max_zero_fill;
67   uint	pack_type;
68   uint	max_end_space,max_pre_space,length_bits,min_space;
69   ulong max_length;
70   enum en_fieldtype field_type;
71   struct st_huff_tree *tree;		/* Tree for field */
72   my_off_t counts[256];
73   my_off_t end_space[8];
74   my_off_t pre_space[8];
75   my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
76   TREE int_tree;        /* Tree for detecting distinct column values. */
77   uchar *tree_buff;      /* Column values, 'field_length' each. */
78   uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
79 } HUFF_COUNTS;
80 
81 typedef struct st_huff_element HUFF_ELEMENT;
82 
83 /*
84   WARNING: It is crucial for the optimizations in calc_packed_length()
85   that 'count' is the first element of 'HUFF_ELEMENT'.
86 */
87 struct st_huff_element {
88   my_off_t count;
89   union un_element {
90     struct st_nod {
91       HUFF_ELEMENT *left,*right;
92     } nod;
93     struct st_leaf {
94       HUFF_ELEMENT *null;
95       uint	element_nr;		/* Number of element */
96     } leaf;
97   } a;
98 };
99 
100 
101 typedef struct st_huff_tree {
102   HUFF_ELEMENT *root,*element_buffer;
103   HUFF_COUNTS *counts;
104   uint tree_number;
105   uint elements;
106   my_off_t bytes_packed;
107   uint tree_pack_length;
108   uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
109   ulonglong *code;
110   uchar *code_len;
111 } HUFF_TREE;
112 
113 
114 typedef struct st_isam_mrg {
115   MARIA_HA **file,**current,**end;
116   uint free_file;
117   uint count;
118   uint	min_pack_length;		/* Theese is used by packed data */
119   uint	max_pack_length;
120   uint	ref_length;
121   uint	max_blob_length;
122   my_off_t records;
123   /* true if at least one source file has at least one disabled index */
124   my_bool src_file_has_indexes_disabled;
125 } PACK_MRG_INFO;
126 
127 
128 extern int main(int argc,char * *argv);
129 static void get_options(int *argc,char ***argv);
130 static MARIA_HA *open_maria_file(char *name,int mode);
131 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
132 static int compress(PACK_MRG_INFO *file,char *join_name);
133 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
134 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
135 					   uint trees,
136 					   HUFF_COUNTS *huff_counts,
137 					   uint fields);
138 static int compare_tree(void* cmp_arg __attribute__((unused)),
139 			const uchar *s,const uchar *t);
140 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
141 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
142 			 my_off_t records);
143 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
144 			       uint max_space_length,my_off_t *space_counts,
145 			       my_off_t tot_space_count,
146 			       enum en_fieldtype field_type);
147 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
148 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
149 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
150 static int save_counts_in_queue(uchar *key,element_count count,
151 				    HUFF_TREE *tree);
152 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
153 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
154 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
155 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
156 				    HUFF_ELEMENT *element,uint size,
157 				    ulonglong code);
158 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
159 			my_off_t tot_elements,my_off_t filelength);
160 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
161 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
162 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
163 				       HUFF_ELEMENT *element,
164 				       uint *offset);
165 static uint max_bit(uint value);
166 static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
167 static char *make_new_name(char *new_name,char *old_name);
168 static char *make_old_name(char *new_name,char *old_name);
169 static void init_file_buffer(File file,pbool read_buffer);
170 static int flush_buffer(ulong neaded_length);
171 static void end_file_buffer(void);
172 static void write_bits(ulonglong value, uint bits);
173 static void flush_bits(void);
174 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
175                       my_off_t new_length, ha_checksum crc);
176 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
177                           my_off_t new_length, ha_checksum crc);
178 static int mrg_close(PACK_MRG_INFO *mrg);
179 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
180 static void mrg_reset(PACK_MRG_INFO *mrg);
181 #if !defined(DBUG_OFF)
182 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
183 static int fakecmp(my_off_t **count1, my_off_t **count2);
184 #endif
185 
186 /*
187   tree_buff_length is somewhat arbitrary. The bigger it is the better
188   the chance to win in terms of compression factor. On the other hand,
189   this table becomes part of the compressed file header. And its length
190   is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191 */
192 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193 
194 static int error_on_write=0,test_only=0,verbose=0,silent=0,
195 	   write_loop=0,force_pack=0, isamchk_neaded=0;
196 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
197 static my_bool backup, opt_wait;
198 static my_bool opt_ignore_control_file, opt_require_control_file;
199 static char tmp_dir[FN_REFLEN]={0},*join_table;
200 static my_off_t intervall_length;
201 static ha_checksum glob_crc;
202 static struct st_file_buffer file_buffer;
203 static QUEUE queue;
204 static HUFF_COUNTS *global_count;
205 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
206 static const char *load_default_groups[]= { "ariapack",0 };
207 static char **default_argv;
208 
209 /*
210   Register handler error messages for usage with my_error()
211 
212   NOTES
213     This is safe to call multiple times as my_error_register()
214     will ignore calls to register already registered error numbers.
215 */
216 
get_handler_error_messages(int e)217 static const char **get_handler_error_messages(int e __attribute__((unused)))
218 {
219   return handler_error_messages;
220 }
221 
222 
223 /* The main program */
224 
main(int argc,char ** argv)225 int main(int argc, char **argv)
226 {
227   int error,ok;
228   PACK_MRG_INFO merge;
229   my_bool no_control_file= 0;
230   MY_INIT(argv[0]);
231 
232   maria_data_root= ".";
233   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
234   default_argv= argv;
235   get_options(&argc,&argv);
236   my_error_register(get_handler_error_messages, HA_ERR_FIRST,
237                     HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
238 
239   if (!opt_ignore_control_file &&
240       (no_control_file= ma_control_file_open(FALSE,
241                                              (opt_require_control_file ||
242                                               !silent), FALSE)) &&
243        opt_require_control_file)
244   {
245     error= 1;
246     goto end;
247   }
248   maria_init();
249   if (no_control_file || force_pack)
250   {
251     /* Assume that all rows exists */
252     trnman_init(MAX_INTERNAL_TRID-16);
253   }
254 
255   error=ok=isamchk_neaded=0;
256   if (join_table)
257   {						/* Join files into one */
258     if (open_maria_files(&merge,argv,(uint) argc) ||
259 	compress(&merge,join_table))
260       error=1;
261   }
262   else while (argc--)
263   {
264     MARIA_HA *isam_file;
265     if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
266       error=1;
267     else
268     {
269       merge.file= &isam_file;
270       merge.current=0;
271       merge.free_file=0;
272       merge.count=1;
273       if (compress(&merge,0))
274 	error=1;
275       else
276 	ok=1;
277     }
278   }
279   if (ok && isamchk_neaded && !silent)
280     puts("Remember to run aria_chk -rq on compressed tables");
281 
282 end:
283   fflush(stdout);
284   fflush(stderr);
285   free_defaults(default_argv);
286   my_error_unregister(HA_ERR_FIRST,
287                       HA_ERR_FIRST+ array_elements(handler_error_messages)-1);
288   maria_end();
289   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
290   exit(error ? 2 : 0);
291 #ifndef _lint
292   return 0;					/* No compiler warning */
293 #endif
294 }
295 
my_exit(int error)296 static void my_exit(int error)
297 {
298   free_defaults(default_argv);
299   maria_end();
300   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
301   exit(error);
302 }
303 
304 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
305 
306 static struct my_option my_long_options[] =
307 {
308 #ifdef __NETWARE__
309   {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
310    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
311 #endif
312   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
313    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
314   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
315    "Directory where character sets are.", (char**) &charsets_dir,
316    (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
317   {"datadir", 'h',
318    "Path for control file (and logs if --logdir not used).",
319    (char**) &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
320    0, 0, 0, 0, 0, 0},
321   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
322    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
323   {"force", 'f',
324    "Force packing of table even if it gets bigger or if tempfile exists.",
325    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
326   { "ignore-control-file", 0,
327     "Ignore the control file",
328     (uchar**)&opt_ignore_control_file, 0, 0, GET_BOOL, NO_ARG,
329     0, 0, 0, 0, 0, 0},
330   {"join", 'j',
331    "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
332    &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
333    0, 0, 0},
334   {"help", '?', "Display this help and exit.",
335    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
336   { "require-control-file", 0,
337     "Abort if cannot find control file",
338     (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
339     0, 0, 0, 0, 0, 0},
340   {"silent", 's', "Be more silent.",
341    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
342   {"tmpdir", 'T', "Use temporary directory to store temporary table.",
343    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
344   {"test", 't', "Don't pack table, only test packing it.",
345    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
346   {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
347    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
348   {"version", 'V', "Output version information and exit.",
349    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
350   {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
351    &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
352   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
353 };
354 
355 
print_version(void)356 static void print_version(void)
357 {
358   printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
359 }
360 
361 
usage(void)362 static void usage(void)
363 {
364   print_version();
365   puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
366   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
367   puts("and you are welcome to modify and redistribute it under the GPL license\n");
368 
369   puts("Pack a Aria-table to take much less space.");
370   puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
371   puts("afterwards to update the keys.");
372   puts("You should give the .MAI file as the filename argument.");
373   puts("To unpack a packed table, run aria_chk -u on the table");
374 
375   printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
376   my_print_help(my_long_options);
377   print_defaults("my", load_default_groups);
378   my_print_variables(my_long_options);
379 }
380 
381 
382 static my_bool
get_one_option(const struct my_option * opt,const char * argument,const char * filename)383 get_one_option(const struct my_option *opt,
384 	       const char *argument,
385                const char *filename __attribute__((unused)))
386 {
387   uint length;
388 
389   switch(opt->id) {
390 #ifdef __NETWARE__
391   case OPT_AUTO_CLOSE:
392     setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
393     break;
394 #endif
395   case 'f':
396     force_pack= 1;
397     tmpfile_createflag= O_RDWR | O_TRUNC;
398     break;
399   case 's':
400     write_loop= verbose= 0;
401     silent= 1;
402     break;
403   case 't':
404     test_only= 1;
405     /* Avoid to reset 'verbose' if it was already set > 1. */
406     if (! verbose)
407       verbose= 1;
408     break;
409   case 'T':
410     length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
411     if (length != dirname_length(tmp_dir))
412     {
413       tmp_dir[length]=FN_LIBCHAR;
414       tmp_dir[length+1]=0;
415     }
416     break;
417   case 'v':
418     verbose++; /* Allow for selecting the level of verbosity. */
419     silent= 0;
420     break;
421   case '#':
422     DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
423     break;
424   case 'V':
425     print_version();
426     my_exit(0);
427     break;
428   case 'I':
429   case '?':
430     usage();
431     my_exit(0);
432   }
433   return 0;
434 }
435 
436 	/* reads options */
437 	/* Initiates DEBUG - but no debugging here ! */
438 
get_options(int * argc,char *** argv)439 static void get_options(int *argc,char ***argv)
440 {
441   int ho_error;
442 
443   my_progname= argv[0][0];
444   if (isatty(fileno(stdout)))
445     write_loop=1;
446 
447   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
448     my_exit(ho_error);
449 
450   if (!*argc)
451   {
452     usage();
453     my_exit(1);
454   }
455   if (join_table)
456   {
457     backup=0;					/* Not needed */
458     tmp_dir[0]=0;
459   }
460   return;
461 }
462 
463 
print_error(int error,const char * filename)464 static void print_error(int error, const char *filename)
465 {
466   switch (error) {
467   case HA_ERR_CRASHED:
468     fprintf(stderr, "'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
469     break;
470   case HA_ERR_NOT_A_TABLE:
471     fprintf(stderr, "'%s' is not a Aria table",filename);
472     break;
473   case HA_ERR_CRASHED_ON_USAGE:
474     fprintf(stderr, "'%s' is marked as crashed",filename);
475     break;
476   case HA_ERR_CRASHED_ON_REPAIR:
477     fprintf(stderr, "'%s' is marked as crashed after last repair",filename);
478     break;
479   case HA_ERR_OLD_FILE:
480     fprintf(stderr, "'%s' has transactions newer than registered in control file. If this is ok, please re-run with --ignore-control-file", filename);
481     break;
482   case HA_ERR_NEW_FILE:
483     fprintf(stderr, "'%s' uses new features not supported by this version of the Aria library", filename);
484     break;
485   case HA_ERR_END_OF_FILE:
486     fprintf(stderr, "Couldn't read complete header from '%s'", filename);
487     break;
488   case EAGAIN:
489     fprintf(stderr, "'%s' is locked. Use -w to wait until unlocked",filename);
490     break;
491   case ENOENT:
492     fprintf(stderr, "File '%s' doesn't exist",filename);
493     break;
494   case EACCES:
495     fprintf(stderr, "You don't have permission to use '%s'", filename);
496     break;
497   default:
498     fprintf(stderr, "%d when opening Aria table '%s'", error, filename);
499     break;
500   }
501   fputc('\n',stderr);
502 }
503 
504 
open_maria_file(char * name,int mode)505 static MARIA_HA *open_maria_file(char *name,int mode)
506 {
507   MARIA_HA *isam_file;
508   MARIA_SHARE *share;
509   DBUG_ENTER("open_maria_file");
510 
511   if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
512                              (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
513                               HA_OPEN_ABORT_IF_LOCKED), 0)))
514   {
515     print_error(my_errno, name);
516     DBUG_RETURN(0);
517   }
518   share=isam_file->s;
519   if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
520   {
521     if (!force_pack)
522     {
523       fprintf(stderr, "%s is already compressed\n", name);
524       maria_close(isam_file);
525       DBUG_RETURN(0);
526     }
527     if (verbose)
528       puts("Recompressing already compressed table");
529     share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
530   }
531   if (! force_pack && share->state.state.records != 0 &&
532       (share->state.state.records <= 1 ||
533        share->state.state.data_file_length < 1024))
534   {
535     fprintf(stderr, "%s is too small to compress\n", name);
536     maria_close(isam_file);
537     DBUG_RETURN(0);
538   }
539   maria_lock_database(isam_file,F_WRLCK);
540   maria_ignore_trids(isam_file);
541   DBUG_RETURN(isam_file);
542 }
543 
544 
open_maria_files(PACK_MRG_INFO * mrg,char ** names,uint count)545 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
546 {
547   uint i,j;
548   mrg->count=0;
549   mrg->current=0;
550   mrg->file=(MARIA_HA**) my_malloc(PSI_NOT_INSTRUMENTED, sizeof(MARIA_HA*)*count,MYF(MY_FAE));
551   mrg->free_file=1;
552   mrg->src_file_has_indexes_disabled= 0;
553   for (i=0; i < count ; i++)
554   {
555     if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
556       goto error;
557 
558     mrg->src_file_has_indexes_disabled|=
559       ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
560                               mrg->file[i]->s->base.keys);
561   }
562   /* Check that files are identical */
563   for (j=0 ; j < count-1 ; j++)
564   {
565     MARIA_COLUMNDEF *m1,*m2,*end;
566     if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
567 	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
568       goto diff_file;
569     m1=mrg->file[j]->s->columndef;
570     end=m1+mrg->file[j]->s->base.fields;
571     m2=mrg->file[j+1]->s->columndef;
572     for ( ; m1 != end ; m1++,m2++)
573     {
574       if (m1->type != m2->type || m1->length != m2->length)
575 	goto diff_file;
576     }
577   }
578   mrg->count=count;
579   return 0;
580 
581  diff_file:
582   fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
583                my_progname, names[j], names[j+1]);
584  error:
585   while (i--)
586     maria_close(mrg->file[i]);
587   my_free(mrg->file);
588   return 1;
589 }
590 
591 
compress(PACK_MRG_INFO * mrg,char * result_table)592 static int compress(PACK_MRG_INFO *mrg,char *result_table)
593 {
594   int error;
595   File new_file,join_maria_file;
596   MARIA_HA *isam_file;
597   MARIA_SHARE *share;
598   char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
599   uint i,header_length,fields,trees,used_trees;
600   my_off_t old_length,new_length,tot_elements;
601   HUFF_COUNTS *huff_counts;
602   HUFF_TREE *huff_trees;
603   DBUG_ENTER("compress");
604 
605   isam_file=mrg->file[0];			/* Take this as an example */
606   share=isam_file->s;
607   new_file=join_maria_file= -1;
608   trees=fields=0;
609   huff_trees=0;
610   huff_counts=0;
611   maria_block_size= isam_file->s->block_size;
612 
613   /* Create temporary or join file */
614   if (backup)
615     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
616   else
617     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);
618 
619   if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
620                      maria_block_size, 0, MY_WME) == 0)
621   {
622     fprintf(stderr, "Can't initialize page cache\n");
623     goto err;
624   }
625 
626   if (!test_only && result_table)
627   {
628     /* Make a new indexfile based on first file in list */
629     uint length;
630     uchar *buff;
631     strmov(org_name,result_table);		/* Fix error messages */
632     fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
633     if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
634 	< 0)
635       goto err;
636     length=(uint) share->base.keystart;
637     if (!(buff= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED, length, MYF(MY_WME))))
638       goto err;
639     if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
640 	my_write(join_maria_file,buff,length,
641 		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
642     {
643       my_free(buff);
644       goto err;
645     }
646     my_free(buff);
647     fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
648   }
649   else if (!tmp_dir[0])
650     make_new_name(new_name,org_name);
651   else
652     fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
653   if (!test_only &&
654       (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
655     goto err;
656 
657   /* Start calculating statistics */
658 
659   mrg->records=0;
660   for (i=0 ; i < mrg->count ; i++)
661     mrg->records+=mrg->file[i]->s->state.state.records;
662 
663   DBUG_PRINT("info", ("Compressing %s: (%lu records)",
664                       result_table ? new_name : org_name,
665                       (ulong) mrg->records));
666   if (write_loop || verbose)
667   {
668     printf("Compressing %s: (%lu records)\n",
669                 result_table ? new_name : org_name, (ulong) mrg->records);
670   }
671   trees=fields=share->base.fields;
672   huff_counts=init_huff_count(isam_file,mrg->records);
673 
674   /*
675     Read the whole data file(s) for statistics.
676   */
677   DBUG_PRINT("info", ("- Calculating statistics"));
678   if (write_loop || verbose)
679     printf("- Calculating statistics\n");
680   if (get_statistic(mrg,huff_counts))
681     goto err;
682 
683   old_length=0;
684   for (i=0; i < mrg->count ; i++)
685     old_length+= (mrg->file[i]->s->state.state.data_file_length -
686 		  mrg->file[i]->s->state.state.empty);
687 
688   /*
689     Create a global priority queue in preparation for making
690     temporary Huffman trees.
691   */
692   if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
693     goto err;
694 
695   /*
696     Check each column if we should use pre-space-compress, end-space-
697     compress, empty-field-compress or zero-field-compress.
698   */
699   check_counts(huff_counts,fields,mrg->records);
700 
701   /*
702     Build a Huffman tree for each column.
703   */
704   huff_trees=make_huff_trees(huff_counts,trees);
705 
706   /*
707     If the packed lengths of combined columns is less then the sum of
708     the non-combined columns, then create common Huffman trees for them.
709     We do this only for uchar compressed columns, not for distinct values
710     compressed columns.
711   */
712   if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
713     goto err;
714 
715   /*
716     Assign codes to all uchar or column values.
717   */
718   if (make_huff_decode_table(huff_trees,fields))
719     goto err;
720 
721   /* Prepare a file buffer. */
722   init_file_buffer(new_file,0);
723 
724   /*
725     Reserve space in the target file for the fixed compressed file header.
726   */
727   file_buffer.pos_in_file=HEAD_LENGTH;
728   if (! test_only)
729     my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
730 
731   /*
732     Write field infos: field type, pack type, length bits, tree number.
733   */
734   write_field_info(huff_counts,fields,used_trees);
735 
736   /*
737     Write decode trees.
738   */
739   if (!(tot_elements=write_huff_tree(huff_trees,trees)))
740     goto err;
741 
742   /*
743     Calculate the total length of the compression info header.
744     This includes the fixed compressed file header, the column compression
745     type descriptions, and the decode trees.
746   */
747   header_length=(uint) file_buffer.pos_in_file+
748     (uint) (file_buffer.pos-file_buffer.buffer);
749 
750   /*
751     Compress the source file into the target file.
752   */
753   DBUG_PRINT("info", ("- Compressing file"));
754   if (write_loop || verbose)
755     printf("- Compressing file\n");
756   error=compress_maria_file(mrg,huff_counts);
757   new_length=file_buffer.pos_in_file;
758   if (!error && !test_only)
759   {
760     uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
761     bzero(buff,sizeof(buff));
762     error=my_write(file_buffer.file,buff,sizeof(buff),
763 		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
764   }
765 
766   /*
767     Write the fixed compressed file header.
768   */
769   if (!error)
770     error=write_header(mrg,header_length,used_trees,tot_elements,
771 		       new_length);
772 
773   /* Flush the file buffer. */
774   end_file_buffer();
775 
776   /* Display statistics. */
777   DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
778                       "Mean total length: %6ld",
779                       mrg->min_pack_length, mrg->max_pack_length,
780                       (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
781   if (verbose && mrg->records)
782     printf("Min record length: %6d   Max length: %6d   "
783                 "Mean total length: %6ld\n", mrg->min_pack_length,
784                 mrg->max_pack_length, (ulong) (new_length/mrg->records));
785 
786   /* Close source and target file. */
787   if (!test_only)
788   {
789     error|=my_close(new_file,MYF(MY_WME));
790     if (!result_table)
791     {
792       (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
793                                     FLUSH_RELEASE);
794       error|=my_close(isam_file->dfile.file, MYF(MY_WME));
795       isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
796       isam_file->s->bitmap.file.file= -1;
797     }
798   }
799 
800   /* Cleanup. */
801   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
802   if (! test_only && ! error)
803   {
804     if (result_table)
805     {
806       error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
807     }
808     else
809     {
810       if (backup)
811       {
812 	if (my_rename(org_name,make_old_name(temp_name,
813                                              isam_file->s->open_file_name.str),
814 		      MYF(MY_WME)))
815 	  error=1;
816 	else
817 	{
818 	  if (tmp_dir[0])
819 	    error=my_copy(new_name,org_name,MYF(MY_WME));
820 	  else
821 	    error=my_rename(new_name,org_name,MYF(MY_WME));
822 	  if (!error)
823           {
824 	    my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
825             if (tmp_dir[0])
826               my_delete(new_name,MYF(MY_WME));
827           }
828 	}
829       }
830       else
831       {
832 	if (tmp_dir[0])
833         {
834 	  error=my_copy(new_name,org_name,
835 			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
836           if (!error)
837             my_delete(new_name,MYF(MY_WME));
838         }
839 	else
840 	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
841       }
842       if (! error)
843 	error=save_state(isam_file,mrg,new_length,glob_crc);
844     }
845   }
846   error|=mrg_close(mrg);
847   if (join_maria_file >= 0)
848     error|=my_close(join_maria_file,MYF(MY_WME));
849   if (error)
850   {
851     fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
852     my_delete(new_name,MYF(MY_WME));
853     DBUG_RETURN(-1);
854   }
855   if (write_loop || verbose)
856   {
857     if (old_length)
858       printf("%.4g%%     \n",
859                   (((longlong) (old_length - new_length)) * 100.0 /
860                    (longlong) old_length));
861     else
862       puts("Empty file saved in compressed format");
863   }
864   DBUG_RETURN(0);
865 
866  err:
867   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
868   if (new_file >= 0)
869     my_close(new_file,MYF(0));
870   if (join_maria_file >= 0)
871     my_close(join_maria_file,MYF(0));
872   mrg_close(mrg);
873   end_pagecache(maria_pagecache, 1);
874   fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
875   DBUG_RETURN(-1);
876 }
877 
878 	/* Init a huff_count-struct for each field and init it */
879 
init_huff_count(MARIA_HA * info,my_off_t records)880 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
881 {
882   reg2 uint i;
883   reg1 HUFF_COUNTS *count;
884   if ((count = (HUFF_COUNTS*) my_malloc(PSI_NOT_INSTRUMENTED,
885         info->s->base.fields*sizeof(HUFF_COUNTS), MYF(MY_ZEROFILL | MY_WME))))
886   {
887     for (i=0 ; i < info->s->base.fields ; i++)
888     {
889       enum en_fieldtype type;
890       uint col_nr = info->s->columndef[i].column_nr;
891       count[col_nr].field_length=info->s->columndef[i].length;
892       type= count[col_nr].field_type=
893         (enum en_fieldtype) info->s->columndef[i].type;
894       if (type == FIELD_INTERVALL ||
895 	  type == FIELD_CONSTANT ||
896 	  type == FIELD_ZERO)
897 	type = FIELD_NORMAL;
898       if (count[col_nr].field_length <= 8 &&
899 	  (type == FIELD_NORMAL ||
900 	   type == FIELD_SKIP_ZERO))
901 	count[col_nr].max_zero_fill= count[col_nr].field_length;
902       /*
903         For every column initialize a tree, which is used to detect distinct
904         column values. 'int_tree' works together with 'tree_buff' and
905         'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
906         This is accomplished by '-1' as the element size.
907       */
908       init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
909 		NULL, MYF(0));
910       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
911 	count[col_nr].tree_pos=count[col_nr].tree_buff =
912 	  my_malloc(PSI_NOT_INSTRUMENTED,
913                     count[col_nr].field_length > 1 ? tree_buff_length : 2,
914 		    MYF(MY_WME));
915     }
916   }
917   return count;
918 }
919 
920 
921 	/* Free memory used by counts and trees */
922 
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)923 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
924 					   HUFF_COUNTS *huff_counts,
925 					   uint fields)
926 {
927   register uint i;
928 
929   if (huff_trees)
930   {
931     for (i=0 ; i < trees ; i++)
932     {
933       if (huff_trees[i].element_buffer)
934 	my_free(huff_trees[i].element_buffer);
935       if (huff_trees[i].code)
936 	my_free(huff_trees[i].code);
937     }
938     my_free(huff_trees);
939   }
940   if (huff_counts)
941   {
942     for (i=0 ; i < fields ; i++)
943     {
944       if (huff_counts[i].tree_buff)
945       {
946 	my_free(huff_counts[i].tree_buff);
947 	delete_tree(&huff_counts[i].int_tree, 0);
948       }
949     }
950     my_free(huff_counts);
951   }
952   delete_queue(&queue);		/* This is safe to free */
953   return;
954 }
955 
956 	/* Read through old file and gather some statistics */
957 
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)958 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
959 {
960   int error;
961   uint length, null_bytes;
962   ulong reclength,max_blob_length;
963   uchar *record,*pos,*next_pos,*end_pos,*start_pos;
964   ha_rows record_count;
965   HUFF_COUNTS *count,*end_count;
966   TREE_ELEMENT *element;
967   ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
968   DBUG_ENTER("get_statistic");
969 
970   reclength=  mrg->file[0]->s->base.reclength;
971   null_bytes= mrg->file[0]->s->base.null_bytes;
972   record=(uchar*) my_safe_alloca(reclength);
973   end_count=huff_counts+mrg->file[0]->s->base.fields;
974   record_count=0; glob_crc=0;
975   max_blob_length=0;
976 
977   /* Check how to calculate checksum */
978   if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
979     calc_checksum= _ma_static_checksum;
980   else
981     calc_checksum= _ma_checksum;
982 
983   mrg_reset(mrg);
984   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
985   {
986     ulong tot_blob_length=0;
987     if (! error)
988     {
989       /* glob_crc is a checksum over all bytes of all records. */
990       glob_crc+= (*calc_checksum)(mrg->file[0],record);
991 
992       /* Count the incidence of values separately for every column. */
993       for (pos=record + null_bytes, count=huff_counts ;
994 	   count < end_count ;
995 	   count++,
996 	   pos=next_pos)
997       {
998 	next_pos=end_pos=(start_pos=pos)+count->field_length;
999 
1000 	/*
1001           Put the whole column value in a tree if there is room for it.
1002           'int_tree' is used to quickly check for duplicate values.
1003           'tree_buff' collects as many distinct column values as
1004           possible. If the field length is > 1, it is tree_buff_length,
1005           else 2 bytes. Each value is 'field_length' bytes big. If there
1006           are more distinct column values than fit into the buffer, we
1007           give up with this tree. BLOBs and VARCHARs do not have a
1008           tree_buff as it can only be used with fixed length columns.
1009           For the special case of field length == 1, we handle only the
1010           case that there is only one distinct value in the table(s).
1011           Otherwise, we can have a maximum of 256 distinct values. This
1012           is then handled by the normal Huffman tree build.
1013 
1014           Another limit for collecting distinct column values is the
1015           number of values itself. Since we would need to build a
1016           Huffman tree for the values, we are limited by the 'IS_OFFSET'
1017           constant. This constant expresses a bit which is used to
1018           determine if a tree element holds a final value or an offset
1019           to a child element. Hence, all values and offsets need to be
1020           smaller than 'IS_OFFSET'. A tree element is implemented with
1021           two integer values, one for the left branch and one for the
1022           right branch. For the extreme case that the first element
1023           points to the last element, the number of integers in the tree
1024           must be less or equal to IS_OFFSET. So the number of elements
1025           must be less or equal to IS_OFFSET / 2.
1026 
1027           WARNING: At first, we insert a pointer into the record buffer
1028           as the key for the tree. If we got a new distinct value, which
1029           is really inserted into the tree, instead of being counted
1030           only, we will copy the column value from the record buffer to
1031           'tree_buff' and adjust the key pointer of the tree accordingly.
1032         */
1033 	if (count->tree_buff)
1034 	{
1035 	  global_count=count;
1036 	  if (!(element=tree_insert(&count->int_tree,pos, 0,
1037 				    count->int_tree.custom_arg)) ||
1038 	      (element->count == 1 &&
1039 	       (count->tree_buff + tree_buff_length <
1040                 count->tree_pos + count->field_length)) ||
1041               (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
1042 	      (count->field_length == 1 &&
1043 	       count->int_tree.elements_in_tree > 1))
1044 	  {
1045 	    delete_tree(&count->int_tree, 0);
1046 	    my_free(count->tree_buff);
1047 	    count->tree_buff=0;
1048 	  }
1049 	  else
1050 	  {
1051             /*
1052               If tree_insert() succeeds, it either creates a new element
1053               or increments the counter of an existing element.
1054             */
1055 	    if (element->count == 1)
1056 	    {
1057               /* Copy the new column value into 'tree_buff'. */
1058 	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
1059               /* Adjust the key pointer in the tree. */
1060 	      tree_set_pointer(element,count->tree_pos);
1061               /* Point behind the last column value so far. */
1062 	      count->tree_pos+=count->field_length;
1063 	    }
1064 	  }
1065 	}
1066 
1067 	/* Save character counters and space-counts and zero-field-counts */
1068 	if (count->field_type == FIELD_NORMAL ||
1069 	    count->field_type == FIELD_SKIP_ENDSPACE)
1070 	{
1071           /* Ignore trailing space. */
1072 	  for ( ; end_pos > pos ; end_pos--)
1073 	    if (end_pos[-1] != ' ')
1074 	      break;
1075           /* Empty fields are just counted. Go to the next record. */
1076 	  if (end_pos == pos)
1077 	  {
1078 	    count->empty_fields++;
1079 	    count->max_zero_fill=0;
1080 	    continue;
1081 	  }
1082           /*
1083             Count the total of all trailing spaces and the number of
1084             short trailing spaces. Remember the longest trailing space.
1085           */
1086 	  length= (uint) (next_pos-end_pos);
1087 	  count->tot_end_space+=length;
1088 	  if (length < 8)
1089 	    count->end_space[length]++;
1090 	  if (count->max_end_space < length)
1091 	    count->max_end_space = length;
1092 	}
1093 
1094 	if (count->field_type == FIELD_NORMAL ||
1095 	    count->field_type == FIELD_SKIP_PRESPACE)
1096 	{
1097           /* Ignore leading space. */
1098 	  for (pos=start_pos; pos < end_pos ; pos++)
1099 	    if (pos[0] != ' ')
1100 	      break;
1101           /* Empty fields are just counted. Go to the next record. */
1102 	  if (end_pos == pos)
1103 	  {
1104 	    count->empty_fields++;
1105 	    count->max_zero_fill=0;
1106 	    continue;
1107 	  }
1108           /*
1109             Count the total of all leading spaces and the number of
1110             short leading spaces. Remember the longest leading space.
1111           */
1112 	  length= (uint) (pos-start_pos);
1113 	  count->tot_pre_space+=length;
1114 	  if (length < 8)
1115 	    count->pre_space[length]++;
1116 	  if (count->max_pre_space < length)
1117 	    count->max_pre_space = length;
1118 	}
1119 
1120         /* Calculate pos, end_pos, and max_length for variable length fields. */
1121 	if (count->field_type == FIELD_BLOB)
1122 	{
1123 	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1124 	  ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1125 	  memcpy(&pos,  start_pos+field_length,sizeof(char*));
1126 	  end_pos=pos+blob_length;
1127 	  tot_blob_length+=blob_length;
1128 	  set_if_bigger(count->max_length,blob_length);
1129 	}
1130 	else if (count->field_type == FIELD_VARCHAR)
1131 	{
1132           uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1133 	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1134                    uint2korr(start_pos));
1135 	  pos= start_pos+pack_length;
1136 	  end_pos= pos+length;
1137 	  set_if_bigger(count->max_length,length);
1138 	}
1139 
1140         /* Evaluate 'max_zero_fill' for short fields. */
1141 	if (count->field_length <= 8 &&
1142 	    (count->field_type == FIELD_NORMAL ||
1143 	     count->field_type == FIELD_SKIP_ZERO))
1144 	{
1145 	  uint i;
1146           /* Zero fields are just counted. Go to the next record. */
1147 	  if (!memcmp(start_pos, zero_string, count->field_length))
1148 	  {
1149 	    count->zero_fields++;
1150 	    continue;
1151 	  }
1152           /*
1153             max_zero_fill starts with field_length. It is decreased every
1154             time a shorter "zero trailer" is found. It is set to zero when
1155             an empty field is found (see above). This suggests that the
1156             variable should be called 'min_zero_fill'.
1157           */
1158 	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1159 	       i++) ;
1160 	  if (i < count->max_zero_fill)
1161 	    count->max_zero_fill=i;
1162 	}
1163 
1164         /* Ignore zero fields and check fields. */
1165 	if (count->field_type == FIELD_ZERO ||
1166 	    count->field_type == FIELD_CHECK)
1167 	  continue;
1168 
1169         /*
1170           Count the incidence of every uchar value in the
1171           significant field value.
1172         */
1173 	for ( ; pos < end_pos ; pos++)
1174 	  count->counts[(uchar) *pos]++;
1175 
1176         /* Step to next field. */
1177       }
1178 
1179       if (tot_blob_length > max_blob_length)
1180 	max_blob_length=tot_blob_length;
1181       record_count++;
1182       if (write_loop && record_count % WRITE_COUNT == 0)
1183       {
1184 	printf("%lu\r", (ulong) record_count);
1185         fflush(stdout);
1186       }
1187     }
1188     else if (error != HA_ERR_RECORD_DELETED)
1189     {
1190       fprintf(stderr, "Got error %d while reading rows\n", error);
1191       break;
1192     }
1193 
1194     /* Step to next record. */
1195   }
1196   if (write_loop)
1197   {
1198     printf("            \r");
1199     fflush(stdout);
1200   }
1201 
1202   /*
1203     If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1204     codes.
1205   */
1206   DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1207 
1208   DBUG_PRINT("info", ("Found the following number of incidents "
1209                       "of the uchar codes:"));
1210   if (verbose >= 2)
1211     printf("Found the following number of incidents "
1212                 "of the uchar codes:\n");
1213   for (count= huff_counts ; count < end_count; count++)
1214   {
1215     uint      idx;
1216     my_off_t  total_count;
1217     char      llbuf[32];
1218 
1219     DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1220     if (verbose >= 2)
1221       printf("column: %3u\n", (uint) (count - huff_counts + 1));
1222     if (count->tree_buff)
1223     {
1224       DBUG_PRINT("info", ("number of distinct values: %u",
1225                           (uint) ((count->tree_pos - count->tree_buff) /
1226                                   count->field_length)));
1227       if (verbose >= 2)
1228         printf("number of distinct values: %u\n",
1229                     (uint) ((count->tree_pos - count->tree_buff) /
1230                             count->field_length));
1231     }
1232     total_count= 0;
1233     for (idx= 0; idx < 256; idx++)
1234     {
1235       if (count->counts[idx])
1236       {
1237         total_count+= count->counts[idx];
1238         DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1239                             llstr((longlong) count->counts[idx], llbuf)));
1240         if (verbose >= 2)
1241           printf("counts[0x%02x]: %12s\n", idx,
1242                       llstr((longlong) count->counts[idx], llbuf));
1243       }
1244     }
1245     DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1246                                                     llbuf)));
1247     if ((verbose >= 2) && total_count)
1248     {
1249       printf("total:        %12s\n",
1250                   llstr((longlong) total_count, llbuf));
1251     }
1252   }
1253 
1254   mrg->records=record_count;
1255   mrg->max_blob_length=max_blob_length;
1256   my_safe_afree(record, reclength);
1257   DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1258 }
1259 
compare_huff_elements(void * not_used,uchar * a,uchar * b)1260 static int compare_huff_elements(void *not_used __attribute__((unused)),
1261 				 uchar *a, uchar *b)
1262 {
1263   return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1264     (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1265 }
1266 
1267 	/* Check each tree if we should use pre-space-compress, end-space-
1268 	   compress, empty-field-compress or zero-field-compress */
1269 
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1270 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1271 			 my_off_t records)
1272 {
1273   uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1274   my_off_t old_length,new_length,length;
1275   DBUG_ENTER("check_counts");
1276 
1277   bzero((uchar*) field_count,sizeof(field_count));
1278   space_fields=fill_zero_fields=0;
1279 
1280   for (; trees-- ; huff_counts++)
1281   {
1282     if (huff_counts->field_type == FIELD_BLOB)
1283     {
1284       huff_counts->length_bits=max_bit(huff_counts->max_length);
1285       goto found_pack;
1286     }
1287     else if (huff_counts->field_type == FIELD_VARCHAR)
1288     {
1289       huff_counts->length_bits=max_bit(huff_counts->max_length);
1290       goto found_pack;
1291     }
1292     else if (huff_counts->field_type == FIELD_CHECK)
1293     {
1294       huff_counts->bytes_packed=0;
1295       huff_counts->counts[0]=0;
1296       goto found_pack;
1297     }
1298 
1299     huff_counts->field_type=FIELD_NORMAL;
1300     huff_counts->pack_type=0;
1301 
1302     /* Check for zero-filled records (in this column), or zero records. */
1303     if (huff_counts->zero_fields || ! records)
1304     {
1305       my_off_t old_space_count;
1306       /*
1307         If there are only zero filled records (in this column),
1308         or no records at all, we are done.
1309       */
1310       if (huff_counts->zero_fields == records)
1311       {
1312 	huff_counts->field_type= FIELD_ZERO;
1313 	huff_counts->bytes_packed=0;
1314 	huff_counts->counts[0]=0;
1315 	goto found_pack;
1316       }
1317       /* Remeber the number of significant spaces. */
1318       old_space_count=huff_counts->counts[' '];
1319       /* Add all leading and trailing spaces. */
1320       huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1321                                   huff_counts->tot_pre_space +
1322                                   huff_counts->empty_fields *
1323                                   huff_counts->field_length);
1324       /* Check, what the compressed length of this would be. */
1325       old_length=calc_packed_length(huff_counts,0)+records/8;
1326       /* Get the number of zero bytes. */
1327       length=huff_counts->zero_fields*huff_counts->field_length;
1328       /* Add it to the counts. */
1329       huff_counts->counts[0]+=length;
1330       /* Check, what the compressed length of this would be. */
1331       new_length=calc_packed_length(huff_counts,0);
1332       /* If the compression without the zeroes would be shorter, we are done. */
1333       if (old_length < new_length && huff_counts->field_length > 1)
1334       {
1335 	huff_counts->field_type=FIELD_SKIP_ZERO;
1336 	huff_counts->counts[0]-=length;
1337 	huff_counts->bytes_packed=old_length- records/8;
1338 	goto found_pack;
1339       }
1340       /* Remove the insignificant spaces, but keep the zeroes. */
1341       huff_counts->counts[' ']=old_space_count;
1342     }
1343     /* Check, what the compressed length of this column would be. */
1344     huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1345 
1346     /*
1347       If there are enough empty records (in this column),
1348       treating them specially may pay off.
1349     */
1350     if (huff_counts->empty_fields)
1351     {
1352       if (huff_counts->field_length > 2 &&
1353 	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1354 	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1355 			 huff_counts->max_end_space))) <
1356 	  records * max_bit(huff_counts->field_length))
1357       {
1358 	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1359       }
1360       else
1361       {
1362 	length=huff_counts->empty_fields*huff_counts->field_length;
1363 	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1364 	{
1365 	  huff_counts->tot_end_space+=length;
1366 	  huff_counts->max_end_space=huff_counts->field_length;
1367 	  if (huff_counts->field_length < 8)
1368 	    huff_counts->end_space[huff_counts->field_length]+=
1369 	      huff_counts->empty_fields;
1370 	}
1371 	if (huff_counts->tot_pre_space)
1372 	{
1373 	  huff_counts->tot_pre_space+=length;
1374 	  huff_counts->max_pre_space=huff_counts->field_length;
1375 	  if (huff_counts->field_length < 8)
1376 	    huff_counts->pre_space[huff_counts->field_length]+=
1377 	      huff_counts->empty_fields;
1378 	}
1379       }
1380     }
1381 
1382     /*
1383       If there are enough trailing spaces (in this column),
1384       treating them specially may pay off.
1385     */
1386     if (huff_counts->tot_end_space)
1387     {
1388       huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1389       if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1390 			      huff_counts->end_space,
1391 			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1392 	goto found_pack;
1393       huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1394     }
1395 
1396     /*
1397       If there are enough leading spaces (in this column),
1398       treating them specially may pay off.
1399     */
1400     if (huff_counts->tot_pre_space)
1401     {
1402       if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1403 			      huff_counts->pre_space,
1404 			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1405 	goto found_pack;
1406     }
1407 
1408   found_pack:			/* Found field-packing */
1409 
1410     /* Test if we can use zero-fill */
1411 
1412     if (huff_counts->max_zero_fill &&
1413 	(huff_counts->field_type == FIELD_NORMAL ||
1414 	 huff_counts->field_type == FIELD_SKIP_ZERO))
1415     {
1416       huff_counts->counts[0]-=huff_counts->max_zero_fill*
1417 	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1418 	 records - huff_counts->zero_fields : records);
1419       huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1420       huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1421     }
1422 
1423     /* Test if intervall-field is better */
1424 
1425     if (huff_counts->tree_buff)
1426     {
1427       HUFF_TREE tree;
1428 
1429       DBUG_EXECUTE_IF("forceintervall",
1430                       huff_counts->bytes_packed= ~ (my_off_t) 0;);
1431       tree.element_buffer=0;
1432       if (!make_huff_tree(&tree,huff_counts) &&
1433 	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1434       {
1435 	if (tree.elements == 1)
1436 	  huff_counts->field_type=FIELD_CONSTANT;
1437 	else
1438 	  huff_counts->field_type=FIELD_INTERVALL;
1439 	huff_counts->pack_type=0;
1440       }
1441       else
1442       {
1443 	my_free(huff_counts->tree_buff);
1444 	delete_tree(&huff_counts->int_tree, 0);
1445 	huff_counts->tree_buff=0;
1446       }
1447       if (tree.element_buffer)
1448 	my_free(tree.element_buffer);
1449     }
1450     if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1451       space_fields++;
1452     if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1453       fill_zero_fields++;
1454     field_count[huff_counts->field_type]++;
1455   }
1456   DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1457                       "empty-zero:       %3d  empty-fill: %3d",
1458                       field_count[FIELD_NORMAL],space_fields,
1459                       field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1460   DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1461                       "intervall-fields: %3d  zero:       %3d",
1462                       field_count[FIELD_SKIP_PRESPACE],
1463                       field_count[FIELD_SKIP_ENDSPACE],
1464                       field_count[FIELD_INTERVALL],
1465                       field_count[FIELD_ZERO]));
1466   if (verbose)
1467     printf("\nnormal:    %3d  empty-space:     %3d  "
1468                 "empty-zero:       %3d  empty-fill: %3d\n"
1469                 "pre-space: %3d  end-space:       %3d  "
1470                 "intervall-fields: %3d  zero:       %3d\n",
1471                 field_count[FIELD_NORMAL],space_fields,
1472                 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1473                 field_count[FIELD_SKIP_PRESPACE],
1474                 field_count[FIELD_SKIP_ENDSPACE],
1475                 field_count[FIELD_INTERVALL],
1476                 field_count[FIELD_ZERO]);
1477   DBUG_VOID_RETURN;
1478 }
1479 
1480 
1481 /* Test if we can use space-compression and empty-field-compression */
1482 
1483 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1484 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1485 		    uint max_space_length, my_off_t *space_counts,
1486 		    my_off_t tot_space_count, enum en_fieldtype field_type)
1487 {
1488   int min_pos;
1489   uint length_bits,i;
1490   my_off_t space_count,min_space_count,min_pack,new_length,skip;
1491 
1492   length_bits=max_bit(max_space_length);
1493 
1494 		/* Default no end_space-packing */
1495   space_count=huff_counts->counts[(uint) ' '];
1496   min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1497   min_pack=calc_packed_length(huff_counts,0);
1498   min_pos= -2;
1499   huff_counts->counts[(uint) ' ']=space_count;
1500 
1501 	/* Test with allways space-count */
1502   new_length=huff_counts->bytes_packed+length_bits*records/8;
1503   if (new_length+1 < min_pack)
1504   {
1505     min_pos= -1;
1506     min_pack=new_length;
1507     min_space_count=space_count;
1508   }
1509 	/* Test with length-flag */
1510   for (skip=0L, i=0 ; i < 8 ; i++)
1511   {
1512     if (space_counts[i])
1513     {
1514       if (i)
1515 	huff_counts->counts[(uint) ' ']+=space_counts[i];
1516       skip+=huff_counts->pre_space[i];
1517       new_length=calc_packed_length(huff_counts,0)+
1518 	(records+(records-skip)*(1+length_bits))/8;
1519       if (new_length < min_pack)
1520       {
1521 	min_pos=(int) i;
1522 	min_pack=new_length;
1523 	min_space_count=huff_counts->counts[(uint) ' '];
1524       }
1525     }
1526   }
1527 
1528   huff_counts->counts[(uint) ' ']=min_space_count;
1529   huff_counts->bytes_packed=min_pack;
1530   switch (min_pos) {
1531   case -2:
1532     return(0);				/* No space-compress */
1533   case -1:				/* Always space-count */
1534     huff_counts->field_type=field_type;
1535     huff_counts->min_space=0;
1536     huff_counts->length_bits=max_bit(max_space_length);
1537     break;
1538   default:
1539     huff_counts->field_type=field_type;
1540     huff_counts->min_space=(uint) min_pos;
1541     huff_counts->pack_type|=PACK_TYPE_SELECTED;
1542     huff_counts->length_bits=max_bit(max_space_length);
1543     break;
1544   }
1545   return(1);				/* Using space-compress */
1546 }
1547 
1548 
1549 	/* Make a huff_tree of each huff_count */
1550 
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1551 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1552 {
1553   uint tree;
1554   HUFF_TREE *huff_tree;
1555   DBUG_ENTER("make_huff_trees");
1556 
1557   if (!(huff_tree=(HUFF_TREE*) my_malloc(PSI_NOT_INSTRUMENTED,
1558                          trees*sizeof(HUFF_TREE), MYF(MY_WME | MY_ZEROFILL))))
1559     DBUG_RETURN(0);
1560 
1561   for (tree=0 ; tree < trees ; tree++)
1562   {
1563     if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1564     {
1565       while (tree--)
1566 	my_free(huff_tree[tree].element_buffer);
1567       my_free(huff_tree);
1568       DBUG_RETURN(0);
1569     }
1570   }
1571   DBUG_RETURN(huff_tree);
1572 }
1573 
1574 /*
1575   Build a Huffman tree.
1576 
1577   SYNOPSIS
1578     make_huff_tree()
1579     huff_tree                   The Huffman tree.
1580     huff_counts                 The counts.
1581 
1582   DESCRIPTION
1583     Build a Huffman tree according to huff_counts->counts or
1584     huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1585     tree_buff_length of distinct column values. In that case, whole
1586     values can be Huffman encoded instead of single bytes.
1587 
1588   RETURN
1589     0           OK
1590     != 0        Error
1591 */
1592 
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1593 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1594 {
1595   uint i,found,bits_packed,first,last;
1596   my_off_t bytes_packed;
1597   HUFF_ELEMENT *a,*b,*new_huff_el;
1598 
1599   first=last=0;
1600   if (huff_counts->tree_buff)
1601   {
1602     /* Calculate the number of distinct values in tree_buff. */
1603     found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1604       huff_counts->field_length;
1605     first=0; last=found-1;
1606   }
1607   else
1608   {
1609     /* Count the number of uchar codes found in the column. */
1610     for (i=found=0 ; i < 256 ; i++)
1611     {
1612       if (huff_counts->counts[i])
1613       {
1614 	if (! found++)
1615 	  first=i;
1616 	last=i;
1617       }
1618     }
1619     if (found < 2)
1620       found=2;
1621   }
1622 
1623   /* When using 'tree_buff' we can have more that 256 values. */
1624   if (queue.max_elements < found)
1625   {
1626     delete_queue(&queue);
1627     if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
1628       return -1;
1629   }
1630 
1631   /* Allocate or reallocate an element buffer for the Huffman tree. */
1632   if (!huff_tree->element_buffer)
1633   {
1634     if (!(huff_tree->element_buffer=
1635 	 (HUFF_ELEMENT*) my_malloc(PSI_NOT_INSTRUMENTED,
1636                                    found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1637       return 1;
1638   }
1639   else
1640   {
1641     HUFF_ELEMENT *temp;
1642     if (!(temp= (HUFF_ELEMENT*) my_realloc(PSI_NOT_INSTRUMENTED,
1643            (uchar*) huff_tree->element_buffer, found*2*sizeof(HUFF_ELEMENT), MYF(MY_WME))))
1644       return 1;
1645     huff_tree->element_buffer=temp;
1646   }
1647 
1648   huff_counts->tree=huff_tree;
1649   huff_tree->counts=huff_counts;
1650   huff_tree->min_chr=first;
1651   huff_tree->max_chr=last;
1652   huff_tree->char_bits=max_bit(last-first);
1653   huff_tree->offset_bits=max_bit(found-1)+1;
1654 
1655   if (huff_counts->tree_buff)
1656   {
1657     huff_tree->elements=0;
1658     huff_tree->tree_pack_length=(1+15+16+5+5+
1659 				 (huff_tree->char_bits+1)*found+
1660 				 (huff_tree->offset_bits+1)*
1661 				 (found-2)+7)/8 +
1662 				   (uint) (huff_tree->counts->tree_pos-
1663 					   huff_tree->counts->tree_buff);
1664     /*
1665       Put a HUFF_ELEMENT into the queue for every distinct column value.
1666 
1667       tree_walk() calls save_counts_in_queue() for every element in
1668       'int_tree'. This takes elements from the target trees element
1669       buffer and places references to them into the buffer of the
1670       priority queue. We insert in column value order, but the order is
1671       in fact irrelevant here. We will establish the correct order
1672       later.
1673     */
1674     tree_walk(&huff_counts->int_tree,
1675 	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1676 	      (uchar*) huff_tree, left_root_right);
1677   }
1678   else
1679   {
1680     huff_tree->elements=found;
1681     huff_tree->tree_pack_length=(9+9+5+5+
1682 				 (huff_tree->char_bits+1)*found+
1683 				 (huff_tree->offset_bits+1)*
1684 				 (found-2)+7)/8;
1685     /*
1686       Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
1687 
1688       The elements are taken from the target trees element buffer.
1689       Instead of using queue_insert(), we just place references to the
1690       elements into the buffer of the priority queue. We insert in byte
1691       value order, but the order is in fact irrelevant here. We will
1692       establish the correct order later.
1693     */
1694     for (i=first, found=0 ; i <= last ; i++)
1695     {
1696       if (huff_counts->counts[i])
1697       {
1698 	new_huff_el=huff_tree->element_buffer+(found++);
1699 	new_huff_el->count=huff_counts->counts[i];
1700 	new_huff_el->a.leaf.null=0;
1701 	new_huff_el->a.leaf.element_nr=i;
1702 	queue.root[found]=(uchar*) new_huff_el;
1703       }
1704     }
1705     /*
1706       If there is only a single uchar value in this field in all records,
1707       add a second element with zero incidence. This is required to enter
1708       the loop, which builds the Huffman tree.
1709     */
1710     while (found < 2)
1711     {
1712       new_huff_el=huff_tree->element_buffer+(found++);
1713       new_huff_el->count=0;
1714       new_huff_el->a.leaf.null=0;
1715       if (last)
1716 	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1717       else
1718 	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1719       queue.root[found]=(uchar*) new_huff_el;
1720     }
1721   }
1722 
1723   /* Make a queue from the queue buffer. */
1724   queue.elements=found;
1725 
1726   /*
1727     Make a priority queue from the queue. Construct its index so that we
1728     have a partially ordered tree.
1729   */
1730   queue_fix(&queue);
1731 
1732   /* The Huffman algorithm. */
1733   bytes_packed=0; bits_packed=0;
1734   for (i=1 ; i < found ; i++)
1735   {
1736     /*
1737       Pop the top element from the queue (the one with the least incidence).
1738       Popping from a priority queue includes a re-ordering of the queue,
1739       to get the next least incidence element to the top.
1740     */
1741     a=(HUFF_ELEMENT*) queue_remove_top(&queue);
1742     /* Copy the next least incidence element */
1743     b=(HUFF_ELEMENT*) queue_top(&queue);
1744     /* Get a new element from the element buffer. */
1745     new_huff_el=huff_tree->element_buffer+found+i;
1746     /* The new element gets the sum of the two least incidence elements. */
1747     new_huff_el->count=a->count+b->count;
1748     /*
1749       The Huffman algorithm assigns another bit to the code for a byte
1750       every time that bytes incidence is combined (directly or indirectly)
1751       to a new element as one of the two least incidence elements.
1752       This means that one more bit per incidence of that uchar is required
1753       in the resulting file. So we add the new combined incidence as the
1754       number of bits by which the result grows.
1755     */
1756     bits_packed+=(uint) (new_huff_el->count & 7);
1757     bytes_packed+=new_huff_el->count/8;
1758     /* The new element points to its children, lesser in left.  */
1759     new_huff_el->a.nod.left=a;
1760     new_huff_el->a.nod.right=b;
1761     /*
1762       Replace the copied top element by the new element and re-order the
1763       queue.
1764     */
1765     queue_top(&queue)= (uchar*) new_huff_el;
1766     queue_replace_top(&queue);
1767   }
1768   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1769   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1770   return 0;
1771 }
1772 
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1773 static int compare_tree(void* cmp_arg __attribute__((unused)),
1774 			register const uchar *s, register const uchar *t)
1775 {
1776   uint length;
1777   for (length=global_count->field_length; length-- ;)
1778     if (*s++ != *t++)
1779       return (int) s[-1] - (int) t[-1];
1780   return 0;
1781 }
1782 
1783 /*
1784   Organize distinct column values and their incidences into a priority queue.
1785 
1786   SYNOPSIS
1787     save_counts_in_queue()
1788     key                         The column value.
1789     count                       The incidence of this value.
1790     tree                        The Huffman tree to be built later.
1791 
1792   DESCRIPTION
1793     We use the element buffer of the targeted tree. The distinct column
1794     values are organized in a priority queue first. The Huffman
1795     algorithm will later organize the elements into a Huffman tree. For
1796     the time being, we just place references to the elements into the
1797     queue buffer. The buffer will later be organized into a priority
1798     queue.
1799 
1800   RETURN
1801     0
1802  */
1803 
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1804 static int save_counts_in_queue(uchar *key, element_count count,
1805 				HUFF_TREE *tree)
1806 {
1807   HUFF_ELEMENT *new_huff_el;
1808 
1809   new_huff_el=tree->element_buffer+(tree->elements++);
1810   new_huff_el->count=count;
1811   new_huff_el->a.leaf.null=0;
1812   new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1813     tree->counts->field_length;
1814   queue.root[tree->elements]=(uchar*) new_huff_el;
1815   return 0;
1816 }
1817 
1818 
1819 /*
1820   Calculate length of file if given counts should be used.
1821 
1822   SYNOPSIS
1823     calc_packed_length()
1824     huff_counts                 The counts for a column of the table(s).
1825     add_tree_lenght             If the decode tree length should be added.
1826 
1827   DESCRIPTION
1828     We need to follow the Huffman algorithm until we know, how many bits
1829     are required for each uchar code. But we do not need the resulting
1830     Huffman tree. Hence, we can leave out some steps which are essential
1831     in make_huff_tree().
1832 
1833   RETURN
1834     Number of bytes required to compress this table column.
1835 */
1836 
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1837 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1838 				   uint add_tree_lenght)
1839 {
1840   uint i,found,bits_packed,first,last;
1841   my_off_t bytes_packed;
1842   HUFF_ELEMENT element_buffer[256];
1843   DBUG_ENTER("calc_packed_length");
1844 
1845   /*
1846     WARNING: We use a small hack for efficiency: Instead of placing
1847     references to HUFF_ELEMENTs into the queue, we just insert
1848     references to the counts of the uchar codes which appeared in this
1849     table column. During the Huffman algorithm they are successively
1850     replaced by references to HUFF_ELEMENTs. This works, because
1851     HUFF_ELEMENTs have the incidence count at their beginning.
1852     Regardless, wether the queue array contains references to counts of
1853     type my_off_t or references to HUFF_ELEMENTs which have the count of
1854     type my_off_t at their beginning, it always points to a count of the
1855     same type.
1856 
1857     Instead of using queue_insert(), we just copy the references into
1858     the buffer of the priority queue. We insert in uchar value order, but
1859     the order is in fact irrelevant here. We will establish the correct
1860     order later.
1861   */
1862   first=last=0;
1863   for (i=found=0 ; i < 256 ; i++)
1864   {
1865     if (huff_counts->counts[i])
1866     {
1867       if (! found++)
1868 	first=i;
1869       last=i;
1870       /* We start with root[1], which is the queues top element. */
1871       queue.root[found]=(uchar*) &huff_counts->counts[i];
1872     }
1873   }
1874   if (!found)
1875     DBUG_RETURN(0);			/* Empty tree */
1876   /*
1877     If there is only a single uchar value in this field in all records,
1878     add a second element with zero incidence. This is required to enter
1879     the loop, which follows the Huffman algorithm.
1880   */
1881   if (found < 2)
1882     queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1883 
1884   /* Make a queue from the queue buffer. */
1885   queue.elements=found;
1886 
1887   bytes_packed=0; bits_packed=0;
1888   /* Add the length of the coding table, which would become part of the file. */
1889   if (add_tree_lenght)
1890     bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1891 		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1892 
1893   /*
1894     Make a priority queue from the queue. Construct its index so that we
1895     have a partially ordered tree.
1896   */
1897   queue_fix(&queue);
1898 
1899   /* The Huffman algorithm. */
1900   for (i=0 ; i < found-1 ; i++)
1901   {
1902     my_off_t        *a;
1903     my_off_t        *b;
1904     HUFF_ELEMENT    *new_huff_el;
1905 
1906     /*
1907       Pop the top element from the queue (the one with the least
1908       incidence). Popping from a priority queue includes a re-ordering
1909       of the queue, to get the next least incidence element to the top.
1910     */
1911     a= (my_off_t*) queue_remove_top(&queue);
1912     /* Copy the next least incidence element. */
1913     b= (my_off_t*) queue_top(&queue);
1914     /* Create a new element in a local (automatic) buffer. */
1915     new_huff_el= element_buffer + i;
1916     /* The new element gets the sum of the two least incidence elements. */
1917     new_huff_el->count= *a + *b;
1918     /*
1919       The Huffman algorithm assigns another bit to the code for a byte
1920       every time that bytes incidence is combined (directly or indirectly)
1921       to a new element as one of the two least incidence elements.
1922       This means that one more bit per incidence of that uchar is required
1923       in the resulting file. So we add the new combined incidence as the
1924       number of bits by which the result grows.
1925     */
1926     bits_packed+=(uint) (new_huff_el->count & 7);
1927     bytes_packed+=new_huff_el->count/8;
1928     /*
1929       Replace the copied top element by the new element and re-order the
1930       queue. This successively replaces the references to counts by
1931       references to HUFF_ELEMENTs.
1932     */
1933     queue_top(&queue)= (uchar*) new_huff_el;
1934     queue_replace_top(&queue);
1935   }
1936   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1937 }
1938 
1939 
1940 	/* Remove trees that don't give any compression */
1941 
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1942 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1943 {
1944   uint k,tree_number;
1945   HUFF_COUNTS count,*i,*j,*last_count;
1946 
1947   last_count=huff_counts+trees;
1948   for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1949   {
1950     if (!i->tree->tree_number)
1951     {
1952       i->tree->tree_number= ++tree_number;
1953       if (i->tree_buff)
1954 	continue;			/* Don't join intervall */
1955       for (j=i+1 ; j < last_count ; j++)
1956       {
1957 	if (! j->tree->tree_number && ! j->tree_buff)
1958 	{
1959 	  for (k=0 ; k < 256 ; k++)
1960 	    count.counts[k]=i->counts[k]+j->counts[k];
1961 	  if (calc_packed_length(&count,1) <=
1962 	      i->tree->bytes_packed + j->tree->bytes_packed+
1963 	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1964 	      ALLOWED_JOIN_DIFF)
1965 	  {
1966 	    memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1967 	    my_free(j->tree->element_buffer);
1968 	    j->tree->element_buffer=0;
1969 	    j->tree=i->tree;
1970 	    bmove((uchar*) i->counts,(uchar*) count.counts,
1971 		  sizeof(count.counts[0])*256);
1972 	    if (make_huff_tree(i->tree,i))
1973 	      return (uint) -1;
1974 	  }
1975 	}
1976       }
1977     }
1978   }
1979   DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1980                       trees, tree_number));
1981   if (verbose)
1982     printf("Original trees:  %d  After join: %d\n", trees, tree_number);
1983   return tree_number;			/* Return trees left */
1984 }
1985 
1986 
1987 /*
1988   Fill in huff_tree encode tables.
1989 
1990   SYNOPSIS
1991     make_huff_decode_table()
1992     huff_tree               An array of HUFF_TREE which are to be encoded.
1993     trees                   The number of HUFF_TREE in the array.
1994 
1995   RETURN
1996     0           success
1997     != 0        error
1998 */
1999 
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)2000 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
2001 {
2002   uint elements;
2003   for ( ; trees-- ; huff_tree++)
2004   {
2005     if (huff_tree->tree_number > 0)
2006     {
2007       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2008       if (!(huff_tree->code =
2009             (ulonglong*) my_malloc(PSI_NOT_INSTRUMENTED,
2010                                 elements* (sizeof(ulonglong) + sizeof(uchar)),
2011                                    MYF(MY_WME | MY_ZEROFILL))))
2012 	return 1;
2013       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
2014       make_traverse_code_tree(huff_tree, huff_tree->root,
2015                               8 * sizeof(ulonglong), 0);
2016     }
2017   }
2018   return 0;
2019 }
2020 
2021 
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)2022 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
2023 				    HUFF_ELEMENT *element,
2024 				    uint size, ulonglong code)
2025 {
2026   uint chr;
2027   if (!element->a.leaf.null)
2028   {
2029     chr=element->a.leaf.element_nr;
2030     huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
2031     huff_tree->code[chr]= (code >> size);
2032     if (huff_tree->height < 8 * sizeof(ulonglong) - size)
2033         huff_tree->height= 8 * sizeof(ulonglong) - size;
2034   }
2035   else
2036   {
2037     size--;
2038     make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
2039     make_traverse_code_tree(huff_tree, element->a.nod.right, size,
2040 			    code + (((ulonglong) 1) << size));
2041   }
2042   return;
2043 }
2044 
2045 
2046 /*
2047   Convert a value into binary digits.
2048 
2049   SYNOPSIS
2050     bindigits()
2051     value                       The value.
2052     length                      The number of low order bits to convert.
2053 
2054   NOTE
2055     The result string is in static storage. It is reused on every call.
2056     So you cannot use it twice in one expression.
2057 
2058   RETURN
2059     A pointer to a static NUL-terminated string.
2060  */
2061 
bindigits(ulonglong value,uint bits)2062 static char *bindigits(ulonglong value, uint bits)
2063 {
2064   static char digits[72];
2065   char *ptr= digits;
2066   uint idx= bits;
2067 
2068   DBUG_ASSERT(idx < sizeof(digits));
2069   while (idx)
2070     *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
2071   *ptr= '\0';
2072   return digits;
2073 }
2074 
2075 
2076 /*
2077   Convert a value into hexadecimal digits.
2078 
2079   SYNOPSIS
2080     hexdigits()
2081     value                       The value.
2082 
2083   NOTE
2084     The result string is in static storage. It is reused on every call.
2085     So you cannot use it twice in one expression.
2086 
2087   RETURN
2088     A pointer to a static NUL-terminated string.
2089  */
2090 
hexdigits(ulonglong value)2091 static char *hexdigits(ulonglong value)
2092 {
2093   static char digits[20];
2094   char *ptr= digits;
2095   uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2096 
2097   DBUG_ASSERT(idx < sizeof(digits));
2098   while (idx)
2099   {
2100     if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2101       *(ptr - 1)+= 'a' - '9' - 1;
2102   }
2103   *ptr= '\0';
2104   return digits;
2105 }
2106 
2107 
2108 	/* Write header to new packed data file */
2109 
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2110 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2111 			my_off_t tot_elements,my_off_t filelength)
2112 {
2113   uchar *buff= (uchar*) file_buffer.pos;
2114 
2115   bzero(buff,HEAD_LENGTH);
2116   memcpy(buff,maria_pack_file_magic,4);
2117   int4store(buff+4,head_length);
2118   int4store(buff+8, mrg->min_pack_length);
2119   int4store(buff+12,mrg->max_pack_length);
2120   int4store(buff+16,tot_elements);
2121   int4store(buff+20,intervall_length);
2122   int2store(buff+24,trees);
2123   buff[26]=(char) mrg->ref_length;
2124 	/* Save record pointer length */
2125   buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
2126   if (test_only)
2127     return 0;
2128   my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2129   return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2130 		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2131 }
2132 
2133 	/* Write fieldinfo to new packed file */
2134 
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2135 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2136 {
2137   reg1 uint i;
2138   uint huff_tree_bits;
2139   huff_tree_bits=max_bit(trees ? trees-1 : 0);
2140 
2141   DBUG_PRINT("info", (" "));
2142   DBUG_PRINT("info", ("column types:"));
2143   DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2144   DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2145   DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2146   DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2147   DBUG_PRINT("info", ("FIELD_BLOB            4"));
2148   DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2149   DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2150   DBUG_PRINT("info", ("FIELD_ZERO            7"));
2151   DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2152   DBUG_PRINT("info", ("FIELD_CHECK           9"));
2153   DBUG_PRINT("info", (" "));
2154   DBUG_PRINT("info", ("pack type as a set of flags:"));
2155   DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2156   DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2157   DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2158   DBUG_PRINT("info", (" "));
2159   if (verbose >= 2)
2160   {
2161     printf("\n");
2162     printf("column types:\n");
2163     printf("FIELD_NORMAL          0\n");
2164     printf("FIELD_SKIP_ENDSPACE   1\n");
2165     printf("FIELD_SKIP_PRESPACE   2\n");
2166     printf("FIELD_SKIP_ZERO       3\n");
2167     printf("FIELD_BLOB            4\n");
2168     printf("FIELD_CONSTANT        5\n");
2169     printf("FIELD_INTERVALL       6\n");
2170     printf("FIELD_ZERO            7\n");
2171     printf("FIELD_VARCHAR         8\n");
2172     printf("FIELD_CHECK           9\n");
2173     printf("\n");
2174     printf("pack type as a set of flags:\n");
2175     printf("PACK_TYPE_SELECTED      1\n");
2176     printf("PACK_TYPE_SPACE_FIELDS  2\n");
2177     printf("PACK_TYPE_ZERO_FILL     4\n");
2178     printf("\n");
2179   }
2180   for (i=0 ; i++ < fields ; counts++)
2181   {
2182     write_bits((ulonglong) (int) counts->field_type, 5);
2183     write_bits(counts->pack_type,6);
2184     if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2185       write_bits(counts->max_zero_fill,5);
2186     else
2187       write_bits(counts->length_bits,5);
2188     write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2189     DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2190                         "lbits: %2u  tree: %2u  length: %4u",
2191                         i , counts->field_type, counts->pack_type,
2192                         counts->max_zero_fill, counts->length_bits,
2193                         counts->tree->tree_number, counts->field_length));
2194     if (verbose >= 2)
2195       printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2196                   "tree: %2u  length: %4u\n", i , counts->field_type,
2197                   counts->pack_type, counts->max_zero_fill, counts->length_bits,
2198                   counts->tree->tree_number, counts->field_length);
2199   }
2200   flush_bits();
2201   return;
2202 }
2203 
2204 	/* Write all huff_trees to new datafile. Return tot count of
2205 	   elements in all trees
2206 	   Returns 0 on error */
2207 
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2208 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2209 {
2210   uint i,int_length;
2211   uint tree_no;
2212   uint codes;
2213   uint errors= 0;
2214   uint *packed_tree,*offset,length;
2215   my_off_t elements;
2216 
2217   /* Find the highest number of elements in the trees. */
2218   for (i=length=0 ; i < trees ; i++)
2219     if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2220       length=huff_tree[i].elements;
2221   /*
2222     Allocate a buffer for packing a decode tree. Two numbers per element
2223     (left child and right child).
2224   */
2225   if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2226   {
2227     my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2228     return 0;
2229   }
2230 
2231   DBUG_PRINT("info", (" "));
2232   if (verbose >= 2)
2233     printf("\n");
2234   tree_no= 0;
2235   intervall_length=0;
2236   for (elements=0; trees-- ; huff_tree++)
2237   {
2238     /* Skip columns that have been joined with other columns. */
2239     if (huff_tree->tree_number == 0)
2240       continue;				/* Deleted tree */
2241     tree_no++;
2242     DBUG_PRINT("info", (" "));
2243     if (verbose >= 3)
2244       printf("\n");
2245     /* Count the total number of elements (byte codes or column values). */
2246     elements+=huff_tree->elements;
2247     huff_tree->max_offset=2;
2248     /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2249     if (huff_tree->elements <= 1)
2250       offset=packed_tree;
2251     else
2252       offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2253 
2254     /* This should be the same as 'length' above. */
2255     huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2256 
2257     /*
2258       Since we check this during collecting the distinct column values,
2259       this should never happen.
2260     */
2261     if (huff_tree->max_offset >= IS_OFFSET)
2262     {				/* This should be impossible */
2263       fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2264                    huff_tree->max_offset);
2265       my_afree(packed_tree);
2266       return 0;
2267     }
2268 
2269     DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2270                         "char_bits: %u\n",
2271                         (ulong) (file_buffer.pos - file_buffer.buffer),
2272                         huff_tree->elements, (ulong) (offset - packed_tree),
2273                         huff_tree->char_bits));
2274     if (!huff_tree->counts->tree_buff)
2275     {
2276       /* We do a uchar compression on this column. Mark with bit 0. */
2277       write_bits(0,1);
2278       write_bits(huff_tree->min_chr,8);
2279       write_bits(huff_tree->elements,9);
2280       write_bits(huff_tree->char_bits,5);
2281       write_bits(huff_tree->offset_bits,5);
2282       int_length=0;
2283     }
2284     else
2285     {
2286       int_length=(uint) (huff_tree->counts->tree_pos -
2287 			 huff_tree->counts->tree_buff);
2288       /* We have distinct column values for this column. Mark with bit 1. */
2289       write_bits(1,1);
2290       write_bits(huff_tree->elements,15);
2291       write_bits(int_length,16);
2292       write_bits(huff_tree->char_bits,5);
2293       write_bits(huff_tree->offset_bits,5);
2294       intervall_length+=int_length;
2295     }
2296     DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2297                         "offset_bits: %2u  %s: %5u  codelen: %2u",
2298                         tree_no, huff_tree->elements, huff_tree->char_bits,
2299                         huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2300                         "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2301                         int_length : huff_tree->min_chr, huff_tree->height));
2302     if (verbose >= 2)
2303       printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2304                   "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2305                   huff_tree->char_bits, huff_tree->offset_bits,
2306                   huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2307                   huff_tree->counts->tree_buff ? int_length :
2308                   huff_tree->min_chr, huff_tree->height);
2309 
2310     /* Check that the code tree length matches the element count. */
2311     length=(uint) (offset-packed_tree);
2312     if (length != huff_tree->elements*2-2)
2313     {
2314       fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2315                    length, huff_tree->elements * 2 - 2);
2316       errors++;
2317       break;
2318     }
2319 
2320     for (i=0 ; i < length ; i++)
2321     {
2322       if (packed_tree[i] & IS_OFFSET)
2323 	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2324 		   huff_tree->offset_bits+1);
2325       else
2326 	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2327       DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2328                           i, (packed_tree[i] & IS_OFFSET) ?
2329                           " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2330                           packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2331       if (verbose >= 3)
2332         printf("tree[0x%04x]: %s0x%04x\n",
2333                     i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2334                     (packed_tree[i] & IS_OFFSET) ?
2335                     packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2336     }
2337     flush_bits();
2338 
2339     /*
2340       Display coding tables and check their correctness.
2341     */
2342     codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2343     for (i= 0; i < codes; i++)
2344     {
2345       ulonglong code;
2346       uint bits;
2347       uint len;
2348       uint idx;
2349 
2350       if (! (len= huff_tree->code_len[i]))
2351         continue;
2352       DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2353                           hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2354                           bindigits(huff_tree->code[i],
2355                                     huff_tree->code_len[i])));
2356       if (verbose >= 3)
2357         printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2358                     hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2359                     bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2360 
2361       /* Check that the encode table decodes correctly. */
2362       code= 0;
2363       bits= 0;
2364       idx= 0;
2365       DBUG_EXECUTE_IF("forcechkerr1", len--;);
2366       DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2367       DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2368       for (;;)
2369       {
2370         if (! len)
2371         {
2372           fflush(stdout);
2373           fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2374                        hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2375           errors++;
2376           break;
2377         }
2378         code<<= 1;
2379         code|= (huff_tree->code[i] >> (--len)) & 1;
2380         bits++;
2381         if (bits > 8 * sizeof(code))
2382         {
2383           fflush(stdout);
2384           fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2385                        bits, (uint) (8 * sizeof(code)));
2386           errors++;
2387           break;
2388         }
2389         idx+= (uint) code & 1;
2390         if (idx >= length)
2391         {
2392           fflush(stdout);
2393           fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
2394           errors++;
2395           break;
2396         }
2397         if (packed_tree[idx] & IS_OFFSET)
2398           idx+= packed_tree[idx] & ~IS_OFFSET;
2399         else
2400           break; /* Hit a leaf. This contains the result value. */
2401       }
2402       if (errors)
2403         break;
2404 
2405       DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2406       if (packed_tree[idx] != i)
2407       {
2408         fflush(stdout);
2409         fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2410                      packed_tree[idx], i);
2411         errors++;
2412         break;
2413       }
2414     } /*end for (codes)*/
2415     if (errors)
2416       break;
2417 
2418     /* Write column values in case of distinct column value compression. */
2419     if (huff_tree->counts->tree_buff)
2420     {
2421       for (i=0 ; i < int_length ; i++)
2422       {
2423  	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2424         DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2425                             i, (uchar) huff_tree->counts->tree_buff[i]));
2426         if (verbose >= 3)
2427           printf("column_values[0x%04x]: 0x%02x\n",
2428                       i, (uchar) huff_tree->counts->tree_buff[i]);
2429       }
2430     }
2431     flush_bits();
2432   }
2433   DBUG_PRINT("info", (" "));
2434   if (verbose >= 2)
2435     printf("\n");
2436   my_afree(packed_tree);
2437   if (errors)
2438   {
2439     fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2440     return 0;
2441   }
2442   return elements;
2443 }
2444 
2445 
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2446 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2447 				   uint *offset)
2448 {
2449   uint *prev_offset;
2450 
2451   prev_offset= offset;
2452   /*
2453     'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2454     then there is no left child and, hence no right child either. This
2455     is a property of a binary tree. An element is either a node with two
2456     childs, or a leaf without childs.
2457 
2458     The current element is always a node with two childs. Go left first.
2459   */
2460   if (!element->a.nod.left->a.leaf.null)
2461   {
2462     /* Store the uchar code or the index of the column value. */
2463     prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2464     offset+=2;
2465   }
2466   else
2467   {
2468     /*
2469       Recursively traverse the tree to the left. Mark it as an offset to
2470       another tree node (in contrast to a uchar code or column value index).
2471     */
2472     prev_offset[0]= IS_OFFSET+2;
2473     offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2474   }
2475 
2476   /* Now, check the right child. */
2477   if (!element->a.nod.right->a.leaf.null)
2478   {
2479     /* Store the uchar code or the index of the column value. */
2480     prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2481     return offset;
2482   }
2483   else
2484   {
2485     /*
2486       Recursively traverse the tree to the right. Mark it as an offset to
2487       another tree node (in contrast to a uchar code or column value index).
2488     */
2489     uint temp=(uint) (offset-prev_offset-1);
2490     prev_offset[1]= IS_OFFSET+ temp;
2491     if (huff_tree->max_offset < temp)
2492       huff_tree->max_offset = temp;
2493     return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2494   }
2495 }
2496 
2497 	/* Get number of bits neaded to represent value */
2498 
max_bit(register uint value)2499 static uint max_bit(register uint value)
2500 {
2501   reg2 uint power=1;
2502 
2503   while ((value>>=1))
2504     power++;
2505   return (power);
2506 }
2507 
2508 
compress_maria_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2509 static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2510 {
2511   int error;
2512   uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
2513   uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
2514   my_off_t record_count;
2515   char llbuf[32];
2516   ulong length,pack_length;
2517   uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2518   HUFF_COUNTS *count,*end_count;
2519   HUFF_TREE *tree;
2520   MARIA_HA *isam_file=mrg->file[0];
2521   uint pack_version= (uint) isam_file->s->pack.version;
2522   DBUG_ENTER("compress_maria_file");
2523 
2524   /* Allocate a buffer for the records (excluding blobs). */
2525   if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
2526     return -1;
2527 
2528   end_count=huff_counts+isam_file->s->base.fields;
2529   min_record_length= (uint) ~0;
2530   max_record_length=0;
2531   null_bytes= isam_file->s->base.null_bytes;
2532 
2533   /*
2534     Calculate the maximum number of bits required to pack the records.
2535     Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2536     The tree height determines the maximum number of bits per value.
2537     Some fields skip leading or trailing spaces or zeroes. The skipped
2538     number of bytes is encoded by 'length_bits' bits.
2539     Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2540     and varchar get a leading 0 bit.
2541   */
2542   max_calc_length= null_bytes;
2543   for (i= 0 ; i < isam_file->s->base.fields ; i++)
2544   {
2545     if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2546       huff_counts[i].max_zero_fill=0;
2547     if (huff_counts[i].field_type == FIELD_CONSTANT ||
2548 	huff_counts[i].field_type == FIELD_ZERO ||
2549 	huff_counts[i].field_type == FIELD_CHECK)
2550       continue;
2551     if (huff_counts[i].field_type == FIELD_INTERVALL)
2552       max_calc_length+=huff_counts[i].tree->height;
2553     else if (huff_counts[i].field_type == FIELD_BLOB ||
2554 	     huff_counts[i].field_type == FIELD_VARCHAR)
2555       max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2556     else
2557       max_calc_length+=
2558 	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2559 	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2560   }
2561   max_calc_length= (max_calc_length + 7) / 8;
2562   pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
2563   record_count=0;
2564   /* 'max_blob_length' is the max length of all blobs of a record. */
2565   pack_blob_length= isam_file->s->base.blobs ?
2566                     _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2567   max_pack_length=pack_ref_length+pack_blob_length;
2568 
2569   DBUG_PRINT("fields", ("==="));
2570   mrg_reset(mrg);
2571   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2572   {
2573     ulong tot_blob_length=0;
2574     if (! error)
2575     {
2576       if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
2577                        null_bytes))
2578 	break;
2579       record_pos= file_buffer.pos;
2580       file_buffer.pos+= max_pack_length;
2581       if (null_bytes)
2582       {
2583         /* Copy null bits 'as is' */
2584         memcpy(file_buffer.pos, record, null_bytes);
2585         file_buffer.pos+= null_bytes;
2586       }
2587       for (start_pos=record+null_bytes, count= huff_counts;
2588            count < end_count ;
2589            count++)
2590       {
2591 	end_pos=start_pos+(field_length=count->field_length);
2592 	tree=count->tree;
2593 
2594         DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2595                               "lbits: %2u  tree: %2u  length: %4u",
2596                               (ulong) (count - huff_counts + 1),
2597                               count->field_type,
2598                               count->pack_type, count->max_zero_fill,
2599                               count->length_bits, count->tree->tree_number,
2600                               count->field_length));
2601 
2602         /* Check if the column contains spaces only. */
2603 	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2604 	{
2605 	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2606 	  if (pos == end_pos)
2607 	  {
2608             DBUG_PRINT("fields",
2609                        ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2610             DBUG_PRINT("fields", ("---"));
2611 	    write_bits(1,1);
2612 	    start_pos=end_pos;
2613 	    continue;
2614 	  }
2615           DBUG_PRINT("fields",
2616                      ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2617 	  write_bits(0,1);
2618 	}
2619 	end_pos-=count->max_zero_fill;
2620 	field_length-=count->max_zero_fill;
2621 
2622 	switch (count->field_type) {
2623 	case FIELD_SKIP_ZERO:
2624 	  if (!memcmp(start_pos, zero_string, field_length))
2625 	  {
2626             DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2627 	    write_bits(1,1);
2628 	    start_pos=end_pos;
2629 	    break;
2630 	  }
2631           DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2632 	  write_bits(0,1);
2633 	  /* Fall through */
2634 	case FIELD_NORMAL:
2635           DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2636                                 (ulong) (end_pos - start_pos)));
2637 	  for ( ; start_pos < end_pos ; start_pos++)
2638           {
2639             DBUG_PRINT("fields",
2640                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2641                         (uchar) *start_pos,
2642                         hexdigits(tree->code[(uchar) *start_pos]),
2643                         (uint) tree->code_len[(uchar) *start_pos],
2644                         bindigits(tree->code[(uchar) *start_pos],
2645                                   (uint) tree->code_len[(uchar) *start_pos])));
2646 	    write_bits(tree->code[(uchar) *start_pos],
2647 		       (uint) tree->code_len[(uchar) *start_pos]);
2648           }
2649 	  break;
2650 	case FIELD_SKIP_ENDSPACE:
2651 	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2652 	  length= (ulong) (end_pos - pos);
2653 	  if (count->pack_type & PACK_TYPE_SELECTED)
2654 	  {
2655 	    if (length > count->min_space)
2656 	    {
2657               DBUG_PRINT("fields",
2658                          ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2659               DBUG_PRINT("fields",
2660                          ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2661                           length, field_length, count->length_bits));
2662 	      write_bits(1,1);
2663 	      write_bits(length,count->length_bits);
2664 	    }
2665 	    else
2666 	    {
2667               DBUG_PRINT("fields",
2668                          ("FIELD_SKIP_ENDSPACE not more than min_space, "
2669                           "bits:  1"));
2670 	      write_bits(0,1);
2671 	      pos=end_pos;
2672 	    }
2673 	  }
2674 	  else
2675           {
2676             DBUG_PRINT("fields",
2677                        ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2678                         length, field_length, count->length_bits));
2679 	    write_bits(length,count->length_bits);
2680           }
2681           /* Encode all significant bytes. */
2682           DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2683                                 (ulong) (pos - start_pos)));
2684 	  for ( ; start_pos < pos ; start_pos++)
2685           {
2686             DBUG_PRINT("fields",
2687                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2688                         (uchar) *start_pos,
2689                         hexdigits(tree->code[(uchar) *start_pos]),
2690                         (uint) tree->code_len[(uchar) *start_pos],
2691                         bindigits(tree->code[(uchar) *start_pos],
2692                                   (uint) tree->code_len[(uchar) *start_pos])));
2693 	    write_bits(tree->code[(uchar) *start_pos],
2694 		       (uint) tree->code_len[(uchar) *start_pos]);
2695           }
2696 	  start_pos=end_pos;
2697 	  break;
2698 	case FIELD_SKIP_PRESPACE:
2699 	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2700           length= (ulong) (pos - start_pos);
2701 	  if (count->pack_type & PACK_TYPE_SELECTED)
2702 	  {
2703 	    if (length > count->min_space)
2704 	    {
2705               DBUG_PRINT("fields",
2706                          ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2707               DBUG_PRINT("fields",
2708                          ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2709                           length, field_length, count->length_bits));
2710 	      write_bits(1,1);
2711 	      write_bits(length,count->length_bits);
2712 	    }
2713 	    else
2714 	    {
2715               DBUG_PRINT("fields",
2716                          ("FIELD_SKIP_PRESPACE not more than min_space, "
2717                           "bits:  1"));
2718 	      pos=start_pos;
2719 	      write_bits(0,1);
2720 	    }
2721 	  }
2722 	  else
2723           {
2724             DBUG_PRINT("fields",
2725                        ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2726                         length, field_length, count->length_bits));
2727 	    write_bits(length,count->length_bits);
2728           }
2729           /* Encode all significant bytes. */
2730           DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2731                                 (ulong) (end_pos - start_pos)));
2732 	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2733           {
2734             DBUG_PRINT("fields",
2735                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2736                         (uchar) *start_pos,
2737                         hexdigits(tree->code[(uchar) *start_pos]),
2738                         (uint) tree->code_len[(uchar) *start_pos],
2739                         bindigits(tree->code[(uchar) *start_pos],
2740                                   (uint) tree->code_len[(uchar) *start_pos])));
2741 	    write_bits(tree->code[(uchar) *start_pos],
2742 		       (uint) tree->code_len[(uchar) *start_pos]);
2743           }
2744 	  break;
2745 	case FIELD_CONSTANT:
2746 	case FIELD_ZERO:
2747 	case FIELD_CHECK:
2748           DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2749 	  start_pos=end_pos;
2750 	  break;
2751 	case FIELD_INTERVALL:
2752 	  global_count=count;
2753 	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2754 				  count->int_tree.custom_arg);
2755 	  intervall=(uint) (pos - count->tree_buff)/field_length;
2756           DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2757           DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2758                                 intervall, hexdigits(tree->code[intervall]),
2759                                 (uint) tree->code_len[intervall]));
2760 	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2761 	  start_pos=end_pos;
2762 	  break;
2763 	case FIELD_BLOB:
2764 	{
2765 	  ulong blob_length= _ma_calc_blob_length(field_length-
2766 						 portable_sizeof_char_ptr,
2767 						 start_pos);
2768           /* Empty blobs are encoded with a single 1 bit. */
2769 	  if (!blob_length)
2770 	  {
2771             DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2772             write_bits(1,1);
2773 	  }
2774 	  else
2775 	  {
2776 	    uchar *blob,*blob_end;
2777             DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2778 	    write_bits(0,1);
2779             /* Write the blob length. */
2780             DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2781                                   blob_length, count->length_bits));
2782 	    write_bits(blob_length,count->length_bits);
2783 	    memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
2784 	    blob_end=blob+blob_length;
2785             /* Encode the blob bytes. */
2786 	    for ( ; blob < blob_end ; blob++)
2787             {
2788               DBUG_PRINT("fields",
2789                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2790                           (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2791                           (uint) tree->code_len[(uchar) *blob],
2792                           bindigits(tree->code[(uchar) *start_pos],
2793                                     (uint)tree->code_len[(uchar) *start_pos])));
2794 	      write_bits(tree->code[(uchar) *blob],
2795 			 (uint) tree->code_len[(uchar) *blob]);
2796             }
2797 	    tot_blob_length+=blob_length;
2798 	  }
2799 	  start_pos= end_pos;
2800 	  break;
2801 	}
2802 	case FIELD_VARCHAR:
2803 	{
2804           uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2805 	  ulong col_length= (var_pack_length == 1 ?
2806                              (uint) *(uchar*) start_pos :
2807                              uint2korr(start_pos));
2808           /* Empty varchar are encoded with a single 1 bit. */
2809 	  if (!col_length)
2810 	  {
2811             DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2812 	    write_bits(1,1);			/* Empty varchar */
2813 	  }
2814 	  else
2815 	  {
2816 	    uchar *end= start_pos + var_pack_length + col_length;
2817             DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2818 	    write_bits(0,1);
2819             /* Write the varchar length. */
2820             DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2821                                   col_length, count->length_bits));
2822 	    write_bits(col_length,count->length_bits);
2823             /* Encode the varchar bytes. */
2824 	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2825             {
2826               DBUG_PRINT("fields",
2827                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2828                           (uchar) *start_pos,
2829                           hexdigits(tree->code[(uchar) *start_pos]),
2830                           (uint) tree->code_len[(uchar) *start_pos],
2831                           bindigits(tree->code[(uchar) *start_pos],
2832                                     (uint)tree->code_len[(uchar) *start_pos])));
2833 	      write_bits(tree->code[(uchar) *start_pos],
2834 			 (uint) tree->code_len[(uchar) *start_pos]);
2835             }
2836 	  }
2837 	  start_pos= end_pos;
2838 	  break;
2839 	}
2840 	case FIELD_LAST:
2841         case FIELD_enum_val_count:
2842 	  abort();				/* Impossible */
2843 	}
2844 	start_pos+=count->max_zero_fill;
2845         DBUG_PRINT("fields", ("---"));
2846       }
2847       flush_bits();
2848       length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
2849       pack_length= _ma_save_pack_length(pack_version, record_pos, length);
2850       if (pack_blob_length)
2851 	pack_length+= _ma_save_pack_length(pack_version,
2852                                            record_pos + pack_length,
2853                                            tot_blob_length);
2854       DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2855                             "length-bytes: %lu", (ulong) record_count, length,
2856                             tot_blob_length, pack_length));
2857       DBUG_PRINT("fields", ("==="));
2858 
2859       /* Correct file buffer if the header was smaller */
2860       if (pack_length != max_pack_length)
2861       {
2862 	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2863 	file_buffer.pos-= (max_pack_length-pack_length);
2864       }
2865       if (length < (ulong) min_record_length)
2866 	min_record_length=(uint) length;
2867       if (length > (ulong) max_record_length)
2868 	max_record_length=(uint) length;
2869       record_count++;
2870       if (write_loop && record_count % WRITE_COUNT == 0)
2871       {
2872 	printf("%lu\r", (ulong) record_count);
2873         fflush(stdout);
2874       }
2875     }
2876     else if (error != HA_ERR_RECORD_DELETED)
2877       break;
2878   }
2879   if (error == HA_ERR_END_OF_FILE)
2880     error=0;
2881   else
2882   {
2883     fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
2884   }
2885   if (verbose >= 2)
2886     printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2887 
2888   my_safe_afree(record, isam_file->s->base.reclength);
2889   mrg->ref_length=max_pack_length;
2890   mrg->min_pack_length=max_record_length ? min_record_length : 0;
2891   mrg->max_pack_length=max_record_length;
2892   DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2893 }
2894 
2895 
make_new_name(char * new_name,char * old_name)2896 static char *make_new_name(char *new_name, char *old_name)
2897 {
2898   return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2899 }
2900 
make_old_name(char * new_name,char * old_name)2901 static char *make_old_name(char *new_name, char *old_name)
2902 {
2903   return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2904 }
2905 
2906 	/* rutines for bit writing buffer */
2907 
init_file_buffer(File file,pbool read_buffer)2908 static void init_file_buffer(File file, pbool read_buffer)
2909 {
2910   file_buffer.file=file;
2911   file_buffer.buffer= (uchar*) my_malloc(PSI_NOT_INSTRUMENTED,
2912                                    ALIGN_SIZE(RECORD_CACHE_SIZE), MYF(MY_WME));
2913   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2914   file_buffer.pos_in_file=0;
2915   error_on_write=0;
2916   if (read_buffer)
2917   {
2918 
2919     file_buffer.pos=file_buffer.end;
2920     file_buffer.bits=0;
2921   }
2922   else
2923   {
2924     file_buffer.pos=file_buffer.buffer;
2925     file_buffer.bits=BITS_SAVED;
2926   }
2927   file_buffer.bitbucket= 0;
2928 }
2929 
2930 
flush_buffer(ulong neaded_length)2931 static int flush_buffer(ulong neaded_length)
2932 {
2933   ulong length;
2934 
2935   /*
2936     file_buffer.end is 8 bytes lower than the real end of the buffer.
2937     This is done so that the end-of-buffer condition does not need to be
2938     checked for every uchar (see write_bits()). Consequently,
2939     file_buffer.pos can become greater than file_buffer.end. The
2940     algorithms in the other functions ensure that there will never be
2941     more than 8 bytes written to the buffer without an end-of-buffer
2942     check. So the buffer cannot be overrun. But we need to check for the
2943     near-to-buffer-end condition to avoid a negative result, which is
2944     casted to unsigned and thus becomes giant.
2945   */
2946   if ((file_buffer.pos < file_buffer.end) &&
2947       ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2948     return 0;
2949   length=(ulong) (file_buffer.pos-file_buffer.buffer);
2950   file_buffer.pos=file_buffer.buffer;
2951   file_buffer.pos_in_file+=length;
2952   if (test_only)
2953     return 0;
2954   if (error_on_write|| my_write(file_buffer.file,
2955 				(const uchar*) file_buffer.buffer,
2956 				length,
2957 				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2958   {
2959     error_on_write=1;
2960     return 1;
2961   }
2962 
2963   if (neaded_length != ~(ulong) 0 &&
2964       (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2965   {
2966     uchar *tmp;
2967     neaded_length+=256;				/* some margin */
2968     tmp= (uchar*) my_realloc(PSI_NOT_INSTRUMENTED, file_buffer.buffer,
2969                              neaded_length,MYF(MY_WME));
2970     if (!tmp)
2971       return 1;
2972     file_buffer.pos=    (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
2973     file_buffer.buffer= tmp;
2974     file_buffer.end=    (tmp+neaded_length-8);
2975   }
2976   return 0;
2977 }
2978 
2979 
end_file_buffer(void)2980 static void end_file_buffer(void)
2981 {
2982   my_free(file_buffer.buffer);
2983 }
2984 
2985 	/* output `bits` low bits of `value' */
2986 
write_bits(register ulonglong value,register uint bits)2987 static void write_bits(register ulonglong value, register uint bits)
2988 {
2989   DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2990               (bits == 8 * sizeof(value)));
2991 
2992   if ((file_buffer.bits-= (int) bits) >= 0)
2993   {
2994     file_buffer.bitbucket|= value << file_buffer.bits;
2995   }
2996   else
2997   {
2998     reg3 ulonglong bit_buffer;
2999     bits= (uint) -file_buffer.bits;
3000     bit_buffer= (file_buffer.bitbucket |
3001                  ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
3002 #if BITS_SAVED == 64
3003     *file_buffer.pos++= (uchar) (bit_buffer >> 56);
3004     *file_buffer.pos++= (uchar) (bit_buffer >> 48);
3005     *file_buffer.pos++= (uchar) (bit_buffer >> 40);
3006     *file_buffer.pos++= (uchar) (bit_buffer >> 32);
3007 #endif
3008     *file_buffer.pos++= (uchar) (bit_buffer >> 24);
3009     *file_buffer.pos++= (uchar) (bit_buffer >> 16);
3010     *file_buffer.pos++= (uchar) (bit_buffer >> 8);
3011     *file_buffer.pos++= (uchar) (bit_buffer);
3012 
3013     if (bits != 8 * sizeof(value))
3014       value&= (((ulonglong) 1) << bits) - 1;
3015     if (file_buffer.pos >= file_buffer.end)
3016       flush_buffer(~ (ulong) 0);
3017     file_buffer.bits=(int) (BITS_SAVED - bits);
3018     file_buffer.bitbucket= value << (BITS_SAVED - bits);
3019   }
3020   return;
3021 }
3022 
3023 	/* Flush bits in bit_buffer to buffer */
3024 
flush_bits(void)3025 static void flush_bits(void)
3026 {
3027   int bits;
3028   ulonglong bit_buffer;
3029 
3030   bits= file_buffer.bits & ~7;
3031   bit_buffer= file_buffer.bitbucket >> bits;
3032   bits= BITS_SAVED - bits;
3033   while (bits > 0)
3034   {
3035     bits-= 8;
3036     *file_buffer.pos++= (uchar) (bit_buffer >> bits);
3037   }
3038   if (file_buffer.pos >= file_buffer.end)
3039     flush_buffer(~ (ulong) 0);
3040   file_buffer.bits= BITS_SAVED;
3041   file_buffer.bitbucket= 0;
3042 }
3043 
3044 
3045 /****************************************************************************
3046 ** functions to handle the joined files
3047 ****************************************************************************/
3048 
save_state(MARIA_HA * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3049 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
3050                       my_off_t new_length,
3051 		      ha_checksum crc)
3052 {
3053   MARIA_SHARE *share=isam_file->s;
3054   uint options=mi_uint2korr(share->state.header.options);
3055   uint key;
3056   DBUG_ENTER("save_state");
3057 
3058   options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
3059   mi_int2store(share->state.header.options,options);
3060   /* Save the original file type of we have to undo the packing later */
3061   share->state.header.org_data_file_type= share->state.header.data_file_type;
3062   share->state.header.data_file_type= COMPRESSED_RECORD;
3063 
3064   share->state.state.data_file_length=new_length;
3065   share->state.state.del=0;
3066   share->state.state.empty=0;
3067   share->state.dellink= HA_OFFSET_ERROR;
3068   share->state.split=(ha_rows) mrg->records;
3069   share->state.version=(ulong) time((time_t*) 0);
3070   if (share->base.born_transactional)
3071     share->state.create_rename_lsn= share->state.is_of_horizon=
3072       share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
3073   if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
3074   {
3075     /*
3076       Some indexes are disabled, cannot use current key_file_length value
3077       as an estimate of upper bound of index file size. Use packed data file
3078       size instead.
3079     */
3080     share->state.state.key_file_length= new_length;
3081   }
3082   /*
3083     If there are no disabled indexes, keep key_file_length value from
3084     original file so "aria_chk -rq" can use this value (this is necessary
3085     because index size cannot be easily calculated for fulltext keys)
3086   */
3087   maria_clear_all_keys_active(share->state.key_map);
3088   for (key=0 ; key < share->base.keys ; key++)
3089     share->state.key_root[key]= HA_OFFSET_ERROR;
3090   share->state.key_del= HA_OFFSET_ERROR;
3091   share->state.state.checksum= crc;     /* Save crc in file */
3092   share->changed=1;			/* Force write of header */
3093   share->state.open_count=0;
3094   share->global_changed=0;
3095   my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
3096   if (share->base.keys)
3097     isamchk_neaded=1;
3098   DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
3099                                        &share->state,
3100                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3101                                        MA_STATE_INFO_WRITE_FULL_INFO));
3102 }
3103 
3104 
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3105 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3106 			  ha_checksum crc)
3107 {
3108   MARIA_STATE_INFO state;
3109   MARIA_HA *isam_file=mrg->file[0];
3110   uint options;
3111   DBUG_ENTER("save_state_mrg");
3112 
3113   state= isam_file->s->state;
3114   options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3115 	    HA_OPTION_READ_ONLY_DATA);
3116   mi_int2store(state.header.options,options);
3117   /* Save the original file type of we have to undo the packing later */
3118   state.header.org_data_file_type= state.header.data_file_type;
3119   state.header.data_file_type= COMPRESSED_RECORD;
3120 
3121   state.state.data_file_length=new_length;
3122   state.state.del=0;
3123   state.state.empty=0;
3124   state.state.records=state.split=(ha_rows) mrg->records;
3125   state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
3126     LSN_NEEDS_NEW_STATE_LSNS;
3127 
3128   /* See comment above in save_state about key_file_length handling. */
3129   if (mrg->src_file_has_indexes_disabled)
3130   {
3131     isam_file->s->state.state.key_file_length=
3132       MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3133   }
3134   state.dellink= HA_OFFSET_ERROR;
3135   state.version=(ulong) time((time_t*) 0);
3136   maria_clear_all_keys_active(state.key_map);
3137   state.state.checksum=crc;
3138   if (isam_file->s->base.keys)
3139     isamchk_neaded=1;
3140   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3141   DBUG_RETURN (_ma_state_info_write_sub(file, &state,
3142                                         MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3143                                         MA_STATE_INFO_WRITE_FULL_INFO));
3144 }
3145 
3146 
3147 /* reset for mrg_rrnd */
3148 
mrg_reset(PACK_MRG_INFO * mrg)3149 static void mrg_reset(PACK_MRG_INFO *mrg)
3150 {
3151   if (mrg->current)
3152   {
3153     maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3154     mrg->current=0;
3155   }
3156 }
3157 
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3158 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3159 {
3160   int error;
3161   MARIA_HA *isam_info;
3162 
3163   if (!info->current)
3164   {
3165     isam_info= *(info->current=info->file);
3166     info->end=info->current+info->count;
3167     maria_reset(isam_info);
3168     maria_extra(isam_info, HA_EXTRA_CACHE, 0);
3169     if ((error= maria_scan_init(isam_info)))
3170       return(error);
3171   }
3172   else
3173     isam_info= *info->current;
3174 
3175   for (;;)
3176   {
3177     if (!(error= maria_scan(isam_info, buf)) ||
3178 	error != HA_ERR_END_OF_FILE)
3179       return (error);
3180     maria_scan_end(isam_info);
3181     maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3182     if (info->current+1 == info->end)
3183       return(HA_ERR_END_OF_FILE);
3184     info->current++;
3185     isam_info= *info->current;
3186     maria_reset(isam_info);
3187     maria_extra(isam_info,HA_EXTRA_CACHE, 0);
3188     if ((error= maria_scan_init(isam_info)))
3189       return(error);
3190   }
3191 }
3192 
3193 
mrg_close(PACK_MRG_INFO * mrg)3194 static int mrg_close(PACK_MRG_INFO *mrg)
3195 {
3196   uint i;
3197   int error=0;
3198   DBUG_ENTER("mrg_close");
3199 
3200   for (i=0 ; i < mrg->count ; i++)
3201     error|=maria_close(mrg->file[i]);
3202   if (mrg->free_file)
3203     my_free(mrg->file);
3204   DBUG_RETURN(error);
3205 }
3206 
3207 
3208 #if !defined(DBUG_OFF)
3209 /*
3210   Fake the counts to get big Huffman codes.
3211 
3212   SYNOPSIS
3213     fakebigcodes()
3214     huff_counts                 A pointer to the counts array.
3215     end_count                   A pointer past the counts array.
3216 
3217   DESCRIPTION
3218 
3219     Huffman coding works by removing the two least frequent values from
3220     the list of values and add a new value with the sum of their
3221     incidences in a loop until only one value is left. Every time a
3222     value is reused for a new value, it gets one more bit for its
3223     encoding. Hence, the least frequent values get the longest codes.
3224 
3225     To get a maximum code length for a value, two of the values must
3226     have an incidence of 1. As their sum is 2, the next infrequent value
3227     must have at least an incidence of 2, then 4, 8, 16 and so on. This
3228     means that one needs 2**n bytes (values) for a code length of n
3229     bits. However, using more distinct values forces the use of longer
3230     codes, or reaching the code length with less total bytes (values).
3231 
3232     To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3233     I assign counts of 1 to the two most frequent values, a count of 2
3234     for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3235     the remaining values get 1. That way every possible uchar has an
3236     assigned code, though not all codes are used if not all uchar values
3237     are present in the column.
3238 
3239     This strategy would work with distinct column values too, but
3240     requires that at least 64(32) values are present. To make things
3241     easier here, I cancel all distinct column values and force byte
3242     compression for all columns.
3243 
3244   RETURN
3245     void
3246 */
3247 
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3248 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3249 {
3250   HUFF_COUNTS   *count;
3251   my_off_t      *cur_count_p;
3252   my_off_t      *end_count_p;
3253   my_off_t      **cur_sort_p;
3254   my_off_t      **end_sort_p;
3255   my_off_t      *sort_counts[256];
3256   my_off_t      total;
3257   DBUG_ENTER("fakebigcodes");
3258 
3259   for (count= huff_counts; count < end_count; count++)
3260   {
3261     /*
3262       Remove distinct column values.
3263     */
3264     if (huff_counts->tree_buff)
3265     {
3266       my_free(huff_counts->tree_buff);
3267       delete_tree(&huff_counts->int_tree, 0);
3268       huff_counts->tree_buff= NULL;
3269       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3270     }
3271 
3272     /*
3273       Sort counts by decreasing incidence.
3274     */
3275     cur_count_p= count->counts;
3276     end_count_p= cur_count_p + 256;
3277     cur_sort_p= sort_counts;
3278     while (cur_count_p < end_count_p)
3279       *(cur_sort_p++)= cur_count_p++;
3280     (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3281 
3282     /*
3283       Assign faked counts.
3284     */
3285     cur_sort_p= sort_counts;
3286 #if SIZEOF_LONG_LONG > 4
3287     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3288 #else
3289     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3290 #endif
3291     /* Most frequent value gets a faked count of 1. */
3292     **(cur_sort_p++)= 1;
3293     total= 1;
3294     while (cur_sort_p < end_sort_p)
3295     {
3296       **(cur_sort_p++)= total;
3297       total<<= 1;
3298     }
3299     /* Set the last value. */
3300     **(cur_sort_p++)= --total;
3301     /*
3302       Set the remaining counts.
3303     */
3304     end_sort_p= sort_counts + 256;
3305     while (cur_sort_p < end_sort_p)
3306       **(cur_sort_p++)= 1;
3307   }
3308   DBUG_VOID_RETURN;
3309 }
3310 
3311 
3312 /*
3313   Compare two counts for reverse sorting.
3314 
3315   SYNOPSIS
3316     fakecmp()
3317     count1              One count.
3318     count2              Another count.
3319 
3320   RETURN
3321     1                   count1  < count2
3322     0                   count1 == count2
3323     -1                  count1 >  count2
3324 */
3325 
fakecmp(my_off_t ** count1,my_off_t ** count2)3326 static int fakecmp(my_off_t **count1, my_off_t **count2)
3327 {
3328   return ((**count1 < **count2) ? 1 :
3329           (**count1 > **count2) ? -1 : 0);
3330 }
3331 #endif
3332 
3333 #include "ma_check_standalone.h"
3334 
3335