1 /* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; version 2 of the License.
6 
7    This program is distributed in the hope that it will be useful,
8    but WITHOUT ANY WARRANTY; without even the implied warranty of
9    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10    GNU General Public License for more details.
11 
12    You should have received a copy of the GNU General Public License
13    along with this program; if not, write to the Free Software
14    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
15 
16 /* Pack MARIA file */
17 
18 #ifndef USE_MY_FUNC
19 #define USE_MY_FUNC			/* We need at least my_malloc */
20 #endif
21 
22 #include "maria_def.h"
23 #include <queues.h>
24 #include <my_tree.h>
25 #include "mysys_err.h"
26 #ifdef MSDOS
27 #include <io.h>
28 #endif
29 #ifndef __GNU_LIBRARY__
30 #define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
31 #endif
32 #include <my_getopt.h>
33 #include <assert.h>
34 
35 #if SIZEOF_LONG_LONG > 4
36 #define BITS_SAVED 64
37 #else
38 #define BITS_SAVED 32
39 #endif
40 
41 #define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
42 #define HEAD_LENGTH	32
43 #define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
44 
45 #define DATA_TMP_EXT		".TMD"
46 #define OLD_EXT			".OLD"
47 #define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
48 
49 struct st_file_buffer {
50   File file;
51   uchar *buffer,*pos,*end;
52   my_off_t pos_in_file;
53   int bits;
54   ulonglong bitbucket;
55 };
56 
57 struct st_huff_tree;
58 struct st_huff_element;
59 
60 typedef struct st_huff_counts {
61   uint	field_length,max_zero_fill;
62   uint	pack_type;
63   uint	max_end_space,max_pre_space,length_bits,min_space;
64   ulong max_length;
65   enum en_fieldtype field_type;
66   struct st_huff_tree *tree;		/* Tree for field */
67   my_off_t counts[256];
68   my_off_t end_space[8];
69   my_off_t pre_space[8];
70   my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
71   TREE int_tree;        /* Tree for detecting distinct column values. */
72   uchar *tree_buff;      /* Column values, 'field_length' each. */
73   uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
74 } HUFF_COUNTS;
75 
76 typedef struct st_huff_element HUFF_ELEMENT;
77 
78 /*
79   WARNING: It is crucial for the optimizations in calc_packed_length()
80   that 'count' is the first element of 'HUFF_ELEMENT'.
81 */
82 struct st_huff_element {
83   my_off_t count;
84   union un_element {
85     struct st_nod {
86       HUFF_ELEMENT *left,*right;
87     } nod;
88     struct st_leaf {
89       HUFF_ELEMENT *null;
90       uint	element_nr;		/* Number of element */
91     } leaf;
92   } a;
93 };
94 
95 
96 typedef struct st_huff_tree {
97   HUFF_ELEMENT *root,*element_buffer;
98   HUFF_COUNTS *counts;
99   uint tree_number;
100   uint elements;
101   my_off_t bytes_packed;
102   uint tree_pack_length;
103   uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
104   ulonglong *code;
105   uchar *code_len;
106 } HUFF_TREE;
107 
108 
109 typedef struct st_isam_mrg {
110   MARIA_HA **file,**current,**end;
111   uint free_file;
112   uint count;
113   uint	min_pack_length;		/* Theese is used by packed data */
114   uint	max_pack_length;
115   uint	ref_length;
116   uint	max_blob_length;
117   my_off_t records;
118   /* true if at least one source file has at least one disabled index */
119   my_bool src_file_has_indexes_disabled;
120 } PACK_MRG_INFO;
121 
122 
123 extern int main(int argc,char * *argv);
124 static void get_options(int *argc,char ***argv);
125 static MARIA_HA *open_maria_file(char *name,int mode);
126 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
127 static int compress(PACK_MRG_INFO *file,char *join_name);
128 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
129 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
130 					   uint trees,
131 					   HUFF_COUNTS *huff_counts,
132 					   uint fields);
133 static int compare_tree(void* cmp_arg __attribute__((unused)),
134 			const uchar *s,const uchar *t);
135 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
136 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
137 			 my_off_t records);
138 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
139 			       uint max_space_length,my_off_t *space_counts,
140 			       my_off_t tot_space_count,
141 			       enum en_fieldtype field_type);
142 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
143 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
144 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
145 static int save_counts_in_queue(uchar *key,element_count count,
146 				    HUFF_TREE *tree);
147 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
148 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
149 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
150 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
151 				    HUFF_ELEMENT *element,uint size,
152 				    ulonglong code);
153 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
154 			my_off_t tot_elements,my_off_t filelength);
155 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
156 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
157 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
158 				       HUFF_ELEMENT *element,
159 				       uint *offset);
160 static uint max_bit(uint value);
161 static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
162 static char *make_new_name(char *new_name,char *old_name);
163 static char *make_old_name(char *new_name,char *old_name);
164 static void init_file_buffer(File file,pbool read_buffer);
165 static int flush_buffer(ulong neaded_length);
166 static void end_file_buffer(void);
167 static void write_bits(ulonglong value, uint bits);
168 static void flush_bits(void);
169 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
170                       my_off_t new_length, ha_checksum crc);
171 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
172                           my_off_t new_length, ha_checksum crc);
173 static int mrg_close(PACK_MRG_INFO *mrg);
174 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
175 static void mrg_reset(PACK_MRG_INFO *mrg);
176 #if !defined(DBUG_OFF)
177 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
178 static int fakecmp(my_off_t **count1, my_off_t **count2);
179 #endif
180 
181 
182 static int error_on_write=0,test_only=0,verbose=0,silent=0,
183 	   write_loop=0,force_pack=0, isamchk_neaded=0;
184 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
185 static my_bool backup, opt_wait;
186 /*
187   tree_buff_length is somewhat arbitrary. The bigger it is the better
188   the chance to win in terms of compression factor. On the other hand,
189   this table becomes part of the compressed file header. And its length
190   is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
191 */
192 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
193 static char tmp_dir[FN_REFLEN]={0},*join_table;
194 static my_off_t intervall_length;
195 static ha_checksum glob_crc;
196 static struct st_file_buffer file_buffer;
197 static QUEUE queue;
198 static HUFF_COUNTS *global_count;
199 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
200 static const char *load_default_groups[]= { "ariapack",0 };
201 
202 	/* The main program */
203 
main(int argc,char ** argv)204 int main(int argc, char **argv)
205 {
206   int error,ok;
207   PACK_MRG_INFO merge;
208   char **default_argv;
209   MY_INIT(argv[0]);
210 
211   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
212   default_argv= argv;
213   get_options(&argc,&argv);
214   maria_init();
215 
216   error=ok=isamchk_neaded=0;
217   if (join_table)
218   {						/* Join files into one */
219     if (open_maria_files(&merge,argv,(uint) argc) ||
220 	compress(&merge,join_table))
221       error=1;
222   }
223   else while (argc--)
224   {
225     MARIA_HA *isam_file;
226     if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
227       error=1;
228     else
229     {
230       merge.file= &isam_file;
231       merge.current=0;
232       merge.free_file=0;
233       merge.count=1;
234       if (compress(&merge,0))
235 	error=1;
236       else
237 	ok=1;
238     }
239   }
240   if (ok && isamchk_neaded && !silent)
241     puts("Remember to run aria_chk -rq on compressed tables");
242   fflush(stdout);
243   fflush(stderr);
244   free_defaults(default_argv);
245   maria_end();
246   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
247   exit(error ? 2 : 0);
248 #ifndef _lint
249   return 0;					/* No compiler warning */
250 #endif
251 }
252 
253 enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
254 
255 static struct my_option my_long_options[] =
256 {
257 #ifdef __NETWARE__
258   {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
259    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
260 #endif
261   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
262    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
263   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
264    "Directory where character sets are.", (char**) &charsets_dir,
265    (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
266   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
267    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
268   {"force", 'f',
269    "Force packing of table even if it gets bigger or if tempfile exists.",
270    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
271   {"join", 'j',
272    "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
273    &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
274    0, 0, 0},
275   {"help", '?', "Display this help and exit.",
276    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
277   {"silent", 's', "Be more silent.",
278    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
279   {"tmpdir", 'T', "Use temporary directory to store temporary table.",
280    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
281   {"test", 't', "Don't pack table, only test packing it.",
282    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
283   {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
284    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
285   {"version", 'V', "Output version information and exit.",
286    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
287   {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
288    &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
289   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
290 };
291 
292 
print_version(void)293 static void print_version(void)
294 {
295   printf("%s Ver 1.0 for %s on %s\n", my_progname, SYSTEM_TYPE, MACHINE_TYPE);
296 }
297 
298 
usage(void)299 static void usage(void)
300 {
301   print_version();
302   puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
303   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
304   puts("and you are welcome to modify and redistribute it under the GPL license\n");
305 
306   puts("Pack a Aria-table to take much less space.");
307   puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
308   puts("afterwards to update the keys.");
309   puts("You should give the .MAI file as the filename argument.");
310   puts("To unpack a packed table, run aria_chk -u on the table");
311 
312   printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
313   my_print_help(my_long_options);
314   print_defaults("my", load_default_groups);
315   my_print_variables(my_long_options);
316 }
317 
318 
319 static my_bool
get_one_option(int optid,const struct my_option * opt,char * argument)320 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
321 	       char *argument)
322 {
323   uint length;
324 
325   switch(optid) {
326 #ifdef __NETWARE__
327   case OPT_AUTO_CLOSE:
328     setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
329     break;
330 #endif
331   case 'f':
332     force_pack= 1;
333     tmpfile_createflag= O_RDWR | O_TRUNC;
334     break;
335   case 's':
336     write_loop= verbose= 0;
337     silent= 1;
338     break;
339   case 't':
340     test_only= 1;
341     /* Avoid to reset 'verbose' if it was already set > 1. */
342     if (! verbose)
343       verbose= 1;
344     break;
345   case 'T':
346     length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
347     if (length != dirname_length(tmp_dir))
348     {
349       tmp_dir[length]=FN_LIBCHAR;
350       tmp_dir[length+1]=0;
351     }
352     break;
353   case 'v':
354     verbose++; /* Allow for selecting the level of verbosity. */
355     silent= 0;
356     break;
357   case '#':
358     DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
359     break;
360   case 'V':
361     print_version();
362     exit(0);
363   case 'I':
364   case '?':
365     usage();
366     exit(0);
367   }
368   return 0;
369 }
370 
371 	/* reads options */
372 	/* Initiates DEBUG - but no debugging here ! */
373 
get_options(int * argc,char *** argv)374 static void get_options(int *argc,char ***argv)
375 {
376   int ho_error;
377 
378   my_progname= argv[0][0];
379   if (isatty(fileno(stdout)))
380     write_loop=1;
381 
382   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
383     exit(ho_error);
384 
385   if (!*argc)
386   {
387     usage();
388     exit(1);
389   }
390   if (join_table)
391   {
392     backup=0;					/* Not needed */
393     tmp_dir[0]=0;
394   }
395   return;
396 }
397 
398 
open_maria_file(char * name,int mode)399 static MARIA_HA *open_maria_file(char *name,int mode)
400 {
401   MARIA_HA *isam_file;
402   MARIA_SHARE *share;
403   DBUG_ENTER("open_maria_file");
404 
405   if (!(isam_file=maria_open(name, mode, HA_OPEN_IGNORE_MOVED_STATE |
406 			  (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
407 			   HA_OPEN_ABORT_IF_LOCKED))))
408   {
409     fprintf(stderr, "%s gave error %d on open\n", name, my_errno);
410     DBUG_RETURN(0);
411   }
412   share=isam_file->s;
413   if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
414   {
415     if (!force_pack)
416     {
417       fprintf(stderr, "%s is already compressed\n", name);
418       maria_close(isam_file);
419       DBUG_RETURN(0);
420     }
421     if (verbose)
422       puts("Recompressing already compressed table");
423     share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
424   }
425   if (! force_pack && share->state.state.records != 0 &&
426       (share->state.state.records <= 1 ||
427        share->state.state.data_file_length < 1024))
428   {
429     fprintf(stderr, "%s is too small to compress\n", name);
430     maria_close(isam_file);
431     DBUG_RETURN(0);
432   }
433   maria_lock_database(isam_file,F_WRLCK);
434   maria_ignore_trids(isam_file);
435   DBUG_RETURN(isam_file);
436 }
437 
438 
open_maria_files(PACK_MRG_INFO * mrg,char ** names,uint count)439 static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
440 {
441   uint i,j;
442   mrg->count=0;
443   mrg->current=0;
444   mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
445   mrg->free_file=1;
446   mrg->src_file_has_indexes_disabled= 0;
447   for (i=0; i < count ; i++)
448   {
449     if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
450       goto error;
451 
452     mrg->src_file_has_indexes_disabled|=
453       ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
454                               mrg->file[i]->s->base.keys);
455   }
456   /* Check that files are identical */
457   for (j=0 ; j < count-1 ; j++)
458   {
459     MARIA_COLUMNDEF *m1,*m2,*end;
460     if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
461 	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
462       goto diff_file;
463     m1=mrg->file[j]->s->columndef;
464     end=m1+mrg->file[j]->s->base.fields;
465     m2=mrg->file[j+1]->s->columndef;
466     for ( ; m1 != end ; m1++,m2++)
467     {
468       if (m1->type != m2->type || m1->length != m2->length)
469 	goto diff_file;
470     }
471   }
472   mrg->count=count;
473   return 0;
474 
475  diff_file:
476   fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
477                my_progname, names[j], names[j+1]);
478  error:
479   while (i--)
480     maria_close(mrg->file[i]);
481   my_free(mrg->file);
482   return 1;
483 }
484 
485 
compress(PACK_MRG_INFO * mrg,char * result_table)486 static int compress(PACK_MRG_INFO *mrg,char *result_table)
487 {
488   int error;
489   File new_file,join_maria_file;
490   MARIA_HA *isam_file;
491   MARIA_SHARE *share;
492   char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
493   uint i,header_length,fields,trees,used_trees;
494   my_off_t old_length,new_length,tot_elements;
495   HUFF_COUNTS *huff_counts;
496   HUFF_TREE *huff_trees;
497   DBUG_ENTER("compress");
498 
499   isam_file=mrg->file[0];			/* Take this as an example */
500   share=isam_file->s;
501   new_file=join_maria_file= -1;
502   trees=fields=0;
503   huff_trees=0;
504   huff_counts=0;
505   maria_block_size= isam_file->s->block_size;
506 
507   /* Create temporary or join file */
508   if (backup)
509     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2);
510   else
511     fn_format(org_name,isam_file->s->open_file_name.str, "",MARIA_NAME_DEXT, 2+4+16);
512 
513   if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
514                      maria_block_size, 0, MY_WME) == 0)
515   {
516     fprintf(stderr, "Can't initialize page cache\n");
517     goto err;
518   }
519 
520   if (!test_only && result_table)
521   {
522     /* Make a new indexfile based on first file in list */
523     uint length;
524     uchar *buff;
525     strmov(org_name,result_table);		/* Fix error messages */
526     fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2);
527     if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
528 	< 0)
529       goto err;
530     length=(uint) share->base.keystart;
531     if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
532       goto err;
533     if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
534 	my_write(join_maria_file,buff,length,
535 		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
536     {
537       my_free(buff);
538       goto err;
539     }
540     my_free(buff);
541     fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2);
542   }
543   else if (!tmp_dir[0])
544     make_new_name(new_name,org_name);
545   else
546     fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
547   if (!test_only &&
548       (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
549     goto err;
550 
551   /* Start calculating statistics */
552 
553   mrg->records=0;
554   for (i=0 ; i < mrg->count ; i++)
555     mrg->records+=mrg->file[i]->s->state.state.records;
556 
557   DBUG_PRINT("info", ("Compressing %s: (%lu records)",
558                       result_table ? new_name : org_name,
559                       (ulong) mrg->records));
560   if (write_loop || verbose)
561   {
562     printf("Compressing %s: (%lu records)\n",
563                 result_table ? new_name : org_name, (ulong) mrg->records);
564   }
565   trees=fields=share->base.fields;
566   huff_counts=init_huff_count(isam_file,mrg->records);
567 
568   /*
569     Read the whole data file(s) for statistics.
570   */
571   DBUG_PRINT("info", ("- Calculating statistics"));
572   if (write_loop || verbose)
573     printf("- Calculating statistics\n");
574   if (get_statistic(mrg,huff_counts))
575     goto err;
576 
577   old_length=0;
578   for (i=0; i < mrg->count ; i++)
579     old_length+= (mrg->file[i]->s->state.state.data_file_length -
580 		  mrg->file[i]->s->state.state.empty);
581 
582   /*
583     Create a global priority queue in preparation for making
584     temporary Huffman trees.
585   */
586   if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
587     goto err;
588 
589   /*
590     Check each column if we should use pre-space-compress, end-space-
591     compress, empty-field-compress or zero-field-compress.
592   */
593   check_counts(huff_counts,fields,mrg->records);
594 
595   /*
596     Build a Huffman tree for each column.
597   */
598   huff_trees=make_huff_trees(huff_counts,trees);
599 
600   /*
601     If the packed lengths of combined columns is less then the sum of
602     the non-combined columns, then create common Huffman trees for them.
603     We do this only for uchar compressed columns, not for distinct values
604     compressed columns.
605   */
606   if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
607     goto err;
608 
609   /*
610     Assign codes to all uchar or column values.
611   */
612   if (make_huff_decode_table(huff_trees,fields))
613     goto err;
614 
615   /* Prepare a file buffer. */
616   init_file_buffer(new_file,0);
617 
618   /*
619     Reserve space in the target file for the fixed compressed file header.
620   */
621   file_buffer.pos_in_file=HEAD_LENGTH;
622   if (! test_only)
623     my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
624 
625   /*
626     Write field infos: field type, pack type, length bits, tree number.
627   */
628   write_field_info(huff_counts,fields,used_trees);
629 
630   /*
631     Write decode trees.
632   */
633   if (!(tot_elements=write_huff_tree(huff_trees,trees)))
634     goto err;
635 
636   /*
637     Calculate the total length of the compression info header.
638     This includes the fixed compressed file header, the column compression
639     type descriptions, and the decode trees.
640   */
641   header_length=(uint) file_buffer.pos_in_file+
642     (uint) (file_buffer.pos-file_buffer.buffer);
643 
644   /*
645     Compress the source file into the target file.
646   */
647   DBUG_PRINT("info", ("- Compressing file"));
648   if (write_loop || verbose)
649     printf("- Compressing file\n");
650   error=compress_maria_file(mrg,huff_counts);
651   new_length=file_buffer.pos_in_file;
652   if (!error && !test_only)
653   {
654     uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
655     bzero(buff,sizeof(buff));
656     error=my_write(file_buffer.file,buff,sizeof(buff),
657 		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
658   }
659 
660   /*
661     Write the fixed compressed file header.
662   */
663   if (!error)
664     error=write_header(mrg,header_length,used_trees,tot_elements,
665 		       new_length);
666 
667   /* Flush the file buffer. */
668   end_file_buffer();
669 
670   /* Display statistics. */
671   DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
672                       "Mean total length: %6ld",
673                       mrg->min_pack_length, mrg->max_pack_length,
674                       (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
675   if (verbose && mrg->records)
676     printf("Min record length: %6d   Max length: %6d   "
677                 "Mean total length: %6ld\n", mrg->min_pack_length,
678                 mrg->max_pack_length, (ulong) (new_length/mrg->records));
679 
680   /* Close source and target file. */
681   if (!test_only)
682   {
683     error|=my_close(new_file,MYF(MY_WME));
684     if (!result_table)
685     {
686       (void) flush_pagecache_blocks(isam_file->s->pagecache, &isam_file->dfile,
687                                     FLUSH_RELEASE);
688       error|=my_close(isam_file->dfile.file, MYF(MY_WME));
689       isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
690       isam_file->s->bitmap.file.file= -1;
691     }
692   }
693 
694   /* Cleanup. */
695   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
696   if (! test_only && ! error)
697   {
698     if (result_table)
699     {
700       error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
701     }
702     else
703     {
704       if (backup)
705       {
706 	if (my_rename(org_name,make_old_name(temp_name,
707                                              isam_file->s->open_file_name.str),
708 		      MYF(MY_WME)))
709 	  error=1;
710 	else
711 	{
712 	  if (tmp_dir[0])
713 	    error=my_copy(new_name,org_name,MYF(MY_WME));
714 	  else
715 	    error=my_rename(new_name,org_name,MYF(MY_WME));
716 	  if (!error)
717           {
718 	    my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
719             if (tmp_dir[0])
720               my_delete(new_name,MYF(MY_WME));
721           }
722 	}
723       }
724       else
725       {
726 	if (tmp_dir[0])
727         {
728 	  error=my_copy(new_name,org_name,
729 			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
730           if (!error)
731             my_delete(new_name,MYF(MY_WME));
732         }
733 	else
734 	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
735       }
736       if (! error)
737 	error=save_state(isam_file,mrg,new_length,glob_crc);
738     }
739   }
740   error|=mrg_close(mrg);
741   if (join_maria_file >= 0)
742     error|=my_close(join_maria_file,MYF(MY_WME));
743   if (error)
744   {
745     fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
746     my_delete(new_name,MYF(MY_WME));
747     DBUG_RETURN(-1);
748   }
749   if (write_loop || verbose)
750   {
751     if (old_length)
752       printf("%.4g%%     \n",
753                   (((longlong) (old_length - new_length)) * 100.0 /
754                    (longlong) old_length));
755     else
756       puts("Empty file saved in compressed format");
757   }
758   DBUG_RETURN(0);
759 
760  err:
761   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
762   if (new_file >= 0)
763     my_close(new_file,MYF(0));
764   if (join_maria_file >= 0)
765     my_close(join_maria_file,MYF(0));
766   mrg_close(mrg);
767   end_pagecache(maria_pagecache, 1);
768   fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
769   DBUG_RETURN(-1);
770 }
771 
772 	/* Init a huff_count-struct for each field and init it */
773 
init_huff_count(MARIA_HA * info,my_off_t records)774 static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
775 {
776   reg2 uint i;
777   reg1 HUFF_COUNTS *count;
778   if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
779 					sizeof(HUFF_COUNTS),
780 					MYF(MY_ZEROFILL | MY_WME))))
781   {
782     for (i=0 ; i < info->s->base.fields ; i++)
783     {
784       enum en_fieldtype type;
785       uint col_nr = info->s->columndef[i].column_nr;
786       count[col_nr].field_length=info->s->columndef[i].length;
787       type= count[col_nr].field_type=
788         (enum en_fieldtype) info->s->columndef[i].type;
789       if (type == FIELD_INTERVALL ||
790 	  type == FIELD_CONSTANT ||
791 	  type == FIELD_ZERO)
792 	type = FIELD_NORMAL;
793       if (count[col_nr].field_length <= 8 &&
794 	  (type == FIELD_NORMAL ||
795 	   type == FIELD_SKIP_ZERO))
796 	count[col_nr].max_zero_fill= count[col_nr].field_length;
797       /*
798         For every column initialize a tree, which is used to detect distinct
799         column values. 'int_tree' works together with 'tree_buff' and
800         'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
801         This is accomplished by '-1' as the element size.
802       */
803       init_tree(&count[col_nr].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
804 		NULL, MYF(0));
805       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
806 	count[col_nr].tree_pos=count[col_nr].tree_buff =
807 	  my_malloc(count[col_nr].field_length > 1 ? tree_buff_length : 2,
808 		    MYF(MY_WME));
809     }
810   }
811   return count;
812 }
813 
814 
815 	/* Free memory used by counts and trees */
816 
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)817 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
818 					   HUFF_COUNTS *huff_counts,
819 					   uint fields)
820 {
821   register uint i;
822 
823   if (huff_trees)
824   {
825     for (i=0 ; i < trees ; i++)
826     {
827       if (huff_trees[i].element_buffer)
828 	my_free(huff_trees[i].element_buffer);
829       if (huff_trees[i].code)
830 	my_free(huff_trees[i].code);
831     }
832     my_free(huff_trees);
833   }
834   if (huff_counts)
835   {
836     for (i=0 ; i < fields ; i++)
837     {
838       if (huff_counts[i].tree_buff)
839       {
840 	my_free(huff_counts[i].tree_buff);
841 	delete_tree(&huff_counts[i].int_tree, 0);
842       }
843     }
844     my_free(huff_counts);
845   }
846   delete_queue(&queue);		/* This is safe to free */
847   return;
848 }
849 
850 	/* Read through old file and gather some statistics */
851 
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)852 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
853 {
854   int error;
855   uint length, null_bytes;
856   ulong reclength,max_blob_length;
857   uchar *record,*pos,*next_pos,*end_pos,*start_pos;
858   ha_rows record_count;
859   HUFF_COUNTS *count,*end_count;
860   TREE_ELEMENT *element;
861   ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
862   DBUG_ENTER("get_statistic");
863 
864   reclength=  mrg->file[0]->s->base.reclength;
865   null_bytes= mrg->file[0]->s->base.null_bytes;
866   record=(uchar*) my_safe_alloca(reclength);
867   end_count=huff_counts+mrg->file[0]->s->base.fields;
868   record_count=0; glob_crc=0;
869   max_blob_length=0;
870 
871   /* Check how to calculate checksum */
872   if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
873     calc_checksum= _ma_static_checksum;
874   else
875     calc_checksum= _ma_checksum;
876 
877   mrg_reset(mrg);
878   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
879   {
880     ulong tot_blob_length=0;
881     if (! error)
882     {
883       /* glob_crc is a checksum over all bytes of all records. */
884       glob_crc+= (*calc_checksum)(mrg->file[0],record);
885 
886       /* Count the incidence of values separately for every column. */
887       for (pos=record + null_bytes, count=huff_counts ;
888 	   count < end_count ;
889 	   count++,
890 	   pos=next_pos)
891       {
892 	next_pos=end_pos=(start_pos=pos)+count->field_length;
893 
894 	/*
895           Put the whole column value in a tree if there is room for it.
896           'int_tree' is used to quickly check for duplicate values.
897           'tree_buff' collects as many distinct column values as
898           possible. If the field length is > 1, it is tree_buff_length,
899           else 2 bytes. Each value is 'field_length' bytes big. If there
900           are more distinct column values than fit into the buffer, we
901           give up with this tree. BLOBs and VARCHARs do not have a
902           tree_buff as it can only be used with fixed length columns.
903           For the special case of field length == 1, we handle only the
904           case that there is only one distinct value in the table(s).
905           Otherwise, we can have a maximum of 256 distinct values. This
906           is then handled by the normal Huffman tree build.
907 
908           Another limit for collecting distinct column values is the
909           number of values itself. Since we would need to build a
910           Huffman tree for the values, we are limited by the 'IS_OFFSET'
911           constant. This constant expresses a bit which is used to
912           determine if a tree element holds a final value or an offset
913           to a child element. Hence, all values and offsets need to be
914           smaller than 'IS_OFFSET'. A tree element is implemented with
915           two integer values, one for the left branch and one for the
916           right branch. For the extreme case that the first element
917           points to the last element, the number of integers in the tree
918           must be less or equal to IS_OFFSET. So the number of elements
919           must be less or equal to IS_OFFSET / 2.
920 
921           WARNING: At first, we insert a pointer into the record buffer
922           as the key for the tree. If we got a new distinct value, which
923           is really inserted into the tree, instead of being counted
924           only, we will copy the column value from the record buffer to
925           'tree_buff' and adjust the key pointer of the tree accordingly.
926         */
927 	if (count->tree_buff)
928 	{
929 	  global_count=count;
930 	  if (!(element=tree_insert(&count->int_tree,pos, 0,
931 				    count->int_tree.custom_arg)) ||
932 	      (element->count == 1 &&
933 	       (count->tree_buff + tree_buff_length <
934                 count->tree_pos + count->field_length)) ||
935               (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
936 	      (count->field_length == 1 &&
937 	       count->int_tree.elements_in_tree > 1))
938 	  {
939 	    delete_tree(&count->int_tree, 0);
940 	    my_free(count->tree_buff);
941 	    count->tree_buff=0;
942 	  }
943 	  else
944 	  {
945             /*
946               If tree_insert() succeeds, it either creates a new element
947               or increments the counter of an existing element.
948             */
949 	    if (element->count == 1)
950 	    {
951               /* Copy the new column value into 'tree_buff'. */
952 	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
953               /* Adjust the key pointer in the tree. */
954 	      tree_set_pointer(element,count->tree_pos);
955               /* Point behind the last column value so far. */
956 	      count->tree_pos+=count->field_length;
957 	    }
958 	  }
959 	}
960 
961 	/* Save character counters and space-counts and zero-field-counts */
962 	if (count->field_type == FIELD_NORMAL ||
963 	    count->field_type == FIELD_SKIP_ENDSPACE)
964 	{
965           /* Ignore trailing space. */
966 	  for ( ; end_pos > pos ; end_pos--)
967 	    if (end_pos[-1] != ' ')
968 	      break;
969           /* Empty fields are just counted. Go to the next record. */
970 	  if (end_pos == pos)
971 	  {
972 	    count->empty_fields++;
973 	    count->max_zero_fill=0;
974 	    continue;
975 	  }
976           /*
977             Count the total of all trailing spaces and the number of
978             short trailing spaces. Remember the longest trailing space.
979           */
980 	  length= (uint) (next_pos-end_pos);
981 	  count->tot_end_space+=length;
982 	  if (length < 8)
983 	    count->end_space[length]++;
984 	  if (count->max_end_space < length)
985 	    count->max_end_space = length;
986 	}
987 
988 	if (count->field_type == FIELD_NORMAL ||
989 	    count->field_type == FIELD_SKIP_PRESPACE)
990 	{
991           /* Ignore leading space. */
992 	  for (pos=start_pos; pos < end_pos ; pos++)
993 	    if (pos[0] != ' ')
994 	      break;
995           /* Empty fields are just counted. Go to the next record. */
996 	  if (end_pos == pos)
997 	  {
998 	    count->empty_fields++;
999 	    count->max_zero_fill=0;
1000 	    continue;
1001 	  }
1002           /*
1003             Count the total of all leading spaces and the number of
1004             short leading spaces. Remember the longest leading space.
1005           */
1006 	  length= (uint) (pos-start_pos);
1007 	  count->tot_pre_space+=length;
1008 	  if (length < 8)
1009 	    count->pre_space[length]++;
1010 	  if (count->max_pre_space < length)
1011 	    count->max_pre_space = length;
1012 	}
1013 
1014         /* Calculate pos, end_pos, and max_length for variable length fields. */
1015 	if (count->field_type == FIELD_BLOB)
1016 	{
1017 	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1018 	  ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
1019 	  memcpy(&pos,  start_pos+field_length,sizeof(char*));
1020 	  end_pos=pos+blob_length;
1021 	  tot_blob_length+=blob_length;
1022 	  set_if_bigger(count->max_length,blob_length);
1023 	}
1024 	else if (count->field_type == FIELD_VARCHAR)
1025 	{
1026           uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1027 	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1028                    uint2korr(start_pos));
1029 	  pos= start_pos+pack_length;
1030 	  end_pos= pos+length;
1031 	  set_if_bigger(count->max_length,length);
1032 	}
1033 
1034         /* Evaluate 'max_zero_fill' for short fields. */
1035 	if (count->field_length <= 8 &&
1036 	    (count->field_type == FIELD_NORMAL ||
1037 	     count->field_type == FIELD_SKIP_ZERO))
1038 	{
1039 	  uint i;
1040           /* Zero fields are just counted. Go to the next record. */
1041 	  if (!memcmp(start_pos, zero_string, count->field_length))
1042 	  {
1043 	    count->zero_fields++;
1044 	    continue;
1045 	  }
1046           /*
1047             max_zero_fill starts with field_length. It is decreased every
1048             time a shorter "zero trailer" is found. It is set to zero when
1049             an empty field is found (see above). This suggests that the
1050             variable should be called 'min_zero_fill'.
1051           */
1052 	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1053 	       i++) ;
1054 	  if (i < count->max_zero_fill)
1055 	    count->max_zero_fill=i;
1056 	}
1057 
1058         /* Ignore zero fields and check fields. */
1059 	if (count->field_type == FIELD_ZERO ||
1060 	    count->field_type == FIELD_CHECK)
1061 	  continue;
1062 
1063         /*
1064           Count the incidence of every uchar value in the
1065           significant field value.
1066         */
1067 	for ( ; pos < end_pos ; pos++)
1068 	  count->counts[(uchar) *pos]++;
1069 
1070         /* Step to next field. */
1071       }
1072 
1073       if (tot_blob_length > max_blob_length)
1074 	max_blob_length=tot_blob_length;
1075       record_count++;
1076       if (write_loop && record_count % WRITE_COUNT == 0)
1077       {
1078 	printf("%lu\r", (ulong) record_count);
1079         fflush(stdout);
1080       }
1081     }
1082     else if (error != HA_ERR_RECORD_DELETED)
1083     {
1084       fprintf(stderr, "Got error %d while reading rows\n", error);
1085       break;
1086     }
1087 
1088     /* Step to next record. */
1089   }
1090   if (write_loop)
1091   {
1092     printf("            \r");
1093     fflush(stdout);
1094   }
1095 
1096   /*
1097     If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1098     codes.
1099   */
1100   DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1101 
1102   DBUG_PRINT("info", ("Found the following number of incidents "
1103                       "of the uchar codes:"));
1104   if (verbose >= 2)
1105     printf("Found the following number of incidents "
1106                 "of the uchar codes:\n");
1107   for (count= huff_counts ; count < end_count; count++)
1108   {
1109     uint      idx;
1110     my_off_t  total_count;
1111     char      llbuf[32];
1112 
1113     DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1114     if (verbose >= 2)
1115       printf("column: %3u\n", (uint) (count - huff_counts + 1));
1116     if (count->tree_buff)
1117     {
1118       DBUG_PRINT("info", ("number of distinct values: %u",
1119                           (uint) ((count->tree_pos - count->tree_buff) /
1120                                   count->field_length)));
1121       if (verbose >= 2)
1122         printf("number of distinct values: %u\n",
1123                     (uint) ((count->tree_pos - count->tree_buff) /
1124                             count->field_length));
1125     }
1126     total_count= 0;
1127     for (idx= 0; idx < 256; idx++)
1128     {
1129       if (count->counts[idx])
1130       {
1131         total_count+= count->counts[idx];
1132         DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1133                             llstr((longlong) count->counts[idx], llbuf)));
1134         if (verbose >= 2)
1135           printf("counts[0x%02x]: %12s\n", idx,
1136                       llstr((longlong) count->counts[idx], llbuf));
1137       }
1138     }
1139     DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1140                                                     llbuf)));
1141     if ((verbose >= 2) && total_count)
1142     {
1143       printf("total:        %12s\n",
1144                   llstr((longlong) total_count, llbuf));
1145     }
1146   }
1147 
1148   mrg->records=record_count;
1149   mrg->max_blob_length=max_blob_length;
1150   my_safe_afree(record, reclength);
1151   DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1152 }
1153 
compare_huff_elements(void * not_used,uchar * a,uchar * b)1154 static int compare_huff_elements(void *not_used __attribute__((unused)),
1155 				 uchar *a, uchar *b)
1156 {
1157   return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1158     (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1159 }
1160 
1161 	/* Check each tree if we should use pre-space-compress, end-space-
1162 	   compress, empty-field-compress or zero-field-compress */
1163 
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1164 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1165 			 my_off_t records)
1166 {
1167   uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1168   my_off_t old_length,new_length,length;
1169   DBUG_ENTER("check_counts");
1170 
1171   bzero((uchar*) field_count,sizeof(field_count));
1172   space_fields=fill_zero_fields=0;
1173 
1174   for (; trees-- ; huff_counts++)
1175   {
1176     if (huff_counts->field_type == FIELD_BLOB)
1177     {
1178       huff_counts->length_bits=max_bit(huff_counts->max_length);
1179       goto found_pack;
1180     }
1181     else if (huff_counts->field_type == FIELD_VARCHAR)
1182     {
1183       huff_counts->length_bits=max_bit(huff_counts->max_length);
1184       goto found_pack;
1185     }
1186     else if (huff_counts->field_type == FIELD_CHECK)
1187     {
1188       huff_counts->bytes_packed=0;
1189       huff_counts->counts[0]=0;
1190       goto found_pack;
1191     }
1192 
1193     huff_counts->field_type=FIELD_NORMAL;
1194     huff_counts->pack_type=0;
1195 
1196     /* Check for zero-filled records (in this column), or zero records. */
1197     if (huff_counts->zero_fields || ! records)
1198     {
1199       my_off_t old_space_count;
1200       /*
1201         If there are only zero filled records (in this column),
1202         or no records at all, we are done.
1203       */
1204       if (huff_counts->zero_fields == records)
1205       {
1206 	huff_counts->field_type= FIELD_ZERO;
1207 	huff_counts->bytes_packed=0;
1208 	huff_counts->counts[0]=0;
1209 	goto found_pack;
1210       }
1211       /* Remeber the number of significant spaces. */
1212       old_space_count=huff_counts->counts[' '];
1213       /* Add all leading and trailing spaces. */
1214       huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1215                                   huff_counts->tot_pre_space +
1216                                   huff_counts->empty_fields *
1217                                   huff_counts->field_length);
1218       /* Check, what the compressed length of this would be. */
1219       old_length=calc_packed_length(huff_counts,0)+records/8;
1220       /* Get the number of zero bytes. */
1221       length=huff_counts->zero_fields*huff_counts->field_length;
1222       /* Add it to the counts. */
1223       huff_counts->counts[0]+=length;
1224       /* Check, what the compressed length of this would be. */
1225       new_length=calc_packed_length(huff_counts,0);
1226       /* If the compression without the zeroes would be shorter, we are done. */
1227       if (old_length < new_length && huff_counts->field_length > 1)
1228       {
1229 	huff_counts->field_type=FIELD_SKIP_ZERO;
1230 	huff_counts->counts[0]-=length;
1231 	huff_counts->bytes_packed=old_length- records/8;
1232 	goto found_pack;
1233       }
1234       /* Remove the insignificant spaces, but keep the zeroes. */
1235       huff_counts->counts[' ']=old_space_count;
1236     }
1237     /* Check, what the compressed length of this column would be. */
1238     huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1239 
1240     /*
1241       If there are enough empty records (in this column),
1242       treating them specially may pay off.
1243     */
1244     if (huff_counts->empty_fields)
1245     {
1246       if (huff_counts->field_length > 2 &&
1247 	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1248 	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1249 			 huff_counts->max_end_space))) <
1250 	  records * max_bit(huff_counts->field_length))
1251       {
1252 	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1253       }
1254       else
1255       {
1256 	length=huff_counts->empty_fields*huff_counts->field_length;
1257 	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1258 	{
1259 	  huff_counts->tot_end_space+=length;
1260 	  huff_counts->max_end_space=huff_counts->field_length;
1261 	  if (huff_counts->field_length < 8)
1262 	    huff_counts->end_space[huff_counts->field_length]+=
1263 	      huff_counts->empty_fields;
1264 	}
1265 	if (huff_counts->tot_pre_space)
1266 	{
1267 	  huff_counts->tot_pre_space+=length;
1268 	  huff_counts->max_pre_space=huff_counts->field_length;
1269 	  if (huff_counts->field_length < 8)
1270 	    huff_counts->pre_space[huff_counts->field_length]+=
1271 	      huff_counts->empty_fields;
1272 	}
1273       }
1274     }
1275 
1276     /*
1277       If there are enough trailing spaces (in this column),
1278       treating them specially may pay off.
1279     */
1280     if (huff_counts->tot_end_space)
1281     {
1282       huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1283       if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1284 			      huff_counts->end_space,
1285 			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1286 	goto found_pack;
1287       huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1288     }
1289 
1290     /*
1291       If there are enough leading spaces (in this column),
1292       treating them specially may pay off.
1293     */
1294     if (huff_counts->tot_pre_space)
1295     {
1296       if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1297 			      huff_counts->pre_space,
1298 			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1299 	goto found_pack;
1300     }
1301 
1302   found_pack:			/* Found field-packing */
1303 
1304     /* Test if we can use zero-fill */
1305 
1306     if (huff_counts->max_zero_fill &&
1307 	(huff_counts->field_type == FIELD_NORMAL ||
1308 	 huff_counts->field_type == FIELD_SKIP_ZERO))
1309     {
1310       huff_counts->counts[0]-=huff_counts->max_zero_fill*
1311 	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1312 	 records - huff_counts->zero_fields : records);
1313       huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1314       huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1315     }
1316 
1317     /* Test if intervall-field is better */
1318 
1319     if (huff_counts->tree_buff)
1320     {
1321       HUFF_TREE tree;
1322 
1323       DBUG_EXECUTE_IF("forceintervall",
1324                       huff_counts->bytes_packed= ~ (my_off_t) 0;);
1325       tree.element_buffer=0;
1326       if (!make_huff_tree(&tree,huff_counts) &&
1327 	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1328       {
1329 	if (tree.elements == 1)
1330 	  huff_counts->field_type=FIELD_CONSTANT;
1331 	else
1332 	  huff_counts->field_type=FIELD_INTERVALL;
1333 	huff_counts->pack_type=0;
1334       }
1335       else
1336       {
1337 	my_free(huff_counts->tree_buff);
1338 	delete_tree(&huff_counts->int_tree, 0);
1339 	huff_counts->tree_buff=0;
1340       }
1341       if (tree.element_buffer)
1342 	my_free(tree.element_buffer);
1343     }
1344     if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1345       space_fields++;
1346     if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1347       fill_zero_fields++;
1348     field_count[huff_counts->field_type]++;
1349   }
1350   DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1351                       "empty-zero:       %3d  empty-fill: %3d",
1352                       field_count[FIELD_NORMAL],space_fields,
1353                       field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1354   DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1355                       "intervall-fields: %3d  zero:       %3d",
1356                       field_count[FIELD_SKIP_PRESPACE],
1357                       field_count[FIELD_SKIP_ENDSPACE],
1358                       field_count[FIELD_INTERVALL],
1359                       field_count[FIELD_ZERO]));
1360   if (verbose)
1361     printf("\nnormal:    %3d  empty-space:     %3d  "
1362                 "empty-zero:       %3d  empty-fill: %3d\n"
1363                 "pre-space: %3d  end-space:       %3d  "
1364                 "intervall-fields: %3d  zero:       %3d\n",
1365                 field_count[FIELD_NORMAL],space_fields,
1366                 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1367                 field_count[FIELD_SKIP_PRESPACE],
1368                 field_count[FIELD_SKIP_ENDSPACE],
1369                 field_count[FIELD_INTERVALL],
1370                 field_count[FIELD_ZERO]);
1371   DBUG_VOID_RETURN;
1372 }
1373 
1374 
1375 /* Test if we can use space-compression and empty-field-compression */
1376 
1377 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1378 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1379 		    uint max_space_length, my_off_t *space_counts,
1380 		    my_off_t tot_space_count, enum en_fieldtype field_type)
1381 {
1382   int min_pos;
1383   uint length_bits,i;
1384   my_off_t space_count,min_space_count,min_pack,new_length,skip;
1385 
1386   length_bits=max_bit(max_space_length);
1387 
1388 		/* Default no end_space-packing */
1389   space_count=huff_counts->counts[(uint) ' '];
1390   min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1391   min_pack=calc_packed_length(huff_counts,0);
1392   min_pos= -2;
1393   huff_counts->counts[(uint) ' ']=space_count;
1394 
1395 	/* Test with allways space-count */
1396   new_length=huff_counts->bytes_packed+length_bits*records/8;
1397   if (new_length+1 < min_pack)
1398   {
1399     min_pos= -1;
1400     min_pack=new_length;
1401     min_space_count=space_count;
1402   }
1403 	/* Test with length-flag */
1404   for (skip=0L, i=0 ; i < 8 ; i++)
1405   {
1406     if (space_counts[i])
1407     {
1408       if (i)
1409 	huff_counts->counts[(uint) ' ']+=space_counts[i];
1410       skip+=huff_counts->pre_space[i];
1411       new_length=calc_packed_length(huff_counts,0)+
1412 	(records+(records-skip)*(1+length_bits))/8;
1413       if (new_length < min_pack)
1414       {
1415 	min_pos=(int) i;
1416 	min_pack=new_length;
1417 	min_space_count=huff_counts->counts[(uint) ' '];
1418       }
1419     }
1420   }
1421 
1422   huff_counts->counts[(uint) ' ']=min_space_count;
1423   huff_counts->bytes_packed=min_pack;
1424   switch (min_pos) {
1425   case -2:
1426     return(0);				/* No space-compress */
1427   case -1:				/* Always space-count */
1428     huff_counts->field_type=field_type;
1429     huff_counts->min_space=0;
1430     huff_counts->length_bits=max_bit(max_space_length);
1431     break;
1432   default:
1433     huff_counts->field_type=field_type;
1434     huff_counts->min_space=(uint) min_pos;
1435     huff_counts->pack_type|=PACK_TYPE_SELECTED;
1436     huff_counts->length_bits=max_bit(max_space_length);
1437     break;
1438   }
1439   return(1);				/* Using space-compress */
1440 }
1441 
1442 
1443 	/* Make a huff_tree of each huff_count */
1444 
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1445 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1446 {
1447   uint tree;
1448   HUFF_TREE *huff_tree;
1449   DBUG_ENTER("make_huff_trees");
1450 
1451   if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1452 					 MYF(MY_WME | MY_ZEROFILL))))
1453     DBUG_RETURN(0);
1454 
1455   for (tree=0 ; tree < trees ; tree++)
1456   {
1457     if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1458     {
1459       while (tree--)
1460 	my_free(huff_tree[tree].element_buffer);
1461       my_free(huff_tree);
1462       DBUG_RETURN(0);
1463     }
1464   }
1465   DBUG_RETURN(huff_tree);
1466 }
1467 
1468 /*
1469   Build a Huffman tree.
1470 
1471   SYNOPSIS
1472     make_huff_tree()
1473     huff_tree                   The Huffman tree.
1474     huff_counts                 The counts.
1475 
1476   DESCRIPTION
1477     Build a Huffman tree according to huff_counts->counts or
1478     huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1479     tree_buff_length of distinct column values. In that case, whole
1480     values can be Huffman encoded instead of single bytes.
1481 
1482   RETURN
1483     0           OK
1484     != 0        Error
1485 */
1486 
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1487 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1488 {
1489   uint i,found,bits_packed,first,last;
1490   my_off_t bytes_packed;
1491   HUFF_ELEMENT *a,*b,*new_huff_el;
1492 
1493   first=last=0;
1494   if (huff_counts->tree_buff)
1495   {
1496     /* Calculate the number of distinct values in tree_buff. */
1497     found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1498       huff_counts->field_length;
1499     first=0; last=found-1;
1500   }
1501   else
1502   {
1503     /* Count the number of uchar codes found in the column. */
1504     for (i=found=0 ; i < 256 ; i++)
1505     {
1506       if (huff_counts->counts[i])
1507       {
1508 	if (! found++)
1509 	  first=i;
1510 	last=i;
1511       }
1512     }
1513     if (found < 2)
1514       found=2;
1515   }
1516 
1517   /* When using 'tree_buff' we can have more that 256 values. */
1518   if (queue.max_elements < found)
1519   {
1520     delete_queue(&queue);
1521     if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
1522       return -1;
1523   }
1524 
1525   /* Allocate or reallocate an element buffer for the Huffman tree. */
1526   if (!huff_tree->element_buffer)
1527   {
1528     if (!(huff_tree->element_buffer=
1529 	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1530       return 1;
1531   }
1532   else
1533   {
1534     HUFF_ELEMENT *temp;
1535     if (!(temp=
1536 	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1537 				     found*2*sizeof(HUFF_ELEMENT),
1538 				     MYF(MY_WME))))
1539       return 1;
1540     huff_tree->element_buffer=temp;
1541   }
1542 
1543   huff_counts->tree=huff_tree;
1544   huff_tree->counts=huff_counts;
1545   huff_tree->min_chr=first;
1546   huff_tree->max_chr=last;
1547   huff_tree->char_bits=max_bit(last-first);
1548   huff_tree->offset_bits=max_bit(found-1)+1;
1549 
1550   if (huff_counts->tree_buff)
1551   {
1552     huff_tree->elements=0;
1553     huff_tree->tree_pack_length=(1+15+16+5+5+
1554 				 (huff_tree->char_bits+1)*found+
1555 				 (huff_tree->offset_bits+1)*
1556 				 (found-2)+7)/8 +
1557 				   (uint) (huff_tree->counts->tree_pos-
1558 					   huff_tree->counts->tree_buff);
1559     /*
1560       Put a HUFF_ELEMENT into the queue for every distinct column value.
1561 
1562       tree_walk() calls save_counts_in_queue() for every element in
1563       'int_tree'. This takes elements from the target trees element
1564       buffer and places references to them into the buffer of the
1565       priority queue. We insert in column value order, but the order is
1566       in fact irrelevant here. We will establish the correct order
1567       later.
1568     */
1569     tree_walk(&huff_counts->int_tree,
1570 	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1571 	      (uchar*) huff_tree, left_root_right);
1572   }
1573   else
1574   {
1575     huff_tree->elements=found;
1576     huff_tree->tree_pack_length=(9+9+5+5+
1577 				 (huff_tree->char_bits+1)*found+
1578 				 (huff_tree->offset_bits+1)*
1579 				 (found-2)+7)/8;
1580     /*
1581       Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
1582 
1583       The elements are taken from the target trees element buffer.
1584       Instead of using queue_insert(), we just place references to the
1585       elements into the buffer of the priority queue. We insert in byte
1586       value order, but the order is in fact irrelevant here. We will
1587       establish the correct order later.
1588     */
1589     for (i=first, found=0 ; i <= last ; i++)
1590     {
1591       if (huff_counts->counts[i])
1592       {
1593 	new_huff_el=huff_tree->element_buffer+(found++);
1594 	new_huff_el->count=huff_counts->counts[i];
1595 	new_huff_el->a.leaf.null=0;
1596 	new_huff_el->a.leaf.element_nr=i;
1597 	queue.root[found]=(uchar*) new_huff_el;
1598       }
1599     }
1600     /*
1601       If there is only a single uchar value in this field in all records,
1602       add a second element with zero incidence. This is required to enter
1603       the loop, which builds the Huffman tree.
1604     */
1605     while (found < 2)
1606     {
1607       new_huff_el=huff_tree->element_buffer+(found++);
1608       new_huff_el->count=0;
1609       new_huff_el->a.leaf.null=0;
1610       if (last)
1611 	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1612       else
1613 	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1614       queue.root[found]=(uchar*) new_huff_el;
1615     }
1616   }
1617 
1618   /* Make a queue from the queue buffer. */
1619   queue.elements=found;
1620 
1621   /*
1622     Make a priority queue from the queue. Construct its index so that we
1623     have a partially ordered tree.
1624   */
1625   queue_fix(&queue);
1626 
1627   /* The Huffman algorithm. */
1628   bytes_packed=0; bits_packed=0;
1629   for (i=1 ; i < found ; i++)
1630   {
1631     /*
1632       Pop the top element from the queue (the one with the least incidence).
1633       Popping from a priority queue includes a re-ordering of the queue,
1634       to get the next least incidence element to the top.
1635     */
1636     a=(HUFF_ELEMENT*) queue_remove_top(&queue);
1637     /* Copy the next least incidence element */
1638     b=(HUFF_ELEMENT*) queue_top(&queue);
1639     /* Get a new element from the element buffer. */
1640     new_huff_el=huff_tree->element_buffer+found+i;
1641     /* The new element gets the sum of the two least incidence elements. */
1642     new_huff_el->count=a->count+b->count;
1643     /*
1644       The Huffman algorithm assigns another bit to the code for a byte
1645       every time that bytes incidence is combined (directly or indirectly)
1646       to a new element as one of the two least incidence elements.
1647       This means that one more bit per incidence of that uchar is required
1648       in the resulting file. So we add the new combined incidence as the
1649       number of bits by which the result grows.
1650     */
1651     bits_packed+=(uint) (new_huff_el->count & 7);
1652     bytes_packed+=new_huff_el->count/8;
1653     /* The new element points to its children, lesser in left.  */
1654     new_huff_el->a.nod.left=a;
1655     new_huff_el->a.nod.right=b;
1656     /*
1657       Replace the copied top element by the new element and re-order the
1658       queue.
1659     */
1660     queue_top(&queue)= (uchar*) new_huff_el;
1661     queue_replace_top(&queue);
1662   }
1663   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1664   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1665   return 0;
1666 }
1667 
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1668 static int compare_tree(void* cmp_arg __attribute__((unused)),
1669 			register const uchar *s, register const uchar *t)
1670 {
1671   uint length;
1672   for (length=global_count->field_length; length-- ;)
1673     if (*s++ != *t++)
1674       return (int) s[-1] - (int) t[-1];
1675   return 0;
1676 }
1677 
1678 /*
1679   Organize distinct column values and their incidences into a priority queue.
1680 
1681   SYNOPSIS
1682     save_counts_in_queue()
1683     key                         The column value.
1684     count                       The incidence of this value.
1685     tree                        The Huffman tree to be built later.
1686 
1687   DESCRIPTION
1688     We use the element buffer of the targeted tree. The distinct column
1689     values are organized in a priority queue first. The Huffman
1690     algorithm will later organize the elements into a Huffman tree. For
1691     the time being, we just place references to the elements into the
1692     queue buffer. The buffer will later be organized into a priority
1693     queue.
1694 
1695   RETURN
1696     0
1697  */
1698 
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1699 static int save_counts_in_queue(uchar *key, element_count count,
1700 				HUFF_TREE *tree)
1701 {
1702   HUFF_ELEMENT *new_huff_el;
1703 
1704   new_huff_el=tree->element_buffer+(tree->elements++);
1705   new_huff_el->count=count;
1706   new_huff_el->a.leaf.null=0;
1707   new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1708     tree->counts->field_length;
1709   queue.root[tree->elements]=(uchar*) new_huff_el;
1710   return 0;
1711 }
1712 
1713 
1714 /*
1715   Calculate length of file if given counts should be used.
1716 
1717   SYNOPSIS
1718     calc_packed_length()
1719     huff_counts                 The counts for a column of the table(s).
1720     add_tree_lenght             If the decode tree length should be added.
1721 
1722   DESCRIPTION
1723     We need to follow the Huffman algorithm until we know, how many bits
1724     are required for each uchar code. But we do not need the resulting
1725     Huffman tree. Hence, we can leave out some steps which are essential
1726     in make_huff_tree().
1727 
1728   RETURN
1729     Number of bytes required to compress this table column.
1730 */
1731 
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1732 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1733 				   uint add_tree_lenght)
1734 {
1735   uint i,found,bits_packed,first,last;
1736   my_off_t bytes_packed;
1737   HUFF_ELEMENT element_buffer[256];
1738   DBUG_ENTER("calc_packed_length");
1739 
1740   /*
1741     WARNING: We use a small hack for efficiency: Instead of placing
1742     references to HUFF_ELEMENTs into the queue, we just insert
1743     references to the counts of the uchar codes which appeared in this
1744     table column. During the Huffman algorithm they are successively
1745     replaced by references to HUFF_ELEMENTs. This works, because
1746     HUFF_ELEMENTs have the incidence count at their beginning.
1747     Regardless, wether the queue array contains references to counts of
1748     type my_off_t or references to HUFF_ELEMENTs which have the count of
1749     type my_off_t at their beginning, it always points to a count of the
1750     same type.
1751 
1752     Instead of using queue_insert(), we just copy the references into
1753     the buffer of the priority queue. We insert in uchar value order, but
1754     the order is in fact irrelevant here. We will establish the correct
1755     order later.
1756   */
1757   first=last=0;
1758   for (i=found=0 ; i < 256 ; i++)
1759   {
1760     if (huff_counts->counts[i])
1761     {
1762       if (! found++)
1763 	first=i;
1764       last=i;
1765       /* We start with root[1], which is the queues top element. */
1766       queue.root[found]=(uchar*) &huff_counts->counts[i];
1767     }
1768   }
1769   if (!found)
1770     DBUG_RETURN(0);			/* Empty tree */
1771   /*
1772     If there is only a single uchar value in this field in all records,
1773     add a second element with zero incidence. This is required to enter
1774     the loop, which follows the Huffman algorithm.
1775   */
1776   if (found < 2)
1777     queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1778 
1779   /* Make a queue from the queue buffer. */
1780   queue.elements=found;
1781 
1782   bytes_packed=0; bits_packed=0;
1783   /* Add the length of the coding table, which would become part of the file. */
1784   if (add_tree_lenght)
1785     bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1786 		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1787 
1788   /*
1789     Make a priority queue from the queue. Construct its index so that we
1790     have a partially ordered tree.
1791   */
1792   queue_fix(&queue);
1793 
1794   /* The Huffman algorithm. */
1795   for (i=0 ; i < found-1 ; i++)
1796   {
1797     my_off_t        *a;
1798     my_off_t        *b;
1799     HUFF_ELEMENT    *new_huff_el;
1800 
1801     /*
1802       Pop the top element from the queue (the one with the least
1803       incidence). Popping from a priority queue includes a re-ordering
1804       of the queue, to get the next least incidence element to the top.
1805     */
1806     a= (my_off_t*) queue_remove_top(&queue);
1807     /* Copy the next least incidence element. */
1808     b= (my_off_t*) queue_top(&queue);
1809     /* Create a new element in a local (automatic) buffer. */
1810     new_huff_el= element_buffer + i;
1811     /* The new element gets the sum of the two least incidence elements. */
1812     new_huff_el->count= *a + *b;
1813     /*
1814       The Huffman algorithm assigns another bit to the code for a byte
1815       every time that bytes incidence is combined (directly or indirectly)
1816       to a new element as one of the two least incidence elements.
1817       This means that one more bit per incidence of that uchar is required
1818       in the resulting file. So we add the new combined incidence as the
1819       number of bits by which the result grows.
1820     */
1821     bits_packed+=(uint) (new_huff_el->count & 7);
1822     bytes_packed+=new_huff_el->count/8;
1823     /*
1824       Replace the copied top element by the new element and re-order the
1825       queue. This successively replaces the references to counts by
1826       references to HUFF_ELEMENTs.
1827     */
1828     queue_top(&queue)= (uchar*) new_huff_el;
1829     queue_replace_top(&queue);
1830   }
1831   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1832 }
1833 
1834 
1835 	/* Remove trees that don't give any compression */
1836 
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1837 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1838 {
1839   uint k,tree_number;
1840   HUFF_COUNTS count,*i,*j,*last_count;
1841 
1842   last_count=huff_counts+trees;
1843   for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1844   {
1845     if (!i->tree->tree_number)
1846     {
1847       i->tree->tree_number= ++tree_number;
1848       if (i->tree_buff)
1849 	continue;			/* Don't join intervall */
1850       for (j=i+1 ; j < last_count ; j++)
1851       {
1852 	if (! j->tree->tree_number && ! j->tree_buff)
1853 	{
1854 	  for (k=0 ; k < 256 ; k++)
1855 	    count.counts[k]=i->counts[k]+j->counts[k];
1856 	  if (calc_packed_length(&count,1) <=
1857 	      i->tree->bytes_packed + j->tree->bytes_packed+
1858 	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1859 	      ALLOWED_JOIN_DIFF)
1860 	  {
1861 	    memcpy(i->counts,(uchar*) count.counts, sizeof(count.counts[0])*256);
1862 	    my_free(j->tree->element_buffer);
1863 	    j->tree->element_buffer=0;
1864 	    j->tree=i->tree;
1865 	    bmove((uchar*) i->counts,(uchar*) count.counts,
1866 		  sizeof(count.counts[0])*256);
1867 	    if (make_huff_tree(i->tree,i))
1868 	      return (uint) -1;
1869 	  }
1870 	}
1871       }
1872     }
1873   }
1874   DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1875                       trees, tree_number));
1876   if (verbose)
1877     printf("Original trees:  %d  After join: %d\n", trees, tree_number);
1878   return tree_number;			/* Return trees left */
1879 }
1880 
1881 
1882 /*
1883   Fill in huff_tree encode tables.
1884 
1885   SYNOPSIS
1886     make_huff_decode_table()
1887     huff_tree               An array of HUFF_TREE which are to be encoded.
1888     trees                   The number of HUFF_TREE in the array.
1889 
1890   RETURN
1891     0           success
1892     != 0        error
1893 */
1894 
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1895 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1896 {
1897   uint elements;
1898   for ( ; trees-- ; huff_tree++)
1899   {
1900     if (huff_tree->tree_number > 0)
1901     {
1902       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1903       if (!(huff_tree->code =
1904             (ulonglong*) my_malloc(elements*
1905                                    (sizeof(ulonglong) + sizeof(uchar)),
1906                                    MYF(MY_WME | MY_ZEROFILL))))
1907 	return 1;
1908       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1909       make_traverse_code_tree(huff_tree, huff_tree->root,
1910                               8 * sizeof(ulonglong), 0);
1911     }
1912   }
1913   return 0;
1914 }
1915 
1916 
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)1917 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1918 				    HUFF_ELEMENT *element,
1919 				    uint size, ulonglong code)
1920 {
1921   uint chr;
1922   if (!element->a.leaf.null)
1923   {
1924     chr=element->a.leaf.element_nr;
1925     huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1926     huff_tree->code[chr]= (code >> size);
1927     if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1928         huff_tree->height= 8 * sizeof(ulonglong) - size;
1929   }
1930   else
1931   {
1932     size--;
1933     make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1934     make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1935 			    code + (((ulonglong) 1) << size));
1936   }
1937   return;
1938 }
1939 
1940 
1941 /*
1942   Convert a value into binary digits.
1943 
1944   SYNOPSIS
1945     bindigits()
1946     value                       The value.
1947     length                      The number of low order bits to convert.
1948 
1949   NOTE
1950     The result string is in static storage. It is reused on every call.
1951     So you cannot use it twice in one expression.
1952 
1953   RETURN
1954     A pointer to a static NUL-terminated string.
1955  */
1956 
bindigits(ulonglong value,uint bits)1957 static char *bindigits(ulonglong value, uint bits)
1958 {
1959   static char digits[72];
1960   char *ptr= digits;
1961   uint idx= bits;
1962 
1963   DBUG_ASSERT(idx < sizeof(digits));
1964   while (idx)
1965     *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1966   *ptr= '\0';
1967   return digits;
1968 }
1969 
1970 
1971 /*
1972   Convert a value into hexadecimal digits.
1973 
1974   SYNOPSIS
1975     hexdigits()
1976     value                       The value.
1977 
1978   NOTE
1979     The result string is in static storage. It is reused on every call.
1980     So you cannot use it twice in one expression.
1981 
1982   RETURN
1983     A pointer to a static NUL-terminated string.
1984  */
1985 
hexdigits(ulonglong value)1986 static char *hexdigits(ulonglong value)
1987 {
1988   static char digits[20];
1989   char *ptr= digits;
1990   uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
1991 
1992   DBUG_ASSERT(idx < sizeof(digits));
1993   while (idx)
1994   {
1995     if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
1996       *(ptr - 1)+= 'a' - '9' - 1;
1997   }
1998   *ptr= '\0';
1999   return digits;
2000 }
2001 
2002 
2003 	/* Write header to new packed data file */
2004 
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2005 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2006 			my_off_t tot_elements,my_off_t filelength)
2007 {
2008   uchar *buff= (uchar*) file_buffer.pos;
2009 
2010   bzero(buff,HEAD_LENGTH);
2011   memcpy(buff,maria_pack_file_magic,4);
2012   int4store(buff+4,head_length);
2013   int4store(buff+8, mrg->min_pack_length);
2014   int4store(buff+12,mrg->max_pack_length);
2015   int4store(buff+16,tot_elements);
2016   int4store(buff+20,intervall_length);
2017   int2store(buff+24,trees);
2018   buff[26]=(char) mrg->ref_length;
2019 	/* Save record pointer length */
2020   buff[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
2021   if (test_only)
2022     return 0;
2023   my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2024   return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2025 		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2026 }
2027 
2028 	/* Write fieldinfo to new packed file */
2029 
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2030 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2031 {
2032   reg1 uint i;
2033   uint huff_tree_bits;
2034   huff_tree_bits=max_bit(trees ? trees-1 : 0);
2035 
2036   DBUG_PRINT("info", (" "));
2037   DBUG_PRINT("info", ("column types:"));
2038   DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2039   DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2040   DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2041   DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2042   DBUG_PRINT("info", ("FIELD_BLOB            4"));
2043   DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2044   DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2045   DBUG_PRINT("info", ("FIELD_ZERO            7"));
2046   DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2047   DBUG_PRINT("info", ("FIELD_CHECK           9"));
2048   DBUG_PRINT("info", (" "));
2049   DBUG_PRINT("info", ("pack type as a set of flags:"));
2050   DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2051   DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2052   DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2053   DBUG_PRINT("info", (" "));
2054   if (verbose >= 2)
2055   {
2056     printf("\n");
2057     printf("column types:\n");
2058     printf("FIELD_NORMAL          0\n");
2059     printf("FIELD_SKIP_ENDSPACE   1\n");
2060     printf("FIELD_SKIP_PRESPACE   2\n");
2061     printf("FIELD_SKIP_ZERO       3\n");
2062     printf("FIELD_BLOB            4\n");
2063     printf("FIELD_CONSTANT        5\n");
2064     printf("FIELD_INTERVALL       6\n");
2065     printf("FIELD_ZERO            7\n");
2066     printf("FIELD_VARCHAR         8\n");
2067     printf("FIELD_CHECK           9\n");
2068     printf("\n");
2069     printf("pack type as a set of flags:\n");
2070     printf("PACK_TYPE_SELECTED      1\n");
2071     printf("PACK_TYPE_SPACE_FIELDS  2\n");
2072     printf("PACK_TYPE_ZERO_FILL     4\n");
2073     printf("\n");
2074   }
2075   for (i=0 ; i++ < fields ; counts++)
2076   {
2077     write_bits((ulonglong) (int) counts->field_type, 5);
2078     write_bits(counts->pack_type,6);
2079     if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2080       write_bits(counts->max_zero_fill,5);
2081     else
2082       write_bits(counts->length_bits,5);
2083     write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2084     DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2085                         "lbits: %2u  tree: %2u  length: %4u",
2086                         i , counts->field_type, counts->pack_type,
2087                         counts->max_zero_fill, counts->length_bits,
2088                         counts->tree->tree_number, counts->field_length));
2089     if (verbose >= 2)
2090       printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2091                   "tree: %2u  length: %4u\n", i , counts->field_type,
2092                   counts->pack_type, counts->max_zero_fill, counts->length_bits,
2093                   counts->tree->tree_number, counts->field_length);
2094   }
2095   flush_bits();
2096   return;
2097 }
2098 
2099 	/* Write all huff_trees to new datafile. Return tot count of
2100 	   elements in all trees
2101 	   Returns 0 on error */
2102 
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2103 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2104 {
2105   uint i,int_length;
2106   uint tree_no;
2107   uint codes;
2108   uint errors= 0;
2109   uint *packed_tree,*offset,length;
2110   my_off_t elements;
2111 
2112   /* Find the highest number of elements in the trees. */
2113   for (i=length=0 ; i < trees ; i++)
2114     if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2115       length=huff_tree[i].elements;
2116   /*
2117     Allocate a buffer for packing a decode tree. Two numbers per element
2118     (left child and right child).
2119   */
2120   if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2121   {
2122     my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
2123     return 0;
2124   }
2125 
2126   DBUG_PRINT("info", (" "));
2127   if (verbose >= 2)
2128     printf("\n");
2129   tree_no= 0;
2130   intervall_length=0;
2131   for (elements=0; trees-- ; huff_tree++)
2132   {
2133     /* Skip columns that have been joined with other columns. */
2134     if (huff_tree->tree_number == 0)
2135       continue;				/* Deleted tree */
2136     tree_no++;
2137     DBUG_PRINT("info", (" "));
2138     if (verbose >= 3)
2139       printf("\n");
2140     /* Count the total number of elements (byte codes or column values). */
2141     elements+=huff_tree->elements;
2142     huff_tree->max_offset=2;
2143     /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2144     if (huff_tree->elements <= 1)
2145       offset=packed_tree;
2146     else
2147       offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2148 
2149     /* This should be the same as 'length' above. */
2150     huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2151 
2152     /*
2153       Since we check this during collecting the distinct column values,
2154       this should never happen.
2155     */
2156     if (huff_tree->max_offset >= IS_OFFSET)
2157     {				/* This should be impossible */
2158       fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2159                    huff_tree->max_offset);
2160       my_afree(packed_tree);
2161       return 0;
2162     }
2163 
2164     DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2165                         "char_bits: %u\n",
2166                         (ulong) (file_buffer.pos - file_buffer.buffer),
2167                         huff_tree->elements, (ulong) (offset - packed_tree),
2168                         huff_tree->char_bits));
2169     if (!huff_tree->counts->tree_buff)
2170     {
2171       /* We do a uchar compression on this column. Mark with bit 0. */
2172       write_bits(0,1);
2173       write_bits(huff_tree->min_chr,8);
2174       write_bits(huff_tree->elements,9);
2175       write_bits(huff_tree->char_bits,5);
2176       write_bits(huff_tree->offset_bits,5);
2177       int_length=0;
2178     }
2179     else
2180     {
2181       int_length=(uint) (huff_tree->counts->tree_pos -
2182 			 huff_tree->counts->tree_buff);
2183       /* We have distinct column values for this column. Mark with bit 1. */
2184       write_bits(1,1);
2185       write_bits(huff_tree->elements,15);
2186       write_bits(int_length,16);
2187       write_bits(huff_tree->char_bits,5);
2188       write_bits(huff_tree->offset_bits,5);
2189       intervall_length+=int_length;
2190     }
2191     DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2192                         "offset_bits: %2u  %s: %5u  codelen: %2u",
2193                         tree_no, huff_tree->elements, huff_tree->char_bits,
2194                         huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2195                         "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2196                         int_length : huff_tree->min_chr, huff_tree->height));
2197     if (verbose >= 2)
2198       printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2199                   "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2200                   huff_tree->char_bits, huff_tree->offset_bits,
2201                   huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2202                   huff_tree->counts->tree_buff ? int_length :
2203                   huff_tree->min_chr, huff_tree->height);
2204 
2205     /* Check that the code tree length matches the element count. */
2206     length=(uint) (offset-packed_tree);
2207     if (length != huff_tree->elements*2-2)
2208     {
2209       fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2210                    length, huff_tree->elements * 2 - 2);
2211       errors++;
2212       break;
2213     }
2214 
2215     for (i=0 ; i < length ; i++)
2216     {
2217       if (packed_tree[i] & IS_OFFSET)
2218 	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2219 		   huff_tree->offset_bits+1);
2220       else
2221 	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2222       DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2223                           i, (packed_tree[i] & IS_OFFSET) ?
2224                           " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2225                           packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2226       if (verbose >= 3)
2227         printf("tree[0x%04x]: %s0x%04x\n",
2228                     i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2229                     (packed_tree[i] & IS_OFFSET) ?
2230                     packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2231     }
2232     flush_bits();
2233 
2234     /*
2235       Display coding tables and check their correctness.
2236     */
2237     codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2238     for (i= 0; i < codes; i++)
2239     {
2240       ulonglong code;
2241       uint bits;
2242       uint len;
2243       uint idx;
2244 
2245       if (! (len= huff_tree->code_len[i]))
2246         continue;
2247       DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2248                           hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2249                           bindigits(huff_tree->code[i],
2250                                     huff_tree->code_len[i])));
2251       if (verbose >= 3)
2252         printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2253                     hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2254                     bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2255 
2256       /* Check that the encode table decodes correctly. */
2257       code= 0;
2258       bits= 0;
2259       idx= 0;
2260       DBUG_EXECUTE_IF("forcechkerr1", len--;);
2261       DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2262       DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2263       for (;;)
2264       {
2265         if (! len)
2266         {
2267           fflush(stdout);
2268           fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2269                        hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2270           errors++;
2271           break;
2272         }
2273         code<<= 1;
2274         code|= (huff_tree->code[i] >> (--len)) & 1;
2275         bits++;
2276         if (bits > 8 * sizeof(code))
2277         {
2278           fflush(stdout);
2279           fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2280                        bits, (uint) (8 * sizeof(code)));
2281           errors++;
2282           break;
2283         }
2284         idx+= (uint) code & 1;
2285         if (idx >= length)
2286         {
2287           fflush(stdout);
2288           fprintf(stderr, "error: illegal tree offset: %u/%u\n", idx, length);
2289           errors++;
2290           break;
2291         }
2292         if (packed_tree[idx] & IS_OFFSET)
2293           idx+= packed_tree[idx] & ~IS_OFFSET;
2294         else
2295           break; /* Hit a leaf. This contains the result value. */
2296       }
2297       if (errors)
2298         break;
2299 
2300       DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2301       if (packed_tree[idx] != i)
2302       {
2303         fflush(stdout);
2304         fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2305                      packed_tree[idx], i);
2306         errors++;
2307         break;
2308       }
2309     } /*end for (codes)*/
2310     if (errors)
2311       break;
2312 
2313     /* Write column values in case of distinct column value compression. */
2314     if (huff_tree->counts->tree_buff)
2315     {
2316       for (i=0 ; i < int_length ; i++)
2317       {
2318  	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2319         DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2320                             i, (uchar) huff_tree->counts->tree_buff[i]));
2321         if (verbose >= 3)
2322           printf("column_values[0x%04x]: 0x%02x\n",
2323                       i, (uchar) huff_tree->counts->tree_buff[i]);
2324       }
2325     }
2326     flush_bits();
2327   }
2328   DBUG_PRINT("info", (" "));
2329   if (verbose >= 2)
2330     printf("\n");
2331   my_afree(packed_tree);
2332   if (errors)
2333   {
2334     fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2335     return 0;
2336   }
2337   return elements;
2338 }
2339 
2340 
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2341 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2342 				   uint *offset)
2343 {
2344   uint *prev_offset;
2345 
2346   prev_offset= offset;
2347   /*
2348     'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2349     then there is no left child and, hence no right child either. This
2350     is a property of a binary tree. An element is either a node with two
2351     childs, or a leaf without childs.
2352 
2353     The current element is always a node with two childs. Go left first.
2354   */
2355   if (!element->a.nod.left->a.leaf.null)
2356   {
2357     /* Store the uchar code or the index of the column value. */
2358     prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2359     offset+=2;
2360   }
2361   else
2362   {
2363     /*
2364       Recursively traverse the tree to the left. Mark it as an offset to
2365       another tree node (in contrast to a uchar code or column value index).
2366     */
2367     prev_offset[0]= IS_OFFSET+2;
2368     offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2369   }
2370 
2371   /* Now, check the right child. */
2372   if (!element->a.nod.right->a.leaf.null)
2373   {
2374     /* Store the uchar code or the index of the column value. */
2375     prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2376     return offset;
2377   }
2378   else
2379   {
2380     /*
2381       Recursively traverse the tree to the right. Mark it as an offset to
2382       another tree node (in contrast to a uchar code or column value index).
2383     */
2384     uint temp=(uint) (offset-prev_offset-1);
2385     prev_offset[1]= IS_OFFSET+ temp;
2386     if (huff_tree->max_offset < temp)
2387       huff_tree->max_offset = temp;
2388     return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2389   }
2390 }
2391 
2392 	/* Get number of bits neaded to represent value */
2393 
max_bit(register uint value)2394 static uint max_bit(register uint value)
2395 {
2396   reg2 uint power=1;
2397 
2398   while ((value>>=1))
2399     power++;
2400   return (power);
2401 }
2402 
2403 
compress_maria_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2404 static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2405 {
2406   int error;
2407   uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
2408   uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
2409   my_off_t record_count;
2410   char llbuf[32];
2411   ulong length,pack_length;
2412   uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2413   HUFF_COUNTS *count,*end_count;
2414   HUFF_TREE *tree;
2415   MARIA_HA *isam_file=mrg->file[0];
2416   uint pack_version= (uint) isam_file->s->pack.version;
2417   DBUG_ENTER("compress_maria_file");
2418 
2419   /* Allocate a buffer for the records (excluding blobs). */
2420   if (!(record=(uchar*) my_safe_alloca(isam_file->s->base.reclength)))
2421     return -1;
2422 
2423   end_count=huff_counts+isam_file->s->base.fields;
2424   min_record_length= (uint) ~0;
2425   max_record_length=0;
2426   null_bytes= isam_file->s->base.null_bytes;
2427 
2428   /*
2429     Calculate the maximum number of bits required to pack the records.
2430     Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2431     The tree height determines the maximum number of bits per value.
2432     Some fields skip leading or trailing spaces or zeroes. The skipped
2433     number of bytes is encoded by 'length_bits' bits.
2434     Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2435     and varchar get a leading 0 bit.
2436   */
2437   max_calc_length= null_bytes;
2438   for (i= 0 ; i < isam_file->s->base.fields ; i++)
2439   {
2440     if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2441       huff_counts[i].max_zero_fill=0;
2442     if (huff_counts[i].field_type == FIELD_CONSTANT ||
2443 	huff_counts[i].field_type == FIELD_ZERO ||
2444 	huff_counts[i].field_type == FIELD_CHECK)
2445       continue;
2446     if (huff_counts[i].field_type == FIELD_INTERVALL)
2447       max_calc_length+=huff_counts[i].tree->height;
2448     else if (huff_counts[i].field_type == FIELD_BLOB ||
2449 	     huff_counts[i].field_type == FIELD_VARCHAR)
2450       max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2451     else
2452       max_calc_length+=
2453 	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2454 	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2455   }
2456   max_calc_length= (max_calc_length + 7) / 8;
2457   pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
2458   record_count=0;
2459   /* 'max_blob_length' is the max length of all blobs of a record. */
2460   pack_blob_length= isam_file->s->base.blobs ?
2461                     _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2462   max_pack_length=pack_ref_length+pack_blob_length;
2463 
2464   DBUG_PRINT("fields", ("==="));
2465   mrg_reset(mrg);
2466   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2467   {
2468     ulong tot_blob_length=0;
2469     if (! error)
2470     {
2471       if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
2472                        null_bytes))
2473 	break;
2474       record_pos= file_buffer.pos;
2475       file_buffer.pos+= max_pack_length;
2476       if (null_bytes)
2477       {
2478         /* Copy null bits 'as is' */
2479         memcpy(file_buffer.pos, record, null_bytes);
2480         file_buffer.pos+= null_bytes;
2481       }
2482       for (start_pos=record+null_bytes, count= huff_counts;
2483            count < end_count ;
2484            count++)
2485       {
2486 	end_pos=start_pos+(field_length=count->field_length);
2487 	tree=count->tree;
2488 
2489         DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2490                               "lbits: %2u  tree: %2u  length: %4u",
2491                               (ulong) (count - huff_counts + 1),
2492                               count->field_type,
2493                               count->pack_type, count->max_zero_fill,
2494                               count->length_bits, count->tree->tree_number,
2495                               count->field_length));
2496 
2497         /* Check if the column contains spaces only. */
2498 	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2499 	{
2500 	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2501 	  if (pos == end_pos)
2502 	  {
2503             DBUG_PRINT("fields",
2504                        ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2505             DBUG_PRINT("fields", ("---"));
2506 	    write_bits(1,1);
2507 	    start_pos=end_pos;
2508 	    continue;
2509 	  }
2510           DBUG_PRINT("fields",
2511                      ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2512 	  write_bits(0,1);
2513 	}
2514 	end_pos-=count->max_zero_fill;
2515 	field_length-=count->max_zero_fill;
2516 
2517 	switch (count->field_type) {
2518 	case FIELD_SKIP_ZERO:
2519 	  if (!memcmp(start_pos, zero_string, field_length))
2520 	  {
2521             DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2522 	    write_bits(1,1);
2523 	    start_pos=end_pos;
2524 	    break;
2525 	  }
2526           DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2527 	  write_bits(0,1);
2528 	  /* Fall through */
2529 	case FIELD_NORMAL:
2530           DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2531                                 (ulong) (end_pos - start_pos)));
2532 	  for ( ; start_pos < end_pos ; start_pos++)
2533           {
2534             DBUG_PRINT("fields",
2535                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2536                         (uchar) *start_pos,
2537                         hexdigits(tree->code[(uchar) *start_pos]),
2538                         (uint) tree->code_len[(uchar) *start_pos],
2539                         bindigits(tree->code[(uchar) *start_pos],
2540                                   (uint) tree->code_len[(uchar) *start_pos])));
2541 	    write_bits(tree->code[(uchar) *start_pos],
2542 		       (uint) tree->code_len[(uchar) *start_pos]);
2543           }
2544 	  break;
2545 	case FIELD_SKIP_ENDSPACE:
2546 	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2547 	  length= (ulong) (end_pos - pos);
2548 	  if (count->pack_type & PACK_TYPE_SELECTED)
2549 	  {
2550 	    if (length > count->min_space)
2551 	    {
2552               DBUG_PRINT("fields",
2553                          ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2554               DBUG_PRINT("fields",
2555                          ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2556                           length, field_length, count->length_bits));
2557 	      write_bits(1,1);
2558 	      write_bits(length,count->length_bits);
2559 	    }
2560 	    else
2561 	    {
2562               DBUG_PRINT("fields",
2563                          ("FIELD_SKIP_ENDSPACE not more than min_space, "
2564                           "bits:  1"));
2565 	      write_bits(0,1);
2566 	      pos=end_pos;
2567 	    }
2568 	  }
2569 	  else
2570           {
2571             DBUG_PRINT("fields",
2572                        ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2573                         length, field_length, count->length_bits));
2574 	    write_bits(length,count->length_bits);
2575           }
2576           /* Encode all significant bytes. */
2577           DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2578                                 (ulong) (pos - start_pos)));
2579 	  for ( ; start_pos < pos ; start_pos++)
2580           {
2581             DBUG_PRINT("fields",
2582                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2583                         (uchar) *start_pos,
2584                         hexdigits(tree->code[(uchar) *start_pos]),
2585                         (uint) tree->code_len[(uchar) *start_pos],
2586                         bindigits(tree->code[(uchar) *start_pos],
2587                                   (uint) tree->code_len[(uchar) *start_pos])));
2588 	    write_bits(tree->code[(uchar) *start_pos],
2589 		       (uint) tree->code_len[(uchar) *start_pos]);
2590           }
2591 	  start_pos=end_pos;
2592 	  break;
2593 	case FIELD_SKIP_PRESPACE:
2594 	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2595           length= (ulong) (pos - start_pos);
2596 	  if (count->pack_type & PACK_TYPE_SELECTED)
2597 	  {
2598 	    if (length > count->min_space)
2599 	    {
2600               DBUG_PRINT("fields",
2601                          ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2602               DBUG_PRINT("fields",
2603                          ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2604                           length, field_length, count->length_bits));
2605 	      write_bits(1,1);
2606 	      write_bits(length,count->length_bits);
2607 	    }
2608 	    else
2609 	    {
2610               DBUG_PRINT("fields",
2611                          ("FIELD_SKIP_PRESPACE not more than min_space, "
2612                           "bits:  1"));
2613 	      pos=start_pos;
2614 	      write_bits(0,1);
2615 	    }
2616 	  }
2617 	  else
2618           {
2619             DBUG_PRINT("fields",
2620                        ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2621                         length, field_length, count->length_bits));
2622 	    write_bits(length,count->length_bits);
2623           }
2624           /* Encode all significant bytes. */
2625           DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2626                                 (ulong) (end_pos - start_pos)));
2627 	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2628           {
2629             DBUG_PRINT("fields",
2630                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2631                         (uchar) *start_pos,
2632                         hexdigits(tree->code[(uchar) *start_pos]),
2633                         (uint) tree->code_len[(uchar) *start_pos],
2634                         bindigits(tree->code[(uchar) *start_pos],
2635                                   (uint) tree->code_len[(uchar) *start_pos])));
2636 	    write_bits(tree->code[(uchar) *start_pos],
2637 		       (uint) tree->code_len[(uchar) *start_pos]);
2638           }
2639 	  break;
2640 	case FIELD_CONSTANT:
2641 	case FIELD_ZERO:
2642 	case FIELD_CHECK:
2643           DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2644 	  start_pos=end_pos;
2645 	  break;
2646 	case FIELD_INTERVALL:
2647 	  global_count=count;
2648 	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2649 				  count->int_tree.custom_arg);
2650 	  intervall=(uint) (pos - count->tree_buff)/field_length;
2651           DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2652           DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2653                                 intervall, hexdigits(tree->code[intervall]),
2654                                 (uint) tree->code_len[intervall]));
2655 	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2656 	  start_pos=end_pos;
2657 	  break;
2658 	case FIELD_BLOB:
2659 	{
2660 	  ulong blob_length= _ma_calc_blob_length(field_length-
2661 						 portable_sizeof_char_ptr,
2662 						 start_pos);
2663           /* Empty blobs are encoded with a single 1 bit. */
2664 	  if (!blob_length)
2665 	  {
2666             DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2667             write_bits(1,1);
2668 	  }
2669 	  else
2670 	  {
2671 	    uchar *blob,*blob_end;
2672             DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2673 	    write_bits(0,1);
2674             /* Write the blob length. */
2675             DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2676                                   blob_length, count->length_bits));
2677 	    write_bits(blob_length,count->length_bits);
2678 	    memcpy(&blob,end_pos-portable_sizeof_char_ptr, sizeof(char*));
2679 	    blob_end=blob+blob_length;
2680             /* Encode the blob bytes. */
2681 	    for ( ; blob < blob_end ; blob++)
2682             {
2683               DBUG_PRINT("fields",
2684                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2685                           (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2686                           (uint) tree->code_len[(uchar) *blob],
2687                           bindigits(tree->code[(uchar) *start_pos],
2688                                     (uint)tree->code_len[(uchar) *start_pos])));
2689 	      write_bits(tree->code[(uchar) *blob],
2690 			 (uint) tree->code_len[(uchar) *blob]);
2691             }
2692 	    tot_blob_length+=blob_length;
2693 	  }
2694 	  start_pos= end_pos;
2695 	  break;
2696 	}
2697 	case FIELD_VARCHAR:
2698 	{
2699           uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2700 	  ulong col_length= (var_pack_length == 1 ?
2701                              (uint) *(uchar*) start_pos :
2702                              uint2korr(start_pos));
2703           /* Empty varchar are encoded with a single 1 bit. */
2704 	  if (!col_length)
2705 	  {
2706             DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2707 	    write_bits(1,1);			/* Empty varchar */
2708 	  }
2709 	  else
2710 	  {
2711 	    uchar *end= start_pos + var_pack_length + col_length;
2712             DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2713 	    write_bits(0,1);
2714             /* Write the varchar length. */
2715             DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2716                                   col_length, count->length_bits));
2717 	    write_bits(col_length,count->length_bits);
2718             /* Encode the varchar bytes. */
2719 	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2720             {
2721               DBUG_PRINT("fields",
2722                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2723                           (uchar) *start_pos,
2724                           hexdigits(tree->code[(uchar) *start_pos]),
2725                           (uint) tree->code_len[(uchar) *start_pos],
2726                           bindigits(tree->code[(uchar) *start_pos],
2727                                     (uint)tree->code_len[(uchar) *start_pos])));
2728 	      write_bits(tree->code[(uchar) *start_pos],
2729 			 (uint) tree->code_len[(uchar) *start_pos]);
2730             }
2731 	  }
2732 	  start_pos= end_pos;
2733 	  break;
2734 	}
2735 	case FIELD_LAST:
2736         case FIELD_enum_val_count:
2737 	  abort();				/* Impossible */
2738 	}
2739 	start_pos+=count->max_zero_fill;
2740         DBUG_PRINT("fields", ("---"));
2741       }
2742       flush_bits();
2743       length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
2744       pack_length= _ma_save_pack_length(pack_version, record_pos, length);
2745       if (pack_blob_length)
2746 	pack_length+= _ma_save_pack_length(pack_version,
2747                                            record_pos + pack_length,
2748                                            tot_blob_length);
2749       DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2750                             "length-bytes: %lu", (ulong) record_count, length,
2751                             tot_blob_length, pack_length));
2752       DBUG_PRINT("fields", ("==="));
2753 
2754       /* Correct file buffer if the header was smaller */
2755       if (pack_length != max_pack_length)
2756       {
2757 	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2758 	file_buffer.pos-= (max_pack_length-pack_length);
2759       }
2760       if (length < (ulong) min_record_length)
2761 	min_record_length=(uint) length;
2762       if (length > (ulong) max_record_length)
2763 	max_record_length=(uint) length;
2764       record_count++;
2765       if (write_loop && record_count % WRITE_COUNT == 0)
2766       {
2767 	printf("%lu\r", (ulong) record_count);
2768         fflush(stdout);
2769       }
2770     }
2771     else if (error != HA_ERR_RECORD_DELETED)
2772       break;
2773   }
2774   if (error == HA_ERR_END_OF_FILE)
2775     error=0;
2776   else
2777   {
2778     fprintf(stderr, "%s: Got error %d reading records\n", my_progname, error);
2779   }
2780   if (verbose >= 2)
2781     printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2782 
2783   my_safe_afree(record, isam_file->s->base.reclength);
2784   mrg->ref_length=max_pack_length;
2785   mrg->min_pack_length=max_record_length ? min_record_length : 0;
2786   mrg->max_pack_length=max_record_length;
2787   DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2788 }
2789 
2790 
make_new_name(char * new_name,char * old_name)2791 static char *make_new_name(char *new_name, char *old_name)
2792 {
2793   return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2794 }
2795 
make_old_name(char * new_name,char * old_name)2796 static char *make_old_name(char *new_name, char *old_name)
2797 {
2798   return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2799 }
2800 
2801 	/* rutines for bit writing buffer */
2802 
init_file_buffer(File file,pbool read_buffer)2803 static void init_file_buffer(File file, pbool read_buffer)
2804 {
2805   file_buffer.file=file;
2806   file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2807 					 MYF(MY_WME));
2808   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2809   file_buffer.pos_in_file=0;
2810   error_on_write=0;
2811   if (read_buffer)
2812   {
2813 
2814     file_buffer.pos=file_buffer.end;
2815     file_buffer.bits=0;
2816   }
2817   else
2818   {
2819     file_buffer.pos=file_buffer.buffer;
2820     file_buffer.bits=BITS_SAVED;
2821   }
2822   file_buffer.bitbucket= 0;
2823 }
2824 
2825 
flush_buffer(ulong neaded_length)2826 static int flush_buffer(ulong neaded_length)
2827 {
2828   ulong length;
2829 
2830   /*
2831     file_buffer.end is 8 bytes lower than the real end of the buffer.
2832     This is done so that the end-of-buffer condition does not need to be
2833     checked for every uchar (see write_bits()). Consequently,
2834     file_buffer.pos can become greater than file_buffer.end. The
2835     algorithms in the other functions ensure that there will never be
2836     more than 8 bytes written to the buffer without an end-of-buffer
2837     check. So the buffer cannot be overrun. But we need to check for the
2838     near-to-buffer-end condition to avoid a negative result, which is
2839     casted to unsigned and thus becomes giant.
2840   */
2841   if ((file_buffer.pos < file_buffer.end) &&
2842       ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2843     return 0;
2844   length=(ulong) (file_buffer.pos-file_buffer.buffer);
2845   file_buffer.pos=file_buffer.buffer;
2846   file_buffer.pos_in_file+=length;
2847   if (test_only)
2848     return 0;
2849   if (error_on_write|| my_write(file_buffer.file,
2850 				(const uchar*) file_buffer.buffer,
2851 				length,
2852 				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2853   {
2854     error_on_write=1;
2855     return 1;
2856   }
2857 
2858   if (neaded_length != ~(ulong) 0 &&
2859       (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2860   {
2861     uchar *tmp;
2862     neaded_length+=256;				/* some margin */
2863     tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
2864     if (!tmp)
2865       return 1;
2866     file_buffer.pos=    (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
2867     file_buffer.buffer= tmp;
2868     file_buffer.end=    (tmp+neaded_length-8);
2869   }
2870   return 0;
2871 }
2872 
2873 
end_file_buffer(void)2874 static void end_file_buffer(void)
2875 {
2876   my_free(file_buffer.buffer);
2877 }
2878 
2879 	/* output `bits` low bits of `value' */
2880 
write_bits(register ulonglong value,register uint bits)2881 static void write_bits(register ulonglong value, register uint bits)
2882 {
2883   DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2884               (bits == 8 * sizeof(value)));
2885 
2886   if ((file_buffer.bits-= (int) bits) >= 0)
2887   {
2888     file_buffer.bitbucket|= value << file_buffer.bits;
2889   }
2890   else
2891   {
2892     reg3 ulonglong bit_buffer;
2893     bits= (uint) -file_buffer.bits;
2894     bit_buffer= (file_buffer.bitbucket |
2895                  ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2896 #if BITS_SAVED == 64
2897     *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2898     *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2899     *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2900     *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2901 #endif
2902     *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2903     *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2904     *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2905     *file_buffer.pos++= (uchar) (bit_buffer);
2906 
2907     if (bits != 8 * sizeof(value))
2908       value&= (((ulonglong) 1) << bits) - 1;
2909     if (file_buffer.pos >= file_buffer.end)
2910       flush_buffer(~ (ulong) 0);
2911     file_buffer.bits=(int) (BITS_SAVED - bits);
2912     file_buffer.bitbucket= value << (BITS_SAVED - bits);
2913   }
2914   return;
2915 }
2916 
2917 	/* Flush bits in bit_buffer to buffer */
2918 
flush_bits(void)2919 static void flush_bits(void)
2920 {
2921   int bits;
2922   ulonglong bit_buffer;
2923 
2924   bits= file_buffer.bits & ~7;
2925   bit_buffer= file_buffer.bitbucket >> bits;
2926   bits= BITS_SAVED - bits;
2927   while (bits > 0)
2928   {
2929     bits-= 8;
2930     *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2931   }
2932   if (file_buffer.pos >= file_buffer.end)
2933     flush_buffer(~ (ulong) 0);
2934   file_buffer.bits= BITS_SAVED;
2935   file_buffer.bitbucket= 0;
2936 }
2937 
2938 
2939 /****************************************************************************
2940 ** functions to handle the joined files
2941 ****************************************************************************/
2942 
save_state(MARIA_HA * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)2943 static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
2944                       my_off_t new_length,
2945 		      ha_checksum crc)
2946 {
2947   MARIA_SHARE *share=isam_file->s;
2948   uint options=mi_uint2korr(share->state.header.options);
2949   uint key;
2950   DBUG_ENTER("save_state");
2951 
2952   options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2953   mi_int2store(share->state.header.options,options);
2954   /* Save the original file type of we have to undo the packing later */
2955   share->state.header.org_data_file_type= share->state.header.data_file_type;
2956   share->state.header.data_file_type= COMPRESSED_RECORD;
2957 
2958   share->state.state.data_file_length=new_length;
2959   share->state.state.del=0;
2960   share->state.state.empty=0;
2961   share->state.dellink= HA_OFFSET_ERROR;
2962   share->state.split=(ha_rows) mrg->records;
2963   share->state.version=(ulong) time((time_t*) 0);
2964   if (share->base.born_transactional)
2965     share->state.create_rename_lsn= share->state.is_of_horizon=
2966       share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
2967   if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
2968   {
2969     /*
2970       Some indexes are disabled, cannot use current key_file_length value
2971       as an estimate of upper bound of index file size. Use packed data file
2972       size instead.
2973     */
2974     share->state.state.key_file_length= new_length;
2975   }
2976   /*
2977     If there are no disabled indexes, keep key_file_length value from
2978     original file so "aria_chk -rq" can use this value (this is necessary
2979     because index size cannot be easily calculated for fulltext keys)
2980   */
2981   maria_clear_all_keys_active(share->state.key_map);
2982   for (key=0 ; key < share->base.keys ; key++)
2983     share->state.key_root[key]= HA_OFFSET_ERROR;
2984   share->state.key_del= HA_OFFSET_ERROR;
2985   share->state.state.checksum= crc;     /* Save crc in file */
2986   share->changed=1;			/* Force write of header */
2987   share->state.open_count=0;
2988   share->global_changed=0;
2989   my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0));
2990   if (share->base.keys)
2991     isamchk_neaded=1;
2992   DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
2993                                        &share->state,
2994                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
2995                                        MA_STATE_INFO_WRITE_FULL_INFO));
2996 }
2997 
2998 
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)2999 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3000 			  ha_checksum crc)
3001 {
3002   MARIA_STATE_INFO state;
3003   MARIA_HA *isam_file=mrg->file[0];
3004   uint options;
3005   DBUG_ENTER("save_state_mrg");
3006 
3007   state= isam_file->s->state;
3008   options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3009 	    HA_OPTION_READ_ONLY_DATA);
3010   mi_int2store(state.header.options,options);
3011   /* Save the original file type of we have to undo the packing later */
3012   state.header.org_data_file_type= state.header.data_file_type;
3013   state.header.data_file_type= COMPRESSED_RECORD;
3014 
3015   state.state.data_file_length=new_length;
3016   state.state.del=0;
3017   state.state.empty=0;
3018   state.state.records=state.split=(ha_rows) mrg->records;
3019   state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
3020     LSN_NEEDS_NEW_STATE_LSNS;
3021 
3022   /* See comment above in save_state about key_file_length handling. */
3023   if (mrg->src_file_has_indexes_disabled)
3024   {
3025     isam_file->s->state.state.key_file_length=
3026       MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3027   }
3028   state.dellink= HA_OFFSET_ERROR;
3029   state.version=(ulong) time((time_t*) 0);
3030   maria_clear_all_keys_active(state.key_map);
3031   state.state.checksum=crc;
3032   if (isam_file->s->base.keys)
3033     isamchk_neaded=1;
3034   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3035   DBUG_RETURN (_ma_state_info_write_sub(file, &state,
3036                                         MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
3037                                         MA_STATE_INFO_WRITE_FULL_INFO));
3038 }
3039 
3040 
3041 /* reset for mrg_rrnd */
3042 
mrg_reset(PACK_MRG_INFO * mrg)3043 static void mrg_reset(PACK_MRG_INFO *mrg)
3044 {
3045   if (mrg->current)
3046   {
3047     maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3048     mrg->current=0;
3049   }
3050 }
3051 
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3052 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3053 {
3054   int error;
3055   MARIA_HA *isam_info;
3056 
3057   if (!info->current)
3058   {
3059     isam_info= *(info->current=info->file);
3060     info->end=info->current+info->count;
3061     maria_reset(isam_info);
3062     maria_extra(isam_info, HA_EXTRA_CACHE, 0);
3063     if ((error= maria_scan_init(isam_info)))
3064       return(error);
3065   }
3066   else
3067     isam_info= *info->current;
3068 
3069   for (;;)
3070   {
3071     if (!(error= maria_scan(isam_info, buf)) ||
3072 	error != HA_ERR_END_OF_FILE)
3073       return (error);
3074     maria_scan_end(isam_info);
3075     maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3076     if (info->current+1 == info->end)
3077       return(HA_ERR_END_OF_FILE);
3078     info->current++;
3079     isam_info= *info->current;
3080     maria_reset(isam_info);
3081     maria_extra(isam_info,HA_EXTRA_CACHE, 0);
3082     if ((error= maria_scan_init(isam_info)))
3083       return(error);
3084   }
3085 }
3086 
3087 
mrg_close(PACK_MRG_INFO * mrg)3088 static int mrg_close(PACK_MRG_INFO *mrg)
3089 {
3090   uint i;
3091   int error=0;
3092   DBUG_ENTER("mrg_close");
3093 
3094   for (i=0 ; i < mrg->count ; i++)
3095     error|=maria_close(mrg->file[i]);
3096   if (mrg->free_file)
3097     my_free(mrg->file);
3098   DBUG_RETURN(error);
3099 }
3100 
3101 
3102 #if !defined(DBUG_OFF)
3103 /*
3104   Fake the counts to get big Huffman codes.
3105 
3106   SYNOPSIS
3107     fakebigcodes()
3108     huff_counts                 A pointer to the counts array.
3109     end_count                   A pointer past the counts array.
3110 
3111   DESCRIPTION
3112 
3113     Huffman coding works by removing the two least frequent values from
3114     the list of values and add a new value with the sum of their
3115     incidences in a loop until only one value is left. Every time a
3116     value is reused for a new value, it gets one more bit for its
3117     encoding. Hence, the least frequent values get the longest codes.
3118 
3119     To get a maximum code length for a value, two of the values must
3120     have an incidence of 1. As their sum is 2, the next infrequent value
3121     must have at least an incidence of 2, then 4, 8, 16 and so on. This
3122     means that one needs 2**n bytes (values) for a code length of n
3123     bits. However, using more distinct values forces the use of longer
3124     codes, or reaching the code length with less total bytes (values).
3125 
3126     To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3127     I assign counts of 1 to the two most frequent values, a count of 2
3128     for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3129     the remaining values get 1. That way every possible uchar has an
3130     assigned code, though not all codes are used if not all uchar values
3131     are present in the column.
3132 
3133     This strategy would work with distinct column values too, but
3134     requires that at least 64(32) values are present. To make things
3135     easier here, I cancel all distinct column values and force byte
3136     compression for all columns.
3137 
3138   RETURN
3139     void
3140 */
3141 
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3142 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3143 {
3144   HUFF_COUNTS   *count;
3145   my_off_t      *cur_count_p;
3146   my_off_t      *end_count_p;
3147   my_off_t      **cur_sort_p;
3148   my_off_t      **end_sort_p;
3149   my_off_t      *sort_counts[256];
3150   my_off_t      total;
3151   DBUG_ENTER("fakebigcodes");
3152 
3153   for (count= huff_counts; count < end_count; count++)
3154   {
3155     /*
3156       Remove distinct column values.
3157     */
3158     if (huff_counts->tree_buff)
3159     {
3160       my_free(huff_counts->tree_buff);
3161       delete_tree(&huff_counts->int_tree, 0);
3162       huff_counts->tree_buff= NULL;
3163       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3164     }
3165 
3166     /*
3167       Sort counts by decreasing incidence.
3168     */
3169     cur_count_p= count->counts;
3170     end_count_p= cur_count_p + 256;
3171     cur_sort_p= sort_counts;
3172     while (cur_count_p < end_count_p)
3173       *(cur_sort_p++)= cur_count_p++;
3174     (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3175 
3176     /*
3177       Assign faked counts.
3178     */
3179     cur_sort_p= sort_counts;
3180 #if SIZEOF_LONG_LONG > 4
3181     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3182 #else
3183     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3184 #endif
3185     /* Most frequent value gets a faked count of 1. */
3186     **(cur_sort_p++)= 1;
3187     total= 1;
3188     while (cur_sort_p < end_sort_p)
3189     {
3190       **(cur_sort_p++)= total;
3191       total<<= 1;
3192     }
3193     /* Set the last value. */
3194     **(cur_sort_p++)= --total;
3195     /*
3196       Set the remaining counts.
3197     */
3198     end_sort_p= sort_counts + 256;
3199     while (cur_sort_p < end_sort_p)
3200       **(cur_sort_p++)= 1;
3201   }
3202   DBUG_VOID_RETURN;
3203 }
3204 
3205 
3206 /*
3207   Compare two counts for reverse sorting.
3208 
3209   SYNOPSIS
3210     fakecmp()
3211     count1              One count.
3212     count2              Another count.
3213 
3214   RETURN
3215     1                   count1  < count2
3216     0                   count1 == count2
3217     -1                  count1 >  count2
3218 */
3219 
fakecmp(my_off_t ** count1,my_off_t ** count2)3220 static int fakecmp(my_off_t **count1, my_off_t **count2)
3221 {
3222   return ((**count1 < **count2) ? 1 :
3223           (**count1 > **count2) ? -1 : 0);
3224 }
3225 #endif
3226 
3227 #include "ma_check_standalone.h"
3228 
3229