1 /* Copyright (c) 2000, 2013, Oracle and/or its affiliates
2    Copyright (c) 2009, 2013, Monty Program Ab.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; version 2 of the License.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335  USA */
16 
17 /* Pack MyISAM file */
18 
19 #ifndef USE_MY_FUNC
20 #define USE_MY_FUNC			/* We need at least my_malloc */
21 #endif
22 
23 #include "myisamdef.h"
24 #include "my_default.h"
25 #include <queues.h>
26 #include <my_tree.h>
27 #include "mysys_err.h"
28 #ifndef __GNU_LIBRARY__
29 #define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
30 #endif
31 #include <my_getopt.h>
32 #include <assert.h>
33 
34 #if SIZEOF_LONG_LONG > 4
35 #define BITS_SAVED 64
36 #else
37 #define BITS_SAVED 32
38 #endif
39 
40 #define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
41 #define HEAD_LENGTH	32
42 #define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
43 
44 #define DATA_TMP_EXT		".TMD"
45 #define OLD_EXT			".OLD"
46 #define FRM_EXT                 ".frm"
47 #define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
48 
49 struct st_file_buffer {
50   File file;
51   uchar *buffer,*pos,*end;
52   my_off_t pos_in_file;
53   int bits;
54   ulonglong bitbucket;
55 };
56 
57 struct st_huff_tree;
58 struct st_huff_element;
59 
60 typedef struct st_huff_counts {
61   uint	field_length,max_zero_fill;
62   uint	pack_type;
63   uint	max_end_space,max_pre_space,length_bits,min_space;
64   ulong max_length;
65   enum en_fieldtype field_type;
66   struct st_huff_tree *tree;		/* Tree for field */
67   my_off_t counts[256];
68   my_off_t end_space[8];
69   my_off_t pre_space[8];
70   my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
71   TREE int_tree;        /* Tree for detecting distinct column values. */
72   uchar *tree_buff;      /* Column values, 'field_length' each. */
73   uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
74 } HUFF_COUNTS;
75 
76 typedef struct st_huff_element HUFF_ELEMENT;
77 
78 /*
79   WARNING: It is crucial for the optimizations in calc_packed_length()
80   that 'count' is the first element of 'HUFF_ELEMENT'.
81 */
82 struct st_huff_element {
83   my_off_t count;
84   union un_element {
85     struct st_nod {
86       HUFF_ELEMENT *left,*right;
87     } nod;
88     struct st_leaf {
89       HUFF_ELEMENT *null;
90       uint	element_nr;		/* Number of element */
91     } leaf;
92   } a;
93 };
94 
95 
96 typedef struct st_huff_tree {
97   HUFF_ELEMENT *root,*element_buffer;
98   HUFF_COUNTS *counts;
99   uint tree_number;
100   uint elements;
101   my_off_t bytes_packed;
102   uint tree_pack_length;
103   uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
104   ulonglong *code;
105   uchar *code_len;
106 } HUFF_TREE;
107 
108 
109 typedef struct st_isam_mrg {
110   MI_INFO **file,**current,**end;
111   uint free_file;
112   uint count;
113   uint	min_pack_length;		/* These are used by packed data */
114   uint	max_pack_length;
115   uint	ref_length;
116   uint	max_blob_length;
117   my_off_t records;
118   /* true if at least one source file has at least one disabled index */
119   my_bool src_file_has_indexes_disabled;
120 } PACK_MRG_INFO;
121 
122 
123 extern int main(int argc,char * *argv);
124 static void get_options(int *argc,char ***argv);
125 static MI_INFO *open_isam_file(char *name,int mode);
126 static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
127 static int compress(PACK_MRG_INFO *file,char *join_name);
128 static int create_dest_frm(char *source_table, char *dest_table);
129 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
130 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
131 					   uint trees,
132 					   HUFF_COUNTS *huff_counts,
133 					   uint fields);
134 static int compare_tree(void* cmp_arg __attribute__((unused)),
135 			const uchar *s,const uchar *t);
136 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
137 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
138 			 my_off_t records);
139 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
140 			       uint max_space_length,my_off_t *space_counts,
141 			       my_off_t tot_space_count,
142 			       enum en_fieldtype field_type);
143 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
144 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
145 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
146 static int save_counts_in_queue(uchar *key,element_count count,
147 				    HUFF_TREE *tree);
148 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
149 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
150 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
151 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
152 				    HUFF_ELEMENT *element,uint size,
153 				    ulonglong code);
154 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
155 			my_off_t tot_elements,my_off_t filelength);
156 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
157 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
158 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
159 				       HUFF_ELEMENT *element,
160 				       uint *offset);
161 static uint max_bit(uint value);
162 static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
163 static char *make_new_name(char *new_name,char *old_name);
164 static char *make_old_name(char *new_name,char *old_name);
165 static void init_file_buffer(File file,pbool read_buffer);
166 static int flush_buffer(ulong neaded_length);
167 static void end_file_buffer(void);
168 static void write_bits(ulonglong value, uint bits);
169 static void flush_bits(void);
170 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
171 		      ha_checksum crc);
172 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
173 			  ha_checksum crc);
174 static int mrg_close(PACK_MRG_INFO *mrg);
175 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
176 static void mrg_reset(PACK_MRG_INFO *mrg);
177 #if !defined(DBUG_OFF)
178 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
179 static int fakecmp(my_off_t **count1, my_off_t **count2);
180 #endif
181 
182 
183 static int error_on_write=0,test_only=0,verbose=0,silent=0,
184 	   write_loop=0,force_pack=0, isamchk_neaded=0;
185 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
186 static my_bool backup, opt_wait;
187 /*
188   tree_buff_length is somewhat arbitrary. The bigger it is the better
189   the chance to win in terms of compression factor. On the other hand,
190   this table becomes part of the compressed file header. And its length
191   is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
192 */
193 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
194 static char tmp_dir[FN_REFLEN]={0},*join_table;
195 static my_off_t intervall_length;
196 static ha_checksum glob_crc;
197 static struct st_file_buffer file_buffer;
198 static QUEUE queue;
199 static HUFF_COUNTS *global_count;
200 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
201 static const char *load_default_groups[]= { "myisampack",0 };
202 
203 	/* The main program */
204 
main(int argc,char ** argv)205 int main(int argc, char **argv)
206 {
207   int error,ok;
208   PACK_MRG_INFO merge;
209   char **default_argv;
210   MY_INIT(argv[0]);
211 
212   load_defaults_or_exit("my", load_default_groups, &argc, &argv);
213   default_argv= argv;
214   get_options(&argc,&argv);
215 
216   error=ok=isamchk_neaded=0;
217   if (join_table)
218   {
219     /*
220       Join files into one and create FRM file for the compressed table only if
221       the compression succeeds
222     */
223     if (open_isam_files(&merge,argv,(uint) argc) ||
224 	compress(&merge, join_table) || create_dest_frm(argv[0], join_table))
225       error=1;
226   }
227   else while (argc--)
228   {
229     MI_INFO *isam_file;
230     if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
231       error=1;
232     else
233     {
234       merge.file= &isam_file;
235       merge.current=0;
236       merge.free_file=0;
237       merge.count=1;
238       if (compress(&merge,0))
239 	error=1;
240       else
241 	ok=1;
242     }
243   }
244   if (ok && isamchk_neaded && !silent)
245     puts("Remember to run myisamchk -rq on compressed tables");
246   (void) fflush(stdout);
247   (void) fflush(stderr);
248   free_defaults(default_argv);
249   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
250   exit(error ? 2 : 0);
251 #ifndef _lint
252   return 0;					/* No compiler warning */
253 #endif
254 }
255 
256 enum options_mp {OPT_CHARSETS_DIR_MP=256};
257 
258 static struct my_option my_long_options[] =
259 {
260   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
261    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
262   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
263    "Directory where character sets are.", (char**) &charsets_dir,
264    (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
265   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
266    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
267   {"force", 'f',
268    "Force packing of table even if it gets bigger or if tempfile exists.",
269    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
270   {"join", 'j',
271    "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
272    &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
273    0, 0, 0},
274   {"help", '?', "Display this help and exit.",
275    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
276   {"silent", 's', "Be more silent.",
277    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
278   {"tmpdir", 'T', "Use temporary directory to store temporary table.",
279    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
280   {"test", 't', "Don't pack table, only test packing it.",
281    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
282   {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
283    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
284   {"version", 'V', "Output version information and exit.",
285    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
286   {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
287    &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
288   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
289 };
290 
291 
print_version(void)292 static void print_version(void)
293 {
294   printf("%s Ver 1.23 for %s on %s\n",
295               my_progname, SYSTEM_TYPE, MACHINE_TYPE);
296 }
297 
298 
usage(void)299 static void usage(void)
300 {
301   print_version();
302   puts("Copyright 2002-2008 MySQL AB, 2008 Sun Microsystems, Inc.");
303   puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
304   puts("and you are welcome to modify and redistribute it under the GPL license\n");
305 
306   puts("Pack a MyISAM-table to take much less space.");
307   puts("Keys are not updated, you must run myisamchk -rq on the index (.MYI) file");
308   puts("afterwards to update the keys.");
309   puts("You should give the .MYI file as the filename argument.");
310 
311   printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
312   my_print_help(my_long_options);
313   print_defaults("my", load_default_groups);
314   my_print_variables(my_long_options);
315 }
316 
317 
318 static my_bool
get_one_option(int optid,const struct my_option * opt,char * argument)319 get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
320 	       char *argument)
321 {
322   uint length;
323 
324   switch(optid) {
325   case 'f':
326     force_pack= 1;
327     tmpfile_createflag= O_RDWR | O_TRUNC;
328     break;
329   case 's':
330     write_loop= verbose= 0;
331     silent= 1;
332     break;
333   case 't':
334     test_only= 1;
335     /* Avoid to reset 'verbose' if it was already set > 1. */
336     if (! verbose)
337       verbose= 1;
338     break;
339   case 'T':
340     length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
341     if (length != dirname_length(tmp_dir))
342     {
343       tmp_dir[length]=FN_LIBCHAR;
344       tmp_dir[length+1]=0;
345     }
346     break;
347   case 'v':
348     verbose++; /* Allow for selecting the level of verbosity. */
349     silent= 0;
350     break;
351   case '#':
352     DBUG_PUSH(argument ? argument : "d:t:o");
353     break;
354   case 'V':
355     print_version();
356     exit(0);
357   case 'I':
358   case '?':
359     usage();
360     exit(0);
361   }
362   return 0;
363 }
364 
365 	/* reads options */
366 	/* Initiates DEBUG - but no debugging here ! */
367 
get_options(int * argc,char *** argv)368 static void get_options(int *argc,char ***argv)
369 {
370   int ho_error;
371 
372   my_progname= argv[0][0];
373   if (isatty(fileno(stdout)))
374     write_loop=1;
375 
376   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
377     exit(ho_error);
378 
379   if (!*argc)
380   {
381     usage();
382     exit(1);
383   }
384   if (join_table)
385   {
386     backup=0;					/* Not needed */
387     tmp_dir[0]=0;
388   }
389   return;
390 }
391 
392 
open_isam_file(char * name,int mode)393 static MI_INFO *open_isam_file(char *name,int mode)
394 {
395   MI_INFO *isam_file;
396   MYISAM_SHARE *share;
397   DBUG_ENTER("open_isam_file");
398 
399   if (!(isam_file=mi_open(name,mode,
400 			  (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
401 			   HA_OPEN_ABORT_IF_LOCKED))))
402   {
403     (void) fprintf(stderr, "%s gave error %d on open\n", name, my_errno);
404     DBUG_RETURN(0);
405   }
406   share=isam_file->s;
407   if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
408   {
409     if (!force_pack)
410     {
411       (void) fprintf(stderr, "%s is already compressed\n", name);
412       (void) mi_close(isam_file);
413       DBUG_RETURN(0);
414     }
415     if (verbose)
416       puts("Recompressing already compressed table");
417     share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
418 
419     /* We want to use the new checksums if we have null fields */
420     if (share->has_null_fields)
421       share->options|= HA_OPTION_NULL_FIELDS;
422 
423   }
424   if (! force_pack && share->state.state.records != 0 &&
425       (share->state.state.records <= 1 ||
426        share->state.state.data_file_length < 1024))
427   {
428     (void) fprintf(stderr, "%s is too small to compress\n", name);
429     (void) mi_close(isam_file);
430     DBUG_RETURN(0);
431   }
432   (void) mi_lock_database(isam_file,F_WRLCK);
433   DBUG_RETURN(isam_file);
434 }
435 
436 
open_isam_files(PACK_MRG_INFO * mrg,char ** names,uint count)437 static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
438 {
439   uint i,j;
440   mrg->count=0;
441   mrg->current=0;
442   mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
443   mrg->free_file=1;
444   mrg->src_file_has_indexes_disabled= 0;
445   for (i=0; i < count ; i++)
446   {
447     if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
448       goto error;
449 
450     mrg->src_file_has_indexes_disabled|=
451       ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
452                               mrg->file[i]->s->base.keys);
453   }
454   /* Check that files are identical */
455   for (j=0 ; j < count-1 ; j++)
456   {
457     MI_COLUMNDEF *m1,*m2,*end;
458     if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
459 	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
460       goto diff_file;
461     m1=mrg->file[j]->s->rec;
462     end=m1+mrg->file[j]->s->base.fields;
463     m2=mrg->file[j+1]->s->rec;
464     for ( ; m1 != end ; m1++,m2++)
465     {
466       if (m1->type != m2->type || m1->length != m2->length)
467 	goto diff_file;
468     }
469   }
470   mrg->count=count;
471   return 0;
472 
473  diff_file:
474   (void) fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
475                my_progname, names[j], names[j+1]);
476  error:
477   while (i--)
478     mi_close(mrg->file[i]);
479   my_free(mrg->file);
480   return 1;
481 }
482 
483 
compress(PACK_MRG_INFO * mrg,char * result_table)484 static int compress(PACK_MRG_INFO *mrg,char *result_table)
485 {
486   int error;
487   File new_file,join_isam_file;
488   MI_INFO *isam_file;
489   MYISAM_SHARE *share;
490   char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
491   uint i,header_length,fields,trees,used_trees;
492   my_off_t old_length,new_length,tot_elements;
493   HUFF_COUNTS *huff_counts;
494   HUFF_TREE *huff_trees;
495   DBUG_ENTER("compress");
496 
497   isam_file=mrg->file[0];			/* Take this as an example */
498   share=isam_file->s;
499   new_file=join_isam_file= -1;
500   trees=fields=0;
501   huff_trees=0;
502   huff_counts=0;
503 
504   /* Create temporary or join file */
505 
506   if (backup)
507     (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2);
508   else
509     (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16);
510   if (!test_only && result_table)
511   {
512     /* Make a new indexfile based on first file in list */
513     uint length;
514     uchar *buff;
515     strmov(org_name,result_table);		/* Fix error messages */
516     (void) fn_format(new_name,result_table,"",MI_NAME_IEXT,2);
517     if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
518 	< 0)
519       goto err;
520     length=(uint) share->base.keystart;
521     if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
522       goto err;
523     if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
524 	my_write(join_isam_file,buff,length,
525 		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
526     {
527       my_free(buff);
528       goto err;
529     }
530     my_free(buff);
531     (void) fn_format(new_name,result_table,"",MI_NAME_DEXT,2);
532   }
533   else if (!tmp_dir[0])
534     (void) make_new_name(new_name,org_name);
535   else
536     (void) fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
537   if (!test_only &&
538       (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
539     goto err;
540 
541   /* Start calculating statistics */
542 
543   mrg->records=0;
544   for (i=0 ; i < mrg->count ; i++)
545     mrg->records+=mrg->file[i]->s->state.state.records;
546 
547   DBUG_PRINT("info", ("Compressing %s: (%lu records)",
548                       result_table ? new_name : org_name,
549                       (ulong) mrg->records));
550   if (write_loop || verbose)
551   {
552     printf("Compressing %s: (%lu records)\n",
553                 result_table ? new_name : org_name, (ulong) mrg->records);
554   }
555   trees=fields=share->base.fields;
556   huff_counts=init_huff_count(isam_file,mrg->records);
557 
558   /*
559     Read the whole data file(s) for statistics.
560   */
561   DBUG_PRINT("info", ("- Calculating statistics"));
562   if (write_loop || verbose)
563     printf("- Calculating statistics\n");
564   if (get_statistic(mrg,huff_counts))
565     goto err;
566 
567   old_length=0;
568   for (i=0; i < mrg->count ; i++)
569     old_length+= (mrg->file[i]->s->state.state.data_file_length -
570 		  mrg->file[i]->s->state.state.empty);
571 
572   /*
573     Create a global priority queue in preparation for making
574     temporary Huffman trees.
575   */
576   if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
577     goto err;
578 
579   /*
580     Check each column if we should use pre-space-compress, end-space-
581     compress, empty-field-compress or zero-field-compress.
582   */
583   check_counts(huff_counts,fields,mrg->records);
584 
585   /*
586     Build a Huffman tree for each column.
587   */
588   huff_trees=make_huff_trees(huff_counts,trees);
589 
590   /*
591     If the packed lengths of combined columns is less then the sum of
592     the non-combined columns, then create common Huffman trees for them.
593     We do this only for byte compressed columns, not for distinct values
594     compressed columns.
595   */
596   if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
597     goto err;
598 
599   /*
600     Assign codes to all byte or column values.
601   */
602   if (make_huff_decode_table(huff_trees,fields))
603     goto err;
604 
605   /* Prepare a file buffer. */
606   init_file_buffer(new_file,0);
607 
608   /*
609     Reserve space in the target file for the fixed compressed file header.
610   */
611   file_buffer.pos_in_file=HEAD_LENGTH;
612   if (! test_only)
613     my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
614 
615   /*
616     Write field infos: field type, pack type, length bits, tree number.
617   */
618   write_field_info(huff_counts,fields,used_trees);
619 
620   /*
621     Write decode trees.
622   */
623   if (!(tot_elements=write_huff_tree(huff_trees,trees)))
624     goto err;
625 
626   /*
627     Calculate the total length of the compression info header.
628     This includes the fixed compressed file header, the column compression
629     type descriptions, and the decode trees.
630   */
631   header_length=(uint) file_buffer.pos_in_file+
632     (uint) (file_buffer.pos-file_buffer.buffer);
633 
634   /*
635     Compress the source file into the target file.
636   */
637   DBUG_PRINT("info", ("- Compressing file"));
638   if (write_loop || verbose)
639     printf("- Compressing file\n");
640   error=compress_isam_file(mrg,huff_counts);
641   new_length=file_buffer.pos_in_file;
642   if (!error && !test_only)
643   {
644     uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
645     bzero(buff,sizeof(buff));
646     error=my_write(file_buffer.file,buff,sizeof(buff),
647 		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
648   }
649 
650   /*
651     Write the fixed compressed file header.
652   */
653   if (!error)
654     error=write_header(mrg,header_length,used_trees,tot_elements,
655 		       new_length);
656 
657   /* Flush the file buffer. */
658   end_file_buffer();
659 
660   /* Display statistics. */
661   DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
662                       "Mean total length: %6ld\n",
663                       mrg->min_pack_length, mrg->max_pack_length,
664                       (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
665   if (verbose && mrg->records)
666     printf("Min record length: %6d   Max length: %6d   "
667                 "Mean total length: %6ld\n", mrg->min_pack_length,
668                 mrg->max_pack_length, (ulong) (new_length/mrg->records));
669 
670   /* Close source and target file. */
671   if (!test_only)
672   {
673     error|=my_close(new_file,MYF(MY_WME));
674     if (!result_table)
675     {
676       error|=my_close(isam_file->dfile,MYF(MY_WME));
677       isam_file->dfile= -1;		/* Tell mi_close file is closed */
678     }
679   }
680 
681   /* Cleanup. */
682   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
683   if (! test_only && ! error)
684   {
685     if (result_table)
686     {
687       error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
688     }
689     else
690     {
691       if (backup)
692       {
693 	if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
694 		      MYF(MY_WME)))
695 	  error=1;
696 	else
697 	{
698 	  if (tmp_dir[0])
699 	    error=my_copy(new_name,org_name,MYF(MY_WME));
700 	  else
701 	    error=my_rename(new_name,org_name,MYF(MY_WME));
702 	  if (!error)
703           {
704 	    (void) my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
705             if (tmp_dir[0])
706               (void) my_delete(new_name,MYF(MY_WME));
707           }
708 	}
709       }
710       else
711       {
712 	if (tmp_dir[0])
713         {
714 	  error=my_copy(new_name,org_name,
715 			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
716           if (!error)
717             (void) my_delete(new_name,MYF(MY_WME));
718         }
719 	else
720 	  error=my_redel(org_name, new_name, 0, MYF(MY_WME | MY_COPYTIME));
721       }
722       if (! error)
723 	error=save_state(isam_file,mrg,new_length,glob_crc);
724     }
725   }
726   error|=mrg_close(mrg);
727   if (join_isam_file >= 0)
728     error|=my_close(join_isam_file,MYF(MY_WME));
729   if (error)
730   {
731     (void) fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
732     (void) my_delete(new_name,MYF(MY_WME));
733     DBUG_RETURN(-1);
734   }
735   if (write_loop || verbose)
736   {
737     if (old_length)
738       printf("%.4g%%     \n",
739                   (((longlong) (old_length - new_length)) * 100.0 /
740                    (longlong) old_length));
741     else
742       puts("Empty file saved in compressed format");
743   }
744   DBUG_RETURN(0);
745 
746  err:
747   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
748   if (new_file >= 0)
749     (void) my_close(new_file,MYF(0));
750   if (join_isam_file >= 0)
751     (void) my_close(join_isam_file,MYF(0));
752   mrg_close(mrg);
753   (void) fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
754   DBUG_RETURN(-1);
755 }
756 
757 
758 /**
759   Create FRM for the destination table for --join operation
760   Copy the first table FRM as the destination table FRM file. Doing so
761   will help the mysql server to recognize the newly created table.
762   See Bug#36573.
763 
764   @param    source_table    Name of the source table
765   @param    dest_table      Name of the destination table
766   @retval   0               Successful copy operation
767 
768   @note  We always return 0 because we don't want myisampack to report error
769   even if the copy operation fails.
770 */
771 
create_dest_frm(char * source_table,char * dest_table)772 static int create_dest_frm(char *source_table, char *dest_table)
773 {
774   char source_name[FN_REFLEN], dest_name[FN_REFLEN];
775 
776   DBUG_ENTER("create_dest_frm");
777 
778   (void) fn_format(source_name, source_table,
779                    "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
780   (void) fn_format(dest_name, dest_table,
781                    "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
782   /*
783     Error messages produced by my_copy() are suppressed as this
784     is not vital for --join operation. User shouldn't see any error messages
785     like "source file frm not found" and "unable to create destination frm
786     file. So we don't pass the flag MY_WME -Write Message on Error to
787     my_copy()
788   */
789   (void) my_copy(source_name, dest_name, MYF(MY_DONT_OVERWRITE_FILE));
790 
791   DBUG_RETURN(0);
792 }
793 
794 
795 	/* Init a huff_count-struct for each field and init it */
796 
init_huff_count(MI_INFO * info,my_off_t records)797 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
798 {
799   reg2 uint i;
800   reg1 HUFF_COUNTS *count;
801   if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
802 					sizeof(HUFF_COUNTS),
803 					MYF(MY_ZEROFILL | MY_WME))))
804   {
805     for (i=0 ; i < info->s->base.fields ; i++)
806     {
807       enum en_fieldtype type;
808       count[i].field_length=info->s->rec[i].length;
809       type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
810       if (type == FIELD_INTERVALL ||
811 	  type == FIELD_CONSTANT ||
812 	  type == FIELD_ZERO)
813 	type = FIELD_NORMAL;
814       if (count[i].field_length <= 8 &&
815 	  (type == FIELD_NORMAL ||
816 	   type == FIELD_SKIP_ZERO))
817 	count[i].max_zero_fill= count[i].field_length;
818       /*
819         For every column initialize a tree, which is used to detect distinct
820         column values. 'int_tree' works together with 'tree_buff' and
821         'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
822         This is accomplished by '-1' as the element size.
823       */
824       init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree, NULL,
825 		NULL, MYF(0));
826       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
827 	count[i].tree_pos=count[i].tree_buff =
828 	  my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
829 		    MYF(MY_WME));
830     }
831   }
832   return count;
833 }
834 
835 
836 	/* Free memory used by counts and trees */
837 
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)838 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
839 					   HUFF_COUNTS *huff_counts,
840 					   uint fields)
841 {
842   register uint i;
843 
844   if (huff_trees)
845   {
846     for (i=0 ; i < trees ; i++)
847     {
848       if (huff_trees[i].element_buffer)
849 	my_free(huff_trees[i].element_buffer);
850       if (huff_trees[i].code)
851 	my_free(huff_trees[i].code);
852     }
853     my_free(huff_trees);
854   }
855   if (huff_counts)
856   {
857     for (i=0 ; i < fields ; i++)
858     {
859       if (huff_counts[i].tree_buff)
860       {
861 	my_free(huff_counts[i].tree_buff);
862 	delete_tree(&huff_counts[i].int_tree, 0);
863       }
864     }
865     my_free(huff_counts);
866   }
867   delete_queue(&queue);		/* This is safe to free */
868   return;
869 }
870 
871 	/* Read through old file and gather some statistics */
872 
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)873 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
874 {
875   int error;
876   uint length;
877   ulong reclength,max_blob_length;
878   uchar *record,*pos,*next_pos,*end_pos,*start_pos;
879   ha_rows record_count;
880   my_bool static_row_size;
881   HUFF_COUNTS *count,*end_count;
882   TREE_ELEMENT *element;
883   DBUG_ENTER("get_statistic");
884 
885   reclength=mrg->file[0]->s->base.reclength;
886   record=(uchar*) my_alloca(reclength);
887   end_count=huff_counts+mrg->file[0]->s->base.fields;
888   record_count=0; glob_crc=0;
889   max_blob_length=0;
890 
891   /* Check how to calculate checksum */
892   static_row_size=1;
893   for (count=huff_counts ; count < end_count ; count++)
894   {
895     if (count->field_type == FIELD_BLOB ||
896         count->field_type == FIELD_VARCHAR)
897     {
898       static_row_size=0;
899       break;
900     }
901   }
902 
903   mrg_reset(mrg);
904   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
905   {
906     ulong tot_blob_length=0;
907     if (! error)
908     {
909       /* glob_crc is a checksum over all bytes of all records. */
910       if (static_row_size)
911 	glob_crc+=mi_static_checksum(mrg->file[0],record);
912       else
913 	glob_crc+=mi_checksum(mrg->file[0],record);
914 
915       /* Count the incidence of values separately for every column. */
916       for (pos=record,count=huff_counts ;
917 	   count < end_count ;
918 	   count++,
919 	   pos=next_pos)
920       {
921 	next_pos=end_pos=(start_pos=pos)+count->field_length;
922 
923 	/*
924           Put the whole column value in a tree if there is room for it.
925           'int_tree' is used to quickly check for duplicate values.
926           'tree_buff' collects as many distinct column values as
927           possible. If the field length is > 1, it is tree_buff_length,
928           else 2 bytes. Each value is 'field_length' bytes big. If there
929           are more distinct column values than fit into the buffer, we
930           give up with this tree. BLOBs and VARCHARs do not have a
931           tree_buff as it can only be used with fixed length columns.
932           For the special case of field length == 1, we handle only the
933           case that there is only one distinct value in the table(s).
934           Otherwise, we can have a maximum of 256 distinct values. This
935           is then handled by the normal Huffman tree build.
936 
937           Another limit for collecting distinct column values is the
938           number of values itself. Since we would need to build a
939           Huffman tree for the values, we are limited by the 'IS_OFFSET'
940           constant. This constant expresses a bit which is used to
941           determine if a tree element holds a final value or an offset
942           to a child element. Hence, all values and offsets need to be
943           smaller than 'IS_OFFSET'. A tree element is implemented with
944           two integer values, one for the left branch and one for the
945           right branch. For the extreme case that the first element
946           points to the last element, the number of integers in the tree
947           must be less or equal to IS_OFFSET. So the number of elements
948           must be less or equal to IS_OFFSET / 2.
949 
950           WARNING: At first, we insert a pointer into the record buffer
951           as the key for the tree. If we got a new distinct value, which
952           is really inserted into the tree, instead of being counted
953           only, we will copy the column value from the record buffer to
954           'tree_buff' and adjust the key pointer of the tree accordingly.
955         */
956 	if (count->tree_buff)
957 	{
958 	  global_count=count;
959 	  if (!(element=tree_insert(&count->int_tree,pos, 0,
960 				    count->int_tree.custom_arg)) ||
961 	      (element->count == 1 &&
962 	       (count->tree_buff + tree_buff_length <
963                 count->tree_pos + count->field_length)) ||
964               (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
965 	      (count->field_length == 1 &&
966 	       count->int_tree.elements_in_tree > 1))
967 	  {
968 	    delete_tree(&count->int_tree, 0);
969 	    my_free(count->tree_buff);
970 	    count->tree_buff=0;
971 	  }
972 	  else
973 	  {
974             /*
975               If tree_insert() succeeds, it either creates a new element
976               or increments the counter of an existing element.
977             */
978 	    if (element->count == 1)
979 	    {
980               /* Copy the new column value into 'tree_buff'. */
981 	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
982               /* Adjust the key pointer in the tree. */
983 	      tree_set_pointer(element,count->tree_pos);
984               /* Point behind the last column value so far. */
985 	      count->tree_pos+=count->field_length;
986 	    }
987 	  }
988 	}
989 
990 	/* Save character counters and space-counts and zero-field-counts */
991 	if (count->field_type == FIELD_NORMAL ||
992 	    count->field_type == FIELD_SKIP_ENDSPACE)
993 	{
994           /* Ignore trailing space. */
995 	  for ( ; end_pos > pos ; end_pos--)
996 	    if (end_pos[-1] != ' ')
997 	      break;
998           /* Empty fields are just counted. Go to the next record. */
999 	  if (end_pos == pos)
1000 	  {
1001 	    count->empty_fields++;
1002 	    count->max_zero_fill=0;
1003 	    continue;
1004 	  }
1005           /*
1006             Count the total of all trailing spaces and the number of
1007             short trailing spaces. Remember the longest trailing space.
1008           */
1009 	  length= (uint) (next_pos-end_pos);
1010 	  count->tot_end_space+=length;
1011 	  if (length < 8)
1012 	    count->end_space[length]++;
1013 	  if (count->max_end_space < length)
1014 	    count->max_end_space = length;
1015 	}
1016 
1017 	if (count->field_type == FIELD_NORMAL ||
1018 	    count->field_type == FIELD_SKIP_PRESPACE)
1019 	{
1020           /* Ignore leading space. */
1021 	  for (pos=start_pos; pos < end_pos ; pos++)
1022 	    if (pos[0] != ' ')
1023 	      break;
1024           /* Empty fields are just counted. Go to the next record. */
1025 	  if (end_pos == pos)
1026 	  {
1027 	    count->empty_fields++;
1028 	    count->max_zero_fill=0;
1029 	    continue;
1030 	  }
1031           /*
1032             Count the total of all leading spaces and the number of
1033             short leading spaces. Remember the longest leading space.
1034           */
1035 	  length= (uint) (pos-start_pos);
1036 	  count->tot_pre_space+=length;
1037 	  if (length < 8)
1038 	    count->pre_space[length]++;
1039 	  if (count->max_pre_space < length)
1040 	    count->max_pre_space = length;
1041 	}
1042 
1043         /* Calculate pos, end_pos, and max_length for variable length fields. */
1044 	if (count->field_type == FIELD_BLOB)
1045 	{
1046 	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1047 	  ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1048 	  memcpy(&pos, start_pos+field_length, sizeof(char*));
1049 	  end_pos=pos+blob_length;
1050 	  tot_blob_length+=blob_length;
1051 	  set_if_bigger(count->max_length,blob_length);
1052 	}
1053 	else if (count->field_type == FIELD_VARCHAR)
1054 	{
1055           uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1056 	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1057                    uint2korr(start_pos));
1058 	  pos= start_pos+pack_length;
1059 	  end_pos= pos+length;
1060 	  set_if_bigger(count->max_length,length);
1061 	}
1062 
1063         /* Evaluate 'max_zero_fill' for short fields. */
1064 	if (count->field_length <= 8 &&
1065 	    (count->field_type == FIELD_NORMAL ||
1066 	     count->field_type == FIELD_SKIP_ZERO))
1067 	{
1068 	  uint i;
1069           /* Zero fields are just counted. Go to the next record. */
1070 	  if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1071 	  {
1072 	    count->zero_fields++;
1073 	    continue;
1074 	  }
1075           /*
1076             max_zero_fill starts with field_length. It is decreased every
1077             time a shorter "zero trailer" is found. It is set to zero when
1078             an empty field is found (see above). This suggests that the
1079             variable should be called 'min_zero_fill'.
1080           */
1081 	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1082 	       i++) ;
1083 	  if (i < count->max_zero_fill)
1084 	    count->max_zero_fill=i;
1085 	}
1086 
1087         /* Ignore zero fields and check fields. */
1088 	if (count->field_type == FIELD_ZERO ||
1089 	    count->field_type == FIELD_CHECK)
1090 	  continue;
1091 
1092         /*
1093           Count the incidence of every byte value in the
1094           significant field value.
1095         */
1096 	for ( ; pos < end_pos ; pos++)
1097 	  count->counts[(uchar) *pos]++;
1098 
1099         /* Step to next field. */
1100       }
1101 
1102       if (tot_blob_length > max_blob_length)
1103 	max_blob_length=tot_blob_length;
1104       record_count++;
1105       if (write_loop && record_count % WRITE_COUNT == 0)
1106       {
1107 	printf("%lu\r", (ulong) record_count);
1108         (void) fflush(stdout);
1109       }
1110     }
1111     else if (error != HA_ERR_RECORD_DELETED)
1112     {
1113       (void) fprintf(stderr, "Got error %d while reading rows", error);
1114       break;
1115     }
1116 
1117     /* Step to next record. */
1118   }
1119   if (write_loop)
1120   {
1121     printf("            \r");
1122     (void) fflush(stdout);
1123   }
1124 
1125   /*
1126     If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1127     codes.
1128   */
1129   DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1130 
1131   DBUG_PRINT("info", ("Found the following number of incidents "
1132                       "of the byte codes:"));
1133   if (verbose >= 2)
1134     printf("Found the following number of incidents "
1135                 "of the byte codes:\n");
1136   for (count= huff_counts ; count < end_count; count++)
1137   {
1138     uint      idx;
1139     my_off_t  total_count;
1140     char      llbuf[32];
1141 
1142     DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1143     if (verbose >= 2)
1144       printf("column: %3u\n", (uint) (count - huff_counts + 1));
1145     if (count->tree_buff)
1146     {
1147       DBUG_PRINT("info", ("number of distinct values: %u",
1148                           (uint) ((count->tree_pos - count->tree_buff) /
1149                                   count->field_length)));
1150       if (verbose >= 2)
1151         printf("number of distinct values: %u\n",
1152                     (uint) ((count->tree_pos - count->tree_buff) /
1153                             count->field_length));
1154     }
1155     total_count= 0;
1156     for (idx= 0; idx < 256; idx++)
1157     {
1158       if (count->counts[idx])
1159       {
1160         total_count+= count->counts[idx];
1161         DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1162                             llstr((longlong) count->counts[idx], llbuf)));
1163         if (verbose >= 2)
1164           printf("counts[0x%02x]: %12s\n", idx,
1165                       llstr((longlong) count->counts[idx], llbuf));
1166       }
1167     }
1168     DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1169                                                     llbuf)));
1170     if ((verbose >= 2) && total_count)
1171     {
1172       printf("total:        %12s\n",
1173                   llstr((longlong) total_count, llbuf));
1174     }
1175   }
1176 
1177   mrg->records=record_count;
1178   mrg->max_blob_length=max_blob_length;
1179   my_afree((uchar*) record);
1180   DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1181 }
1182 
compare_huff_elements(void * not_used,uchar * a,uchar * b)1183 static int compare_huff_elements(void *not_used __attribute__((unused)),
1184 				 uchar *a, uchar *b)
1185 {
1186   return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1187     (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1188 }
1189 
1190 	/* Check each tree if we should use pre-space-compress, end-space-
1191 	   compress, empty-field-compress or zero-field-compress */
1192 
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1193 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1194 			 my_off_t records)
1195 {
1196   uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1197   my_off_t old_length,new_length,length;
1198   DBUG_ENTER("check_counts");
1199 
1200   bzero((uchar*) field_count,sizeof(field_count));
1201   space_fields=fill_zero_fields=0;
1202 
1203   for (; trees-- ; huff_counts++)
1204   {
1205     if (huff_counts->field_type == FIELD_BLOB)
1206     {
1207       huff_counts->length_bits=max_bit(huff_counts->max_length);
1208       goto found_pack;
1209     }
1210     else if (huff_counts->field_type == FIELD_VARCHAR)
1211     {
1212       huff_counts->length_bits=max_bit(huff_counts->max_length);
1213       goto found_pack;
1214     }
1215     else if (huff_counts->field_type == FIELD_CHECK)
1216     {
1217       huff_counts->bytes_packed=0;
1218       huff_counts->counts[0]=0;
1219       goto found_pack;
1220     }
1221 
1222     huff_counts->field_type=FIELD_NORMAL;
1223     huff_counts->pack_type=0;
1224 
1225     /* Check for zero-filled records (in this column), or zero records. */
1226     if (huff_counts->zero_fields || ! records)
1227     {
1228       my_off_t old_space_count;
1229       /*
1230         If there are only zero filled records (in this column),
1231         or no records at all, we are done.
1232       */
1233       if (huff_counts->zero_fields == records)
1234       {
1235 	huff_counts->field_type= FIELD_ZERO;
1236 	huff_counts->bytes_packed=0;
1237 	huff_counts->counts[0]=0;
1238 	goto found_pack;
1239       }
1240       /* Remember the number of significant spaces. */
1241       old_space_count=huff_counts->counts[' '];
1242       /* Add all leading and trailing spaces. */
1243       huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1244                                   huff_counts->tot_pre_space +
1245                                   huff_counts->empty_fields *
1246                                   huff_counts->field_length);
1247       /* Check, what the compressed length of this would be. */
1248       old_length=calc_packed_length(huff_counts,0)+records/8;
1249       /* Get the number of zero bytes. */
1250       length=huff_counts->zero_fields*huff_counts->field_length;
1251       /* Add it to the counts. */
1252       huff_counts->counts[0]+=length;
1253       /* Check, what the compressed length of this would be. */
1254       new_length=calc_packed_length(huff_counts,0);
1255       /* If the compression without the zeroes would be shorter, we are done. */
1256       if (old_length < new_length && huff_counts->field_length > 1)
1257       {
1258 	huff_counts->field_type=FIELD_SKIP_ZERO;
1259 	huff_counts->counts[0]-=length;
1260 	huff_counts->bytes_packed=old_length- records/8;
1261 	goto found_pack;
1262       }
1263       /* Remove the insignificant spaces, but keep the zeroes. */
1264       huff_counts->counts[' ']=old_space_count;
1265     }
1266     /* Check, what the compressed length of this column would be. */
1267     huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1268 
1269     /*
1270       If there are enough empty records (in this column),
1271       treating them specially may pay off.
1272     */
1273     if (huff_counts->empty_fields)
1274     {
1275       if (huff_counts->field_length > 2 &&
1276 	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1277 	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1278 			 huff_counts->max_end_space))) <
1279 	  records * max_bit(huff_counts->field_length))
1280       {
1281 	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1282       }
1283       else
1284       {
1285 	length=huff_counts->empty_fields*huff_counts->field_length;
1286 	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1287 	{
1288 	  huff_counts->tot_end_space+=length;
1289 	  huff_counts->max_end_space=huff_counts->field_length;
1290 	  if (huff_counts->field_length < 8)
1291 	    huff_counts->end_space[huff_counts->field_length]+=
1292 	      huff_counts->empty_fields;
1293 	}
1294 	if (huff_counts->tot_pre_space)
1295 	{
1296 	  huff_counts->tot_pre_space+=length;
1297 	  huff_counts->max_pre_space=huff_counts->field_length;
1298 	  if (huff_counts->field_length < 8)
1299 	    huff_counts->pre_space[huff_counts->field_length]+=
1300 	      huff_counts->empty_fields;
1301 	}
1302       }
1303     }
1304 
1305     /*
1306       If there are enough trailing spaces (in this column),
1307       treating them specially may pay off.
1308     */
1309     if (huff_counts->tot_end_space)
1310     {
1311       huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1312       if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1313 			      huff_counts->end_space,
1314 			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1315 	goto found_pack;
1316       huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1317     }
1318 
1319     /*
1320       If there are enough leading spaces (in this column),
1321       treating them specially may pay off.
1322     */
1323     if (huff_counts->tot_pre_space)
1324     {
1325       if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1326 			      huff_counts->pre_space,
1327 			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1328 	goto found_pack;
1329     }
1330 
1331   found_pack:			/* Found field-packing */
1332 
1333     /* Test if we can use zero-fill */
1334 
1335     if (huff_counts->max_zero_fill &&
1336 	(huff_counts->field_type == FIELD_NORMAL ||
1337 	 huff_counts->field_type == FIELD_SKIP_ZERO))
1338     {
1339       huff_counts->counts[0]-=huff_counts->max_zero_fill*
1340 	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1341 	 records - huff_counts->zero_fields : records);
1342       huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1343       huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1344     }
1345 
1346     /* Test if intervall-field is better */
1347 
1348     if (huff_counts->tree_buff)
1349     {
1350       HUFF_TREE tree;
1351 
1352       DBUG_EXECUTE_IF("forceintervall",
1353                       huff_counts->bytes_packed= ~ (my_off_t) 0;);
1354       tree.element_buffer=0;
1355       if (!make_huff_tree(&tree,huff_counts) &&
1356 	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1357       {
1358 	if (tree.elements == 1)
1359 	  huff_counts->field_type=FIELD_CONSTANT;
1360 	else
1361 	  huff_counts->field_type=FIELD_INTERVALL;
1362 	huff_counts->pack_type=0;
1363       }
1364       else
1365       {
1366 	my_free(huff_counts->tree_buff);
1367 	delete_tree(&huff_counts->int_tree, 0);
1368 	huff_counts->tree_buff=0;
1369       }
1370       if (tree.element_buffer)
1371 	my_free(tree.element_buffer);
1372     }
1373     if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1374       space_fields++;
1375     if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1376       fill_zero_fields++;
1377     field_count[huff_counts->field_type]++;
1378   }
1379   DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1380                       "empty-zero:       %3d  empty-fill: %3d",
1381                       field_count[FIELD_NORMAL],space_fields,
1382                       field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1383   DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1384                       "intervall-fields: %3d  zero:       %3d",
1385                       field_count[FIELD_SKIP_PRESPACE],
1386                       field_count[FIELD_SKIP_ENDSPACE],
1387                       field_count[FIELD_INTERVALL],
1388                       field_count[FIELD_ZERO]));
1389   if (verbose)
1390     printf("\nnormal:    %3d  empty-space:     %3d  "
1391                 "empty-zero:       %3d  empty-fill: %3d\n"
1392                 "pre-space: %3d  end-space:       %3d  "
1393                 "intervall-fields: %3d  zero:       %3d\n",
1394                 field_count[FIELD_NORMAL],space_fields,
1395                 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1396                 field_count[FIELD_SKIP_PRESPACE],
1397                 field_count[FIELD_SKIP_ENDSPACE],
1398                 field_count[FIELD_INTERVALL],
1399                 field_count[FIELD_ZERO]);
1400   DBUG_VOID_RETURN;
1401 }
1402 
1403 	/* Test if we can use space-compression and empty-field-compression */
1404 
1405 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1406 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1407 		    uint max_space_length, my_off_t *space_counts,
1408 		    my_off_t tot_space_count, enum en_fieldtype field_type)
1409 {
1410   int min_pos;
1411   uint length_bits,i;
1412   my_off_t space_count,min_space_count,min_pack,new_length,skip;
1413 
1414   length_bits=max_bit(max_space_length);
1415 
1416 		/* Default no end_space-packing */
1417   space_count=huff_counts->counts[(uint) ' '];
1418   min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1419   min_pack=calc_packed_length(huff_counts,0);
1420   min_pos= -2;
1421   huff_counts->counts[(uint) ' ']=space_count;
1422 
1423 	/* Test with allways space-count */
1424   new_length=huff_counts->bytes_packed+length_bits*records/8;
1425   if (new_length+1 < min_pack)
1426   {
1427     min_pos= -1;
1428     min_pack=new_length;
1429     min_space_count=space_count;
1430   }
1431 	/* Test with length-flag */
1432   for (skip=0L, i=0 ; i < 8 ; i++)
1433   {
1434     if (space_counts[i])
1435     {
1436       if (i)
1437 	huff_counts->counts[(uint) ' ']+=space_counts[i];
1438       skip+=huff_counts->pre_space[i];
1439       new_length=calc_packed_length(huff_counts,0)+
1440 	(records+(records-skip)*(1+length_bits))/8;
1441       if (new_length < min_pack)
1442       {
1443 	min_pos=(int) i;
1444 	min_pack=new_length;
1445 	min_space_count=huff_counts->counts[(uint) ' '];
1446       }
1447     }
1448   }
1449 
1450   huff_counts->counts[(uint) ' ']=min_space_count;
1451   huff_counts->bytes_packed=min_pack;
1452   switch (min_pos) {
1453   case -2:
1454     return(0);				/* No space-compress */
1455   case -1:				/* Always space-count */
1456     huff_counts->field_type=field_type;
1457     huff_counts->min_space=0;
1458     huff_counts->length_bits=max_bit(max_space_length);
1459     break;
1460   default:
1461     huff_counts->field_type=field_type;
1462     huff_counts->min_space=(uint) min_pos;
1463     huff_counts->pack_type|=PACK_TYPE_SELECTED;
1464     huff_counts->length_bits=max_bit(max_space_length);
1465     break;
1466   }
1467   return(1);				/* Using space-compress */
1468 }
1469 
1470 
1471 	/* Make a huff_tree of each huff_count */
1472 
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1473 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1474 {
1475   uint tree;
1476   HUFF_TREE *huff_tree;
1477   DBUG_ENTER("make_huff_trees");
1478 
1479   if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1480 					 MYF(MY_WME | MY_ZEROFILL))))
1481     DBUG_RETURN(0);
1482 
1483   for (tree=0 ; tree < trees ; tree++)
1484   {
1485     if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1486     {
1487       while (tree--)
1488 	my_free(huff_tree[tree].element_buffer);
1489       my_free(huff_tree);
1490       DBUG_RETURN(0);
1491     }
1492   }
1493   DBUG_RETURN(huff_tree);
1494 }
1495 
1496 /*
1497   Build a Huffman tree.
1498 
1499   SYNOPSIS
1500     make_huff_tree()
1501     huff_tree                   The Huffman tree.
1502     huff_counts                 The counts.
1503 
1504   DESCRIPTION
1505     Build a Huffman tree according to huff_counts->counts or
1506     huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1507     tree_buff_length of distinct column values. In that case, whole
1508     values can be Huffman encoded instead of single bytes.
1509 
1510   RETURN
1511     0           OK
1512     != 0        Error
1513 */
1514 
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1515 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1516 {
1517   uint i,found,bits_packed,first,last;
1518   my_off_t bytes_packed;
1519   HUFF_ELEMENT *a,*b,*new_huff_el;
1520 
1521   first=last=0;
1522   if (huff_counts->tree_buff)
1523   {
1524     /* Calculate the number of distinct values in tree_buff. */
1525     found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1526       huff_counts->field_length;
1527     first=0; last=found-1;
1528   }
1529   else
1530   {
1531     /* Count the number of byte codes found in the column. */
1532     for (i=found=0 ; i < 256 ; i++)
1533     {
1534       if (huff_counts->counts[i])
1535       {
1536 	if (! found++)
1537 	  first=i;
1538 	last=i;
1539       }
1540     }
1541     if (found < 2)
1542       found=2;
1543   }
1544 
1545   /* When using 'tree_buff' we can have more that 256 values. */
1546   if (queue.max_elements < found)
1547   {
1548     delete_queue(&queue);
1549     if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
1550       return -1;
1551   }
1552 
1553   /* Allocate or reallocate an element buffer for the Huffman tree. */
1554   if (!huff_tree->element_buffer)
1555   {
1556     if (!(huff_tree->element_buffer=
1557 	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1558       return 1;
1559   }
1560   else
1561   {
1562     HUFF_ELEMENT *temp;
1563     if (!(temp=
1564 	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1565 				     found*2*sizeof(HUFF_ELEMENT),
1566 				     MYF(MY_WME))))
1567       return 1;
1568     huff_tree->element_buffer=temp;
1569   }
1570 
1571   huff_counts->tree=huff_tree;
1572   huff_tree->counts=huff_counts;
1573   huff_tree->min_chr=first;
1574   huff_tree->max_chr=last;
1575   huff_tree->char_bits=max_bit(last-first);
1576   huff_tree->offset_bits=max_bit(found-1)+1;
1577 
1578   if (huff_counts->tree_buff)
1579   {
1580     huff_tree->elements=0;
1581     huff_tree->tree_pack_length=(1+15+16+5+5+
1582 				 (huff_tree->char_bits+1)*found+
1583 				 (huff_tree->offset_bits+1)*
1584 				 (found-2)+7)/8 +
1585 				   (uint) (huff_tree->counts->tree_pos-
1586 					   huff_tree->counts->tree_buff);
1587     /*
1588       Put a HUFF_ELEMENT into the queue for every distinct column value.
1589 
1590       tree_walk() calls save_counts_in_queue() for every element in
1591       'int_tree'. This takes elements from the target trees element
1592       buffer and places references to them into the buffer of the
1593       priority queue. We insert in column value order, but the order is
1594       in fact irrelevant here. We will establish the correct order
1595       later.
1596     */
1597     tree_walk(&huff_counts->int_tree,
1598 	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1599 	      (uchar*) huff_tree, left_root_right);
1600   }
1601   else
1602   {
1603     huff_tree->elements=found;
1604     huff_tree->tree_pack_length=(9+9+5+5+
1605 				 (huff_tree->char_bits+1)*found+
1606 				 (huff_tree->offset_bits+1)*
1607 				 (found-2)+7)/8;
1608     /*
1609       Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1610 
1611       The elements are taken from the target trees element buffer.
1612       Instead of using queue_insert(), we just place references to the
1613       elements into the buffer of the priority queue. We insert in byte
1614       value order, but the order is in fact irrelevant here. We will
1615       establish the correct order later.
1616     */
1617     for (i=first, found=0 ; i <= last ; i++)
1618     {
1619       if (huff_counts->counts[i])
1620       {
1621 	new_huff_el=huff_tree->element_buffer+(found++);
1622 	new_huff_el->count=huff_counts->counts[i];
1623 	new_huff_el->a.leaf.null=0;
1624 	new_huff_el->a.leaf.element_nr=i;
1625 	queue.root[found]=(uchar*) new_huff_el;
1626       }
1627     }
1628     /*
1629       If there is only a single byte value in this field in all records,
1630       add a second element with zero incidence. This is required to enter
1631       the loop, which builds the Huffman tree.
1632     */
1633     while (found < 2)
1634     {
1635       new_huff_el=huff_tree->element_buffer+(found++);
1636       new_huff_el->count=0;
1637       new_huff_el->a.leaf.null=0;
1638       if (last)
1639 	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1640       else
1641 	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1642       queue.root[found]=(uchar*) new_huff_el;
1643     }
1644   }
1645 
1646   /* Make a queue from the queue buffer. */
1647   queue.elements=found;
1648 
1649   /*
1650     Make a priority queue from the queue. Construct its index so that we
1651     have a partially ordered tree.
1652   */
1653   queue_fix(&queue);
1654 
1655   /* The Huffman algorithm. */
1656   bytes_packed=0; bits_packed=0;
1657   for (i=1 ; i < found ; i++)
1658   {
1659     /*
1660       Pop the top element from the queue (the one with the least incidence).
1661       Popping from a priority queue includes a re-ordering of the queue,
1662       to get the next least incidence element to the top.
1663     */
1664     a=(HUFF_ELEMENT*) queue_remove_top(&queue);
1665     /* Copy the next least incidence element */
1666     b=(HUFF_ELEMENT*) queue_top(&queue);
1667     /* Get a new element from the element buffer. */
1668     new_huff_el=huff_tree->element_buffer+found+i;
1669     /* The new element gets the sum of the two least incidence elements. */
1670     new_huff_el->count=a->count+b->count;
1671     /*
1672       The Huffman algorithm assigns another bit to the code for a byte
1673       every time that bytes incidence is combined (directly or indirectly)
1674       to a new element as one of the two least incidence elements.
1675       This means that one more bit per incidence of that byte is required
1676       in the resulting file. So we add the new combined incidence as the
1677       number of bits by which the result grows.
1678     */
1679     bits_packed+=(uint) (new_huff_el->count & 7);
1680     bytes_packed+=new_huff_el->count/8;
1681     /* The new element points to its children, lesser in left.  */
1682     new_huff_el->a.nod.left=a;
1683     new_huff_el->a.nod.right=b;
1684     /*
1685       Replace the copied top element by the new element and re-order the
1686       queue.
1687     */
1688     queue_top(&queue)= (uchar*) new_huff_el;
1689     queue_replace_top(&queue);
1690   }
1691   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1692   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1693   return 0;
1694 }
1695 
compare_tree(void * cmp_arg,register const uchar * s,register const uchar * t)1696 static int compare_tree(void* cmp_arg __attribute__((unused)),
1697 			register const uchar *s, register const uchar *t)
1698 {
1699   uint length;
1700   for (length=global_count->field_length; length-- ;)
1701     if (*s++ != *t++)
1702       return (int) s[-1] - (int) t[-1];
1703   return 0;
1704 }
1705 
1706 /*
1707   Organize distinct column values and their incidences into a priority queue.
1708 
1709   SYNOPSIS
1710     save_counts_in_queue()
1711     key                         The column value.
1712     count                       The incidence of this value.
1713     tree                        The Huffman tree to be built later.
1714 
1715   DESCRIPTION
1716     We use the element buffer of the targeted tree. The distinct column
1717     values are organized in a priority queue first. The Huffman
1718     algorithm will later organize the elements into a Huffman tree. For
1719     the time being, we just place references to the elements into the
1720     queue buffer. The buffer will later be organized into a priority
1721     queue.
1722 
1723   RETURN
1724     0
1725  */
1726 
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1727 static int save_counts_in_queue(uchar *key, element_count count,
1728 				HUFF_TREE *tree)
1729 {
1730   HUFF_ELEMENT *new_huff_el;
1731 
1732   new_huff_el=tree->element_buffer+(tree->elements++);
1733   new_huff_el->count=count;
1734   new_huff_el->a.leaf.null=0;
1735   new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1736     tree->counts->field_length;
1737   queue.root[tree->elements]=(uchar*) new_huff_el;
1738   return 0;
1739 }
1740 
1741 
1742 /*
1743   Calculate length of file if given counts should be used.
1744 
1745   SYNOPSIS
1746     calc_packed_length()
1747     huff_counts                 The counts for a column of the table(s).
1748     add_tree_lenght             If the decode tree length should be added.
1749 
1750   DESCRIPTION
1751     We need to follow the Huffman algorithm until we know, how many bits
1752     are required for each byte code. But we do not need the resulting
1753     Huffman tree. Hence, we can leave out some steps which are essential
1754     in make_huff_tree().
1755 
1756   RETURN
1757     Number of bytes required to compress this table column.
1758 */
1759 
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1760 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1761 				   uint add_tree_lenght)
1762 {
1763   uint i,found,bits_packed,first,last;
1764   my_off_t bytes_packed;
1765   HUFF_ELEMENT element_buffer[256];
1766   DBUG_ENTER("calc_packed_length");
1767 
1768   /*
1769     WARNING: We use a small hack for efficiency: Instead of placing
1770     references to HUFF_ELEMENTs into the queue, we just insert
1771     references to the counts of the byte codes which appeared in this
1772     table column. During the Huffman algorithm they are successively
1773     replaced by references to HUFF_ELEMENTs. This works, because
1774     HUFF_ELEMENTs have the incidence count at their beginning.
1775     Regardless, wether the queue array contains references to counts of
1776     type my_off_t or references to HUFF_ELEMENTs which have the count of
1777     type my_off_t at their beginning, it always points to a count of the
1778     same type.
1779 
1780     Instead of using queue_insert(), we just copy the references into
1781     the buffer of the priority queue. We insert in byte value order, but
1782     the order is in fact irrelevant here. We will establish the correct
1783     order later.
1784   */
1785   first=last=0;
1786   for (i=found=0 ; i < 256 ; i++)
1787   {
1788     if (huff_counts->counts[i])
1789     {
1790       if (! found++)
1791 	first=i;
1792       last=i;
1793       /* We start with root[1], which is the queues top element. */
1794       queue.root[found]=(uchar*) &huff_counts->counts[i];
1795     }
1796   }
1797   if (!found)
1798     DBUG_RETURN(0);			/* Empty tree */
1799   /*
1800     If there is only a single byte value in this field in all records,
1801     add a second element with zero incidence. This is required to enter
1802     the loop, which follows the Huffman algorithm.
1803   */
1804   if (found < 2)
1805     queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1806 
1807   /* Make a queue from the queue buffer. */
1808   queue.elements=found;
1809 
1810   bytes_packed=0; bits_packed=0;
1811   /* Add the length of the coding table, which would become part of the file. */
1812   if (add_tree_lenght)
1813     bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1814 		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1815 
1816   /*
1817     Make a priority queue from the queue. Construct its index so that we
1818     have a partially ordered tree.
1819   */
1820   queue_fix(&queue);
1821 
1822   /* The Huffman algorithm. */
1823   for (i=0 ; i < found-1 ; i++)
1824   {
1825     my_off_t        *a;
1826     my_off_t        *b;
1827     HUFF_ELEMENT    *new_huff_el;
1828 
1829     /*
1830       Pop the top element from the queue (the one with the least
1831       incidence). Popping from a priority queue includes a re-ordering
1832       of the queue, to get the next least incidence element to the top.
1833     */
1834     a= (my_off_t*) queue_remove_top(&queue);
1835     /* Copy the next least incidence element. */
1836     b= (my_off_t*) queue_top(&queue);
1837     /* Create a new element in a local (automatic) buffer. */
1838     new_huff_el= element_buffer + i;
1839     /* The new element gets the sum of the two least incidence elements. */
1840     new_huff_el->count= *a + *b;
1841     /*
1842       The Huffman algorithm assigns another bit to the code for a byte
1843       every time that bytes incidence is combined (directly or indirectly)
1844       to a new element as one of the two least incidence elements.
1845       This means that one more bit per incidence of that byte is required
1846       in the resulting file. So we add the new combined incidence as the
1847       number of bits by which the result grows.
1848     */
1849     bits_packed+=(uint) (new_huff_el->count & 7);
1850     bytes_packed+=new_huff_el->count/8;
1851     /*
1852       Replace the copied top element by the new element and re-order the
1853       queue. This successively replaces the references to counts by
1854       references to HUFF_ELEMENTs.
1855     */
1856     queue_top(&queue)= (uchar*) new_huff_el;
1857     queue_replace_top(&queue);
1858   }
1859   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1860 }
1861 
1862 
1863 	/* Remove trees that don't give any compression */
1864 
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1865 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1866 {
1867   uint k,tree_number;
1868   HUFF_COUNTS count,*i,*j,*last_count;
1869 
1870   last_count=huff_counts+trees;
1871   for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1872   {
1873     if (!i->tree->tree_number)
1874     {
1875       i->tree->tree_number= ++tree_number;
1876       if (i->tree_buff)
1877 	continue;			/* Don't join intervall */
1878       for (j=i+1 ; j < last_count ; j++)
1879       {
1880 	if (! j->tree->tree_number && ! j->tree_buff)
1881 	{
1882 	  for (k=0 ; k < 256 ; k++)
1883 	    count.counts[k]=i->counts[k]+j->counts[k];
1884 	  if (calc_packed_length(&count,1) <=
1885 	      i->tree->bytes_packed + j->tree->bytes_packed+
1886 	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1887 	      ALLOWED_JOIN_DIFF)
1888 	  {
1889 	    memcpy(i->counts, count.counts,
1890 			 sizeof(count.counts[0])*256);
1891 	    my_free(j->tree->element_buffer);
1892 	    j->tree->element_buffer=0;
1893 	    j->tree=i->tree;
1894 	    bmove((uchar*) i->counts,(uchar*) count.counts,
1895 		  sizeof(count.counts[0])*256);
1896 	    if (make_huff_tree(i->tree,i))
1897 	      return (uint) -1;
1898 	  }
1899 	}
1900       }
1901     }
1902   }
1903   DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1904                       trees, tree_number));
1905   if (verbose)
1906     printf("Original trees:  %d  After join: %d\n", trees, tree_number);
1907   return tree_number;			/* Return trees left */
1908 }
1909 
1910 
1911 /*
1912   Fill in huff_tree encode tables.
1913 
1914   SYNOPSIS
1915     make_huff_decode_table()
1916     huff_tree               An array of HUFF_TREE which are to be encoded.
1917     trees                   The number of HUFF_TREE in the array.
1918 
1919   RETURN
1920     0           success
1921     != 0        error
1922 */
1923 
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1924 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1925 {
1926   uint elements;
1927   for ( ; trees-- ; huff_tree++)
1928   {
1929     if (huff_tree->tree_number > 0)
1930     {
1931       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1932       if (!(huff_tree->code =
1933             (ulonglong*) my_malloc(elements*
1934                                    (sizeof(ulonglong) + sizeof(uchar)),
1935                                    MYF(MY_WME | MY_ZEROFILL))))
1936 	return 1;
1937       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1938       make_traverse_code_tree(huff_tree, huff_tree->root,
1939                               8 * sizeof(ulonglong), 0);
1940     }
1941   }
1942   return 0;
1943 }
1944 
1945 
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)1946 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1947 				    HUFF_ELEMENT *element,
1948 				    uint size, ulonglong code)
1949 {
1950   uint chr;
1951   if (!element->a.leaf.null)
1952   {
1953     chr=element->a.leaf.element_nr;
1954     huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1955     huff_tree->code[chr]= (code >> size);
1956     if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1957         huff_tree->height= 8 * sizeof(ulonglong) - size;
1958   }
1959   else
1960   {
1961     size--;
1962     make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1963     make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1964 			    code + (((ulonglong) 1) << size));
1965   }
1966   return;
1967 }
1968 
1969 
1970 /*
1971   Convert a value into binary digits.
1972 
1973   SYNOPSIS
1974     bindigits()
1975     value                       The value.
1976     length                      The number of low order bits to convert.
1977 
1978   NOTE
1979     The result string is in static storage. It is reused on every call.
1980     So you cannot use it twice in one expression.
1981 
1982   RETURN
1983     A pointer to a static NUL-terminated string.
1984  */
1985 
bindigits(ulonglong value,uint bits)1986 static char *bindigits(ulonglong value, uint bits)
1987 {
1988   static char digits[72];
1989   char *ptr= digits;
1990   uint idx= bits;
1991 
1992   DBUG_ASSERT(idx < sizeof(digits));
1993   while (idx)
1994     *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
1995   *ptr= '\0';
1996   return digits;
1997 }
1998 
1999 
2000 /*
2001   Convert a value into hexadecimal digits.
2002 
2003   SYNOPSIS
2004     hexdigits()
2005     value                       The value.
2006 
2007   NOTE
2008     The result string is in static storage. It is reused on every call.
2009     So you cannot use it twice in one expression.
2010 
2011   RETURN
2012     A pointer to a static NUL-terminated string.
2013  */
2014 
hexdigits(ulonglong value)2015 static char *hexdigits(ulonglong value)
2016 {
2017   static char digits[20];
2018   char *ptr= digits;
2019   uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2020 
2021   DBUG_ASSERT(idx < sizeof(digits));
2022   while (idx)
2023   {
2024     if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2025       *(ptr - 1)+= 'a' - '9' - 1;
2026   }
2027   *ptr= '\0';
2028   return digits;
2029 }
2030 
2031 
2032 	/* Write header to new packed data file */
2033 
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2034 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2035 			my_off_t tot_elements,my_off_t filelength)
2036 {
2037   uchar *buff= (uchar*) file_buffer.pos;
2038 
2039   bzero(buff,HEAD_LENGTH);
2040   memcpy(buff,myisam_pack_file_magic,4);
2041   int4store(buff+4,head_length);
2042   int4store(buff+8, mrg->min_pack_length);
2043   int4store(buff+12,mrg->max_pack_length);
2044   int4store(buff+16,tot_elements);
2045   int4store(buff+20,intervall_length);
2046   int2store(buff+24,trees);
2047   buff[26]=(char) mrg->ref_length;
2048 	/* Save record pointer length */
2049   buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2050   if (test_only)
2051     return 0;
2052   my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2053   return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2054 		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2055 }
2056 
2057 	/* Write fieldinfo to new packed file */
2058 
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2059 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2060 {
2061   reg1 uint i;
2062   uint huff_tree_bits;
2063   huff_tree_bits=max_bit(trees ? trees-1 : 0);
2064 
2065   DBUG_PRINT("info", (" "));
2066   DBUG_PRINT("info", ("column types:"));
2067   DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2068   DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2069   DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2070   DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2071   DBUG_PRINT("info", ("FIELD_BLOB            4"));
2072   DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2073   DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2074   DBUG_PRINT("info", ("FIELD_ZERO            7"));
2075   DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2076   DBUG_PRINT("info", ("FIELD_CHECK           9"));
2077   DBUG_PRINT("info", (" "));
2078   DBUG_PRINT("info", ("pack type as a set of flags:"));
2079   DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2080   DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2081   DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2082   DBUG_PRINT("info", (" "));
2083   if (verbose >= 2)
2084   {
2085     printf("\n");
2086     printf("column types:\n");
2087     printf("FIELD_NORMAL          0\n");
2088     printf("FIELD_SKIP_ENDSPACE   1\n");
2089     printf("FIELD_SKIP_PRESPACE   2\n");
2090     printf("FIELD_SKIP_ZERO       3\n");
2091     printf("FIELD_BLOB            4\n");
2092     printf("FIELD_CONSTANT        5\n");
2093     printf("FIELD_INTERVALL       6\n");
2094     printf("FIELD_ZERO            7\n");
2095     printf("FIELD_VARCHAR         8\n");
2096     printf("FIELD_CHECK           9\n");
2097     printf("\n");
2098     printf("pack type as a set of flags:\n");
2099     printf("PACK_TYPE_SELECTED      1\n");
2100     printf("PACK_TYPE_SPACE_FIELDS  2\n");
2101     printf("PACK_TYPE_ZERO_FILL     4\n");
2102     printf("\n");
2103   }
2104   for (i=0 ; i++ < fields ; counts++)
2105   {
2106     write_bits((ulonglong) (int) counts->field_type, 5);
2107     write_bits(counts->pack_type,6);
2108     if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2109       write_bits(counts->max_zero_fill,5);
2110     else
2111       write_bits(counts->length_bits,5);
2112     write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2113     DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2114                         "lbits: %2u  tree: %2u  length: %4u",
2115                         i , counts->field_type, counts->pack_type,
2116                         counts->max_zero_fill, counts->length_bits,
2117                         counts->tree->tree_number, counts->field_length));
2118     if (verbose >= 2)
2119       printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2120                   "tree: %2u  length: %4u\n", i , counts->field_type,
2121                   counts->pack_type, counts->max_zero_fill, counts->length_bits,
2122                   counts->tree->tree_number, counts->field_length);
2123   }
2124   flush_bits();
2125   return;
2126 }
2127 
2128 	/* Write all huff_trees to new datafile. Return tot count of
2129 	   elements in all trees
2130 	   Returns 0 on error */
2131 
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2132 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2133 {
2134   uint i,int_length;
2135   uint tree_no;
2136   uint codes;
2137   uint errors= 0;
2138   uint *packed_tree,*offset,length;
2139   my_off_t elements;
2140 
2141   /* Find the highest number of elements in the trees. */
2142   for (i=length=0 ; i < trees ; i++)
2143     if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2144       length=huff_tree[i].elements;
2145   /*
2146     Allocate a buffer for packing a decode tree. Two numbers per element
2147     (left child and right child).
2148   */
2149   if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2150   {
2151     my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_FATALERROR),
2152              sizeof(uint)*length*2);
2153     return 0;
2154   }
2155 
2156   DBUG_PRINT("info", (" "));
2157   if (verbose >= 2)
2158     printf("\n");
2159   tree_no= 0;
2160   intervall_length=0;
2161   for (elements=0; trees-- ; huff_tree++)
2162   {
2163     /* Skip columns that have been joined with other columns. */
2164     if (huff_tree->tree_number == 0)
2165       continue;				/* Deleted tree */
2166     tree_no++;
2167     DBUG_PRINT("info", (" "));
2168     if (verbose >= 3)
2169       printf("\n");
2170     /* Count the total number of elements (byte codes or column values). */
2171     elements+=huff_tree->elements;
2172     huff_tree->max_offset=2;
2173     /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2174     if (huff_tree->elements <= 1)
2175       offset=packed_tree;
2176     else
2177       offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2178 
2179     /* This should be the same as 'length' above. */
2180     huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2181 
2182     /*
2183       Since we check this during collecting the distinct column values,
2184       this should never happen.
2185     */
2186     if (huff_tree->max_offset >= IS_OFFSET)
2187     {				/* This should be impossible */
2188       (void) fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2189                    huff_tree->max_offset);
2190       my_afree((uchar*) packed_tree);
2191       return 0;
2192     }
2193 
2194     DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2195                         "char_bits: %u\n",
2196                         (ulong) (file_buffer.pos - file_buffer.buffer),
2197                         huff_tree->elements, (ulong) (offset - packed_tree),
2198                         huff_tree->char_bits));
2199     if (!huff_tree->counts->tree_buff)
2200     {
2201       /* We do a byte compression on this column. Mark with bit 0. */
2202       write_bits(0,1);
2203       write_bits(huff_tree->min_chr,8);
2204       write_bits(huff_tree->elements,9);
2205       write_bits(huff_tree->char_bits,5);
2206       write_bits(huff_tree->offset_bits,5);
2207       int_length=0;
2208     }
2209     else
2210     {
2211       int_length=(uint) (huff_tree->counts->tree_pos -
2212 			 huff_tree->counts->tree_buff);
2213       /* We have distinct column values for this column. Mark with bit 1. */
2214       write_bits(1,1);
2215       write_bits(huff_tree->elements,15);
2216       write_bits(int_length,16);
2217       write_bits(huff_tree->char_bits,5);
2218       write_bits(huff_tree->offset_bits,5);
2219       intervall_length+=int_length;
2220     }
2221     DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2222                         "offset_bits: %2u  %s: %5u  codelen: %2u",
2223                         tree_no, huff_tree->elements, huff_tree->char_bits,
2224                         huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2225                         "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2226                         int_length : huff_tree->min_chr, huff_tree->height));
2227     if (verbose >= 2)
2228       printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2229                   "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2230                   huff_tree->char_bits, huff_tree->offset_bits,
2231                   huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2232                   huff_tree->counts->tree_buff ? int_length :
2233                   huff_tree->min_chr, huff_tree->height);
2234 
2235     /* Check that the code tree length matches the element count. */
2236     length=(uint) (offset-packed_tree);
2237     if (length != huff_tree->elements*2-2)
2238     {
2239       (void) fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2240                    length, huff_tree->elements * 2 - 2);
2241       errors++;
2242       break;
2243     }
2244 
2245     for (i=0 ; i < length ; i++)
2246     {
2247       if (packed_tree[i] & IS_OFFSET)
2248 	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2249 		   huff_tree->offset_bits+1);
2250       else
2251 	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2252       DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2253                           i, (packed_tree[i] & IS_OFFSET) ?
2254                           " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2255                           packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2256       if (verbose >= 3)
2257         printf("tree[0x%04x]: %s0x%04x\n",
2258                     i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2259                     (packed_tree[i] & IS_OFFSET) ?
2260                     packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2261     }
2262     flush_bits();
2263 
2264     /*
2265       Display coding tables and check their correctness.
2266     */
2267     codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2268     for (i= 0; i < codes; i++)
2269     {
2270       ulonglong code;
2271       uint bits;
2272       uint len;
2273       uint idx;
2274 
2275       if (! (len= huff_tree->code_len[i]))
2276         continue;
2277       DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2278                           hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2279                           bindigits(huff_tree->code[i],
2280                                     huff_tree->code_len[i])));
2281       if (verbose >= 3)
2282         printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2283                     hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2284                     bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2285 
2286       /* Check that the encode table decodes correctly. */
2287       code= 0;
2288       bits= 0;
2289       idx= 0;
2290       DBUG_EXECUTE_IF("forcechkerr1", len--;);
2291       DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2292       DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2293       for (;;)
2294       {
2295         if (! len)
2296         {
2297           (void) fflush(stdout);
2298           (void) fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2299                        hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2300           errors++;
2301           break;
2302         }
2303         code<<= 1;
2304         code|= (huff_tree->code[i] >> (--len)) & 1;
2305         bits++;
2306         if (bits > 8 * sizeof(code))
2307         {
2308           (void) fflush(stdout);
2309           (void) fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2310                        bits, (uint) (8 * sizeof(code)));
2311           errors++;
2312           break;
2313         }
2314         idx+= (uint) code & 1;
2315         if (idx >= length)
2316         {
2317           (void) fflush(stdout);
2318           (void) fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2319                        idx, length);
2320           errors++;
2321           break;
2322         }
2323         if (packed_tree[idx] & IS_OFFSET)
2324           idx+= packed_tree[idx] & ~IS_OFFSET;
2325         else
2326           break; /* Hit a leaf. This contains the result value. */
2327       }
2328       if (errors)
2329         break;
2330 
2331       DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2332       if (packed_tree[idx] != i)
2333       {
2334         (void) fflush(stdout);
2335         (void) fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2336                      packed_tree[idx], i);
2337         errors++;
2338         break;
2339       }
2340     } /*end for (codes)*/
2341     if (errors)
2342       break;
2343 
2344     /* Write column values in case of distinct column value compression. */
2345     if (huff_tree->counts->tree_buff)
2346     {
2347       for (i=0 ; i < int_length ; i++)
2348       {
2349  	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2350         DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2351                             i, (uchar) huff_tree->counts->tree_buff[i]));
2352         if (verbose >= 3)
2353           printf("column_values[0x%04x]: 0x%02x\n",
2354                       i, (uchar) huff_tree->counts->tree_buff[i]);
2355       }
2356     }
2357     flush_bits();
2358   }
2359   DBUG_PRINT("info", (" "));
2360   if (verbose >= 2)
2361     printf("\n");
2362   my_afree((uchar*) packed_tree);
2363   if (errors)
2364   {
2365     (void) fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2366     return 0;
2367   }
2368   return elements;
2369 }
2370 
2371 
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2372 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2373 				   uint *offset)
2374 {
2375   uint *prev_offset;
2376 
2377   prev_offset= offset;
2378   /*
2379     'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2380     then there is no left child and, hence no right child either. This
2381     is a property of a binary tree. An element is either a node with two
2382     childs, or a leaf without childs.
2383 
2384     The current element is always a node with two childs. Go left first.
2385   */
2386   if (!element->a.nod.left->a.leaf.null)
2387   {
2388     /* Store the byte code or the index of the column value. */
2389     prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2390     offset+=2;
2391   }
2392   else
2393   {
2394     /*
2395       Recursively traverse the tree to the left. Mark it as an offset to
2396       another tree node (in contrast to a byte code or column value index).
2397     */
2398     prev_offset[0]= IS_OFFSET+2;
2399     offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2400   }
2401 
2402   /* Now, check the right child. */
2403   if (!element->a.nod.right->a.leaf.null)
2404   {
2405     /* Store the byte code or the index of the column value. */
2406     prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2407     return offset;
2408   }
2409   else
2410   {
2411     /*
2412       Recursively traverse the tree to the right. Mark it as an offset to
2413       another tree node (in contrast to a byte code or column value index).
2414     */
2415     uint temp=(uint) (offset-prev_offset-1);
2416     prev_offset[1]= IS_OFFSET+ temp;
2417     if (huff_tree->max_offset < temp)
2418       huff_tree->max_offset = temp;
2419     return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2420   }
2421 }
2422 
2423 	/* Get number of bits neaded to represent value */
2424 
max_bit(register uint value)2425 static uint max_bit(register uint value)
2426 {
2427   reg2 uint power=1;
2428 
2429   while ((value>>=1))
2430     power++;
2431   return (power);
2432 }
2433 
2434 
compress_isam_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2435 static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2436 {
2437   int error;
2438   uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2439     intervall,field_length,max_pack_length,pack_blob_length;
2440   my_off_t record_count;
2441   char llbuf[32];
2442   ulong length,pack_length;
2443   uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2444   HUFF_COUNTS *count,*end_count;
2445   HUFF_TREE *tree;
2446   MI_INFO *isam_file=mrg->file[0];
2447   uint pack_version= (uint) isam_file->s->pack.version;
2448   DBUG_ENTER("compress_isam_file");
2449 
2450   /* Allocate a buffer for the records (excluding blobs). */
2451   if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2452     return -1;
2453 
2454   end_count=huff_counts+isam_file->s->base.fields;
2455   min_record_length= (uint) ~0;
2456   max_record_length=0;
2457 
2458   /*
2459     Calculate the maximum number of bits required to pack the records.
2460     Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2461     The tree height determines the maximum number of bits per value.
2462     Some fields skip leading or trailing spaces or zeroes. The skipped
2463     number of bytes is encoded by 'length_bits' bits.
2464     Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2465     and varchar get a leading 0 bit.
2466   */
2467   for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2468   {
2469     if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2470       huff_counts[i].max_zero_fill=0;
2471     if (huff_counts[i].field_type == FIELD_CONSTANT ||
2472 	huff_counts[i].field_type == FIELD_ZERO ||
2473 	huff_counts[i].field_type == FIELD_CHECK)
2474       continue;
2475     if (huff_counts[i].field_type == FIELD_INTERVALL)
2476       max_calc_length+=huff_counts[i].tree->height;
2477     else if (huff_counts[i].field_type == FIELD_BLOB ||
2478 	     huff_counts[i].field_type == FIELD_VARCHAR)
2479       max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2480     else
2481       max_calc_length+=
2482 	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2483 	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2484   }
2485   max_calc_length= (max_calc_length + 7) / 8;
2486   pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2487   record_count=0;
2488   /* 'max_blob_length' is the max length of all blobs of a record. */
2489   pack_blob_length= isam_file->s->base.blobs ?
2490                     calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2491   max_pack_length=pack_ref_length+pack_blob_length;
2492 
2493   DBUG_PRINT("fields", ("==="));
2494   mrg_reset(mrg);
2495   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2496   {
2497     ulong tot_blob_length=0;
2498     if (! error)
2499     {
2500       if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2501 	break;
2502       record_pos= (uchar*) file_buffer.pos;
2503       file_buffer.pos+=max_pack_length;
2504       for (start_pos=record, count= huff_counts; count < end_count ; count++)
2505       {
2506 	end_pos=start_pos+(field_length=count->field_length);
2507 	tree=count->tree;
2508 
2509         DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2510                               "lbits: %2u  tree: %2u  length: %4u",
2511                               (ulong) (count - huff_counts + 1),
2512                               count->field_type,
2513                               count->pack_type, count->max_zero_fill,
2514                               count->length_bits, count->tree->tree_number,
2515                               count->field_length));
2516 
2517         /* Check if the column contains spaces only. */
2518 	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2519 	{
2520 	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2521 	  if (pos == end_pos)
2522 	  {
2523             DBUG_PRINT("fields",
2524                        ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2525             DBUG_PRINT("fields", ("---"));
2526 	    write_bits(1,1);
2527 	    start_pos=end_pos;
2528 	    continue;
2529 	  }
2530           DBUG_PRINT("fields",
2531                      ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2532 	  write_bits(0,1);
2533 	}
2534 	end_pos-=count->max_zero_fill;
2535 	field_length-=count->max_zero_fill;
2536 
2537 	switch (count->field_type) {
2538 	case FIELD_SKIP_ZERO:
2539 	  if (!memcmp((uchar*) start_pos,zero_string,field_length))
2540 	  {
2541             DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2542 	    write_bits(1,1);
2543 	    start_pos=end_pos;
2544 	    break;
2545 	  }
2546           DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2547 	  write_bits(0,1);
2548 	  /* Fall through */
2549 	case FIELD_NORMAL:
2550           DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2551                                 (ulong) (end_pos - start_pos)));
2552 	  for ( ; start_pos < end_pos ; start_pos++)
2553           {
2554             DBUG_PRINT("fields",
2555                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2556                         (uchar) *start_pos,
2557                         hexdigits(tree->code[(uchar) *start_pos]),
2558                         (uint) tree->code_len[(uchar) *start_pos],
2559                         bindigits(tree->code[(uchar) *start_pos],
2560                                   (uint) tree->code_len[(uchar) *start_pos])));
2561 	    write_bits(tree->code[(uchar) *start_pos],
2562 		       (uint) tree->code_len[(uchar) *start_pos]);
2563           }
2564 	  break;
2565 	case FIELD_SKIP_ENDSPACE:
2566 	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2567 	  length= (ulong) (end_pos - pos);
2568 	  if (count->pack_type & PACK_TYPE_SELECTED)
2569 	  {
2570 	    if (length > count->min_space)
2571 	    {
2572               DBUG_PRINT("fields",
2573                          ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2574               DBUG_PRINT("fields",
2575                          ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2576                           length, field_length, count->length_bits));
2577 	      write_bits(1,1);
2578 	      write_bits(length,count->length_bits);
2579 	    }
2580 	    else
2581 	    {
2582               DBUG_PRINT("fields",
2583                          ("FIELD_SKIP_ENDSPACE not more than min_space, "
2584                           "bits:  1"));
2585 	      write_bits(0,1);
2586 	      pos=end_pos;
2587 	    }
2588 	  }
2589 	  else
2590           {
2591             DBUG_PRINT("fields",
2592                        ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2593                         length, field_length, count->length_bits));
2594 	    write_bits(length,count->length_bits);
2595           }
2596           /* Encode all significant bytes. */
2597           DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2598                                 (ulong) (pos - start_pos)));
2599 	  for ( ; start_pos < pos ; start_pos++)
2600           {
2601             DBUG_PRINT("fields",
2602                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2603                         (uchar) *start_pos,
2604                         hexdigits(tree->code[(uchar) *start_pos]),
2605                         (uint) tree->code_len[(uchar) *start_pos],
2606                         bindigits(tree->code[(uchar) *start_pos],
2607                                   (uint) tree->code_len[(uchar) *start_pos])));
2608 	    write_bits(tree->code[(uchar) *start_pos],
2609 		       (uint) tree->code_len[(uchar) *start_pos]);
2610           }
2611 	  start_pos=end_pos;
2612 	  break;
2613 	case FIELD_SKIP_PRESPACE:
2614 	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2615           length= (ulong) (pos - start_pos);
2616 	  if (count->pack_type & PACK_TYPE_SELECTED)
2617 	  {
2618 	    if (length > count->min_space)
2619 	    {
2620               DBUG_PRINT("fields",
2621                          ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2622               DBUG_PRINT("fields",
2623                          ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2624                           length, field_length, count->length_bits));
2625 	      write_bits(1,1);
2626 	      write_bits(length,count->length_bits);
2627 	    }
2628 	    else
2629 	    {
2630               DBUG_PRINT("fields",
2631                          ("FIELD_SKIP_PRESPACE not more than min_space, "
2632                           "bits:  1"));
2633 	      pos=start_pos;
2634 	      write_bits(0,1);
2635 	    }
2636 	  }
2637 	  else
2638           {
2639             DBUG_PRINT("fields",
2640                        ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2641                         length, field_length, count->length_bits));
2642 	    write_bits(length,count->length_bits);
2643           }
2644           /* Encode all significant bytes. */
2645           DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2646                                 (ulong) (end_pos - start_pos)));
2647 	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2648           {
2649             DBUG_PRINT("fields",
2650                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2651                         (uchar) *start_pos,
2652                         hexdigits(tree->code[(uchar) *start_pos]),
2653                         (uint) tree->code_len[(uchar) *start_pos],
2654                         bindigits(tree->code[(uchar) *start_pos],
2655                                   (uint) tree->code_len[(uchar) *start_pos])));
2656 	    write_bits(tree->code[(uchar) *start_pos],
2657 		       (uint) tree->code_len[(uchar) *start_pos]);
2658           }
2659 	  break;
2660 	case FIELD_CONSTANT:
2661 	case FIELD_ZERO:
2662 	case FIELD_CHECK:
2663           DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2664 	  start_pos=end_pos;
2665 	  break;
2666 	case FIELD_INTERVALL:
2667 	  global_count=count;
2668 	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2669 				  count->int_tree.custom_arg);
2670 	  intervall=(uint) (pos - count->tree_buff)/field_length;
2671           DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2672           DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2673                                 intervall, hexdigits(tree->code[intervall]),
2674                                 (uint) tree->code_len[intervall]));
2675 	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2676 	  start_pos=end_pos;
2677 	  break;
2678 	case FIELD_BLOB:
2679 	{
2680 	  ulong blob_length=_mi_calc_blob_length(field_length-
2681 						 portable_sizeof_char_ptr,
2682 						 start_pos);
2683           /* Empty blobs are encoded with a single 1 bit. */
2684 	  if (!blob_length)
2685 	  {
2686             DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2687             write_bits(1,1);
2688 	  }
2689 	  else
2690 	  {
2691 	    uchar *blob,*blob_end;
2692             DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2693 	    write_bits(0,1);
2694             /* Write the blob length. */
2695             DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2696                                   blob_length, count->length_bits));
2697 	    write_bits(blob_length,count->length_bits);
2698 	    memcpy(&blob, end_pos-portable_sizeof_char_ptr, sizeof(char*));
2699 	    blob_end=blob+blob_length;
2700             /* Encode the blob bytes. */
2701 	    for ( ; blob < blob_end ; blob++)
2702             {
2703               DBUG_PRINT("fields",
2704                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2705                           (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2706                           (uint) tree->code_len[(uchar) *blob],
2707                           bindigits(tree->code[(uchar) *start_pos],
2708                                     (uint)tree->code_len[(uchar) *start_pos])));
2709 	      write_bits(tree->code[(uchar) *blob],
2710 			 (uint) tree->code_len[(uchar) *blob]);
2711             }
2712 	    tot_blob_length+=blob_length;
2713 	  }
2714 	  start_pos= end_pos;
2715 	  break;
2716 	}
2717 	case FIELD_VARCHAR:
2718 	{
2719           uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2720           ulong col_length= (var_pack_length == 1 ?
2721                              (uint) *(uchar*) start_pos :
2722                              uint2korr(start_pos));
2723           /* Empty varchar are encoded with a single 1 bit. */
2724 	  if (!col_length)
2725 	  {
2726             DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2727 	    write_bits(1,1);			/* Empty varchar */
2728 	  }
2729 	  else
2730 	  {
2731 	    uchar *end= start_pos + var_pack_length + col_length;
2732             DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2733 	    write_bits(0,1);
2734             /* Write the varchar length. */
2735             DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2736                                   col_length, count->length_bits));
2737 	    write_bits(col_length,count->length_bits);
2738             /* Encode the varchar bytes. */
2739 	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2740             {
2741               DBUG_PRINT("fields",
2742                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2743                           (uchar) *start_pos,
2744                           hexdigits(tree->code[(uchar) *start_pos]),
2745                           (uint) tree->code_len[(uchar) *start_pos],
2746                           bindigits(tree->code[(uchar) *start_pos],
2747                                     (uint)tree->code_len[(uchar) *start_pos])));
2748 	      write_bits(tree->code[(uchar) *start_pos],
2749 			 (uint) tree->code_len[(uchar) *start_pos]);
2750             }
2751 	  }
2752 	  start_pos= end_pos;
2753 	  break;
2754 	}
2755 	case FIELD_LAST:
2756         case FIELD_enum_val_count:
2757 	  abort();				/* Impossible */
2758 	}
2759 	start_pos+=count->max_zero_fill;
2760         DBUG_PRINT("fields", ("---"));
2761       }
2762       flush_bits();
2763       length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2764       pack_length= save_pack_length(pack_version, record_pos, length);
2765       if (pack_blob_length)
2766 	pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2767 	                               tot_blob_length);
2768       DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2769                             "length-bytes: %lu", (ulong) record_count, length,
2770                             tot_blob_length, pack_length));
2771       DBUG_PRINT("fields", ("==="));
2772 
2773       /* Correct file buffer if the header was smaller */
2774       if (pack_length != max_pack_length)
2775       {
2776 	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2777 	file_buffer.pos-= (max_pack_length-pack_length);
2778       }
2779       if (length < (ulong) min_record_length)
2780 	min_record_length=(uint) length;
2781       if (length > (ulong) max_record_length)
2782 	max_record_length=(uint) length;
2783       record_count++;
2784       if (write_loop && record_count % WRITE_COUNT == 0)
2785       {
2786 	printf("%lu\r", (ulong) record_count);
2787         (void) fflush(stdout);
2788       }
2789     }
2790     else if (error != HA_ERR_RECORD_DELETED)
2791       break;
2792   }
2793   if (error == HA_ERR_END_OF_FILE)
2794     error=0;
2795   else
2796   {
2797     (void) fprintf(stderr, "%s: Got error %d reading records\n",
2798                  my_progname, error);
2799   }
2800   if (verbose >= 2)
2801     printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2802 
2803   my_afree((uchar*) record);
2804   mrg->ref_length=max_pack_length;
2805   mrg->min_pack_length=max_record_length ? min_record_length : 0;
2806   mrg->max_pack_length=max_record_length;
2807   DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2808 }
2809 
2810 
make_new_name(char * new_name,char * old_name)2811 static char *make_new_name(char *new_name, char *old_name)
2812 {
2813   return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2814 }
2815 
make_old_name(char * new_name,char * old_name)2816 static char *make_old_name(char *new_name, char *old_name)
2817 {
2818   return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2819 }
2820 
2821 	/* rutines for bit writing buffer */
2822 
init_file_buffer(File file,pbool read_buffer)2823 static void init_file_buffer(File file, pbool read_buffer)
2824 {
2825   file_buffer.file=file;
2826   file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2827 					 MYF(MY_WME));
2828   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2829   file_buffer.pos_in_file=0;
2830   error_on_write=0;
2831   if (read_buffer)
2832   {
2833 
2834     file_buffer.pos=file_buffer.end;
2835     file_buffer.bits=0;
2836   }
2837   else
2838   {
2839     file_buffer.pos=file_buffer.buffer;
2840     file_buffer.bits=BITS_SAVED;
2841   }
2842   file_buffer.bitbucket= 0;
2843 }
2844 
2845 
flush_buffer(ulong neaded_length)2846 static int flush_buffer(ulong neaded_length)
2847 {
2848   ulong length;
2849 
2850   /*
2851     file_buffer.end is 8 bytes lower than the real end of the buffer.
2852     This is done so that the end-of-buffer condition does not need to be
2853     checked for every byte (see write_bits()). Consequently,
2854     file_buffer.pos can become greater than file_buffer.end. The
2855     algorithms in the other functions ensure that there will never be
2856     more than 8 bytes written to the buffer without an end-of-buffer
2857     check. So the buffer cannot be overrun. But we need to check for the
2858     near-to-buffer-end condition to avoid a negative result, which is
2859     casted to unsigned and thus becomes giant.
2860   */
2861   if ((file_buffer.pos < file_buffer.end) &&
2862       ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2863     return 0;
2864   length=(ulong) (file_buffer.pos-file_buffer.buffer);
2865   file_buffer.pos=file_buffer.buffer;
2866   file_buffer.pos_in_file+=length;
2867   if (test_only)
2868     return 0;
2869   if (error_on_write|| my_write(file_buffer.file,
2870 				(const uchar*) file_buffer.buffer,
2871 				length,
2872 				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2873   {
2874     error_on_write=1;
2875     return 1;
2876   }
2877 
2878   if (neaded_length != ~(ulong) 0 &&
2879       (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2880   {
2881     char *tmp;
2882     neaded_length+=256;				/* some margin */
2883     tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2884     if (!tmp)
2885       return 1;
2886     file_buffer.pos= ((uchar*) tmp +
2887                       (ulong) (file_buffer.pos - file_buffer.buffer));
2888     file_buffer.buffer= (uchar*) tmp;
2889     file_buffer.end= (uchar*) (tmp+neaded_length-8);
2890   }
2891   return 0;
2892 }
2893 
2894 
end_file_buffer(void)2895 static void end_file_buffer(void)
2896 {
2897   my_free(file_buffer.buffer);
2898 }
2899 
2900 	/* output `bits` low bits of `value' */
2901 
write_bits(register ulonglong value,register uint bits)2902 static void write_bits(register ulonglong value, register uint bits)
2903 {
2904   DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2905               (bits == 8 * sizeof(value)));
2906 
2907   if ((file_buffer.bits-= (int) bits) >= 0)
2908   {
2909     file_buffer.bitbucket|= value << file_buffer.bits;
2910   }
2911   else
2912   {
2913     reg3 ulonglong bit_buffer;
2914     bits= (uint) -file_buffer.bits;
2915     bit_buffer= (file_buffer.bitbucket |
2916                  ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2917 #if BITS_SAVED == 64
2918     *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2919     *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2920     *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2921     *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2922 #endif
2923     *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2924     *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2925     *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2926     *file_buffer.pos++= (uchar) (bit_buffer);
2927 
2928     if (bits != 8 * sizeof(value))
2929       value&= (((ulonglong) 1) << bits) - 1;
2930     if (file_buffer.pos >= file_buffer.end)
2931       (void) flush_buffer(~ (ulong) 0);
2932     file_buffer.bits=(int) (BITS_SAVED - bits);
2933     file_buffer.bitbucket= value << (BITS_SAVED - bits);
2934   }
2935   return;
2936 }
2937 
2938 	/* Flush bits in bit_buffer to buffer */
2939 
flush_bits(void)2940 static void flush_bits(void)
2941 {
2942   int bits;
2943   ulonglong bit_buffer;
2944 
2945   bits= file_buffer.bits & ~7;
2946   bit_buffer= file_buffer.bitbucket >> bits;
2947   bits= BITS_SAVED - bits;
2948   while (bits > 0)
2949   {
2950     bits-= 8;
2951     *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2952   }
2953   if (file_buffer.pos >= file_buffer.end)
2954     (void) flush_buffer(~ (ulong) 0);
2955   file_buffer.bits= BITS_SAVED;
2956   file_buffer.bitbucket= 0;
2957 }
2958 
2959 
2960 /****************************************************************************
2961 ** functions to handle the joined files
2962 ****************************************************************************/
2963 
save_state(MI_INFO * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)2964 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2965 		      ha_checksum crc)
2966 {
2967   MYISAM_SHARE *share=isam_file->s;
2968   uint options=mi_uint2korr(share->state.header.options);
2969   uint key;
2970   DBUG_ENTER("save_state");
2971 
2972   options|= (HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA |
2973              (share->options & HA_OPTION_NULL_FIELDS));
2974   mi_int2store(share->state.header.options,options);
2975 
2976   share->state.state.data_file_length=new_length;
2977   share->state.state.del=0;
2978   share->state.state.empty=0;
2979   share->state.dellink= HA_OFFSET_ERROR;
2980   share->state.split=(ha_rows) mrg->records;
2981   share->state.version=(ulong) time((time_t*) 0);
2982   if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2983   {
2984     /*
2985       Some indexes are disabled, cannot use current key_file_length value
2986       as an estimate of upper bound of index file size. Use packed data file
2987       size instead.
2988     */
2989     share->state.state.key_file_length= new_length;
2990   }
2991   /*
2992     If there are no disabled indexes, keep key_file_length value from
2993     original file so "myisamchk -rq" can use this value (this is necessary
2994     because index size cannot be easily calculated for fulltext keys)
2995   */
2996   mi_clear_all_keys_active(share->state.key_map);
2997   for (key=0 ; key < share->base.keys ; key++)
2998     share->state.key_root[key]= HA_OFFSET_ERROR;
2999   for (key=0 ; key < share->state.header.max_block_size_index ; key++)
3000     share->state.key_del[key]= HA_OFFSET_ERROR;
3001   isam_file->state->checksum=crc;       /* Save crc here */
3002   share->changed=1;			/* Force write of header */
3003   share->state.open_count=0;
3004   share->global_changed=0;
3005   (void) my_chsize(share->kfile, share->base.keystart, 0, MYF(0));
3006   if (share->base.keys)
3007     isamchk_neaded=1;
3008   DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
3009 }
3010 
3011 
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3012 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3013 			  ha_checksum crc)
3014 {
3015   MI_STATE_INFO state;
3016   MI_INFO *isam_file=mrg->file[0];
3017   uint options;
3018   DBUG_ENTER("save_state_mrg");
3019 
3020   state= isam_file->s->state;
3021   options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3022 	    HA_OPTION_READ_ONLY_DATA |
3023             (isam_file->s->options & HA_OPTION_NULL_FIELDS));
3024   mi_int2store(state.header.options,options);
3025   state.state.data_file_length=new_length;
3026   state.state.del=0;
3027   state.state.empty=0;
3028   state.state.records=state.split=(ha_rows) mrg->records;
3029   /* See comment above in save_state about key_file_length handling. */
3030   if (mrg->src_file_has_indexes_disabled)
3031   {
3032     isam_file->s->state.state.key_file_length=
3033       MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3034   }
3035   state.dellink= HA_OFFSET_ERROR;
3036   state.version=(ulong) time((time_t*) 0);
3037   mi_clear_all_keys_active(state.key_map);
3038   state.state.checksum=crc;
3039   if (isam_file->s->base.keys)
3040     isamchk_neaded=1;
3041   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3042   DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3043 }
3044 
3045 
3046 /* reset for mrg_rrnd */
3047 
mrg_reset(PACK_MRG_INFO * mrg)3048 static void mrg_reset(PACK_MRG_INFO *mrg)
3049 {
3050   if (mrg->current)
3051   {
3052     mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3053     mrg->current=0;
3054   }
3055 }
3056 
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3057 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3058 {
3059   int error;
3060   MI_INFO *isam_info;
3061   my_off_t filepos;
3062 
3063   if (!info->current)
3064   {
3065     isam_info= *(info->current=info->file);
3066     info->end=info->current+info->count;
3067     mi_reset(isam_info);
3068     mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3069     filepos=isam_info->s->pack.header_length;
3070   }
3071   else
3072   {
3073     isam_info= *info->current;
3074     filepos= isam_info->nextpos;
3075   }
3076 
3077   for (;;)
3078   {
3079     isam_info->update&= HA_STATE_CHANGED;
3080     if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3081 					  filepos, 1)) ||
3082 	error != HA_ERR_END_OF_FILE)
3083       return (error);
3084     mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3085     if (info->current+1 == info->end)
3086       return(HA_ERR_END_OF_FILE);
3087     info->current++;
3088     isam_info= *info->current;
3089     filepos=isam_info->s->pack.header_length;
3090     mi_reset(isam_info);
3091     mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3092   }
3093 }
3094 
3095 
mrg_close(PACK_MRG_INFO * mrg)3096 static int mrg_close(PACK_MRG_INFO *mrg)
3097 {
3098   uint i;
3099   int error=0;
3100   for (i=0 ; i < mrg->count ; i++)
3101     error|=mi_close(mrg->file[i]);
3102   if (mrg->free_file)
3103     my_free(mrg->file);
3104   return error;
3105 }
3106 
3107 
3108 #if !defined(DBUG_OFF)
3109 /*
3110   Fake the counts to get big Huffman codes.
3111 
3112   SYNOPSIS
3113     fakebigcodes()
3114     huff_counts                 A pointer to the counts array.
3115     end_count                   A pointer past the counts array.
3116 
3117   DESCRIPTION
3118 
3119     Huffman coding works by removing the two least frequent values from
3120     the list of values and add a new value with the sum of their
3121     incidences in a loop until only one value is left. Every time a
3122     value is reused for a new value, it gets one more bit for its
3123     encoding. Hence, the least frequent values get the longest codes.
3124 
3125     To get a maximum code length for a value, two of the values must
3126     have an incidence of 1. As their sum is 2, the next infrequent value
3127     must have at least an incidence of 2, then 4, 8, 16 and so on. This
3128     means that one needs 2**n bytes (values) for a code length of n
3129     bits. However, using more distinct values forces the use of longer
3130     codes, or reaching the code length with less total bytes (values).
3131 
3132     To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3133     I assign counts of 1 to the two most frequent values, a count of 2
3134     for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3135     the remaining values get 1. That way every possible byte has an
3136     assigned code, though not all codes are used if not all byte values
3137     are present in the column.
3138 
3139     This strategy would work with distinct column values too, but
3140     requires that at least 64(32) values are present. To make things
3141     easier here, I cancel all distinct column values and force byte
3142     compression for all columns.
3143 
3144   RETURN
3145     void
3146 */
3147 
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3148 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3149 {
3150   HUFF_COUNTS   *count;
3151   my_off_t      *cur_count_p;
3152   my_off_t      *end_count_p;
3153   my_off_t      **cur_sort_p;
3154   my_off_t      **end_sort_p;
3155   my_off_t      *sort_counts[256];
3156   my_off_t      total;
3157   DBUG_ENTER("fakebigcodes");
3158 
3159   for (count= huff_counts; count < end_count; count++)
3160   {
3161     /*
3162       Remove distinct column values.
3163     */
3164     if (huff_counts->tree_buff)
3165     {
3166       my_free(huff_counts->tree_buff);
3167       delete_tree(&huff_counts->int_tree, 0);
3168       huff_counts->tree_buff= NULL;
3169       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3170     }
3171 
3172     /*
3173       Sort counts by decreasing incidence.
3174     */
3175     cur_count_p= count->counts;
3176     end_count_p= cur_count_p + 256;
3177     cur_sort_p= sort_counts;
3178     while (cur_count_p < end_count_p)
3179       *(cur_sort_p++)= cur_count_p++;
3180     (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3181 
3182     /*
3183       Assign faked counts.
3184     */
3185     cur_sort_p= sort_counts;
3186 #if SIZEOF_LONG_LONG > 4
3187     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3188 #else
3189     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3190 #endif
3191     /* Most frequent value gets a faked count of 1. */
3192     **(cur_sort_p++)= 1;
3193     total= 1;
3194     while (cur_sort_p < end_sort_p)
3195     {
3196       **(cur_sort_p++)= total;
3197       total<<= 1;
3198     }
3199     /* Set the last value. */
3200     **(cur_sort_p++)= --total;
3201     /*
3202       Set the remaining counts.
3203     */
3204     end_sort_p= sort_counts + 256;
3205     while (cur_sort_p < end_sort_p)
3206       **(cur_sort_p++)= 1;
3207   }
3208   DBUG_VOID_RETURN;
3209 }
3210 
3211 
3212 /*
3213   Compare two counts for reverse sorting.
3214 
3215   SYNOPSIS
3216     fakecmp()
3217     count1              One count.
3218     count2              Another count.
3219 
3220   RETURN
3221     1                   count1  < count2
3222     0                   count1 == count2
3223     -1                  count1 >  count2
3224 */
3225 
fakecmp(my_off_t ** count1,my_off_t ** count2)3226 static int fakecmp(my_off_t **count1, my_off_t **count2)
3227 {
3228   return ((**count1 < **count2) ? 1 :
3229           (**count1 > **count2) ? -1 : 0);
3230 }
3231 #endif
3232 
3233 #include "mi_extrafunc.h"
3234