1 /* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License, version 2.0,
5    as published by the Free Software Foundation.
6 
7    This program is also distributed with certain software (including
8    but not limited to OpenSSL) that is licensed under separate terms,
9    as designated in a particular file or component or in included license
10    documentation.  The authors of MySQL hereby grant you an additional
11    permission to link the program and your derivative works with the
12    separately licensed software that they have included with MySQL.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License, version 2.0, for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, write to the Free Software
21    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301  USA */
22 
23 /* Pack MyISAM file */
24 
25 #ifndef USE_MY_FUNC
26 #define USE_MY_FUNC			/* We need at least my_malloc */
27 #endif
28 
29 #include "myisamdef.h"
30 #include "my_default.h"
31 #include <queues.h>
32 #include <my_tree.h>
33 #include "mysys_err.h"
34 #ifndef __GNU_LIBRARY__
35 #define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
36 #endif
37 #include <my_getopt.h>
38 #include <assert.h>
39 #include <welcome_copyright_notice.h> // ORACLE_WELCOME_COPYRIGHT_NOTICE
40 
41 #if SIZEOF_LONG_LONG > 4
42 #define BITS_SAVED 64
43 #else
44 #define BITS_SAVED 32
45 #endif
46 
47 #define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
48 #define HEAD_LENGTH	32
49 #define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
50 
51 #define DATA_TMP_EXT		".TMD"
52 #define OLD_EXT			".OLD"
53 #define FRM_EXT                 ".frm"
54 #define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
55 
56 struct st_file_buffer {
57   File file;
58   uchar *buffer,*pos,*end;
59   my_off_t pos_in_file;
60   int bits;
61   ulonglong bitbucket;
62 };
63 
64 struct st_huff_tree;
65 struct st_huff_element;
66 
67 typedef struct st_huff_counts {
68   uint	field_length,max_zero_fill;
69   uint	pack_type;
70   uint	max_end_space,max_pre_space,length_bits,min_space;
71   ulong max_length;
72   enum en_fieldtype field_type;
73   struct st_huff_tree *tree;		/* Tree for field */
74   my_off_t counts[256];
75   my_off_t end_space[8];
76   my_off_t pre_space[8];
77   my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
78   TREE int_tree;        /* Tree for detecting distinct column values. */
79   uchar *tree_buff;      /* Column values, 'field_length' each. */
80   uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
81 } HUFF_COUNTS;
82 
83 typedef struct st_huff_element HUFF_ELEMENT;
84 
85 /*
86   WARNING: It is crucial for the optimizations in calc_packed_length()
87   that 'count' is the first element of 'HUFF_ELEMENT'.
88 */
89 struct st_huff_element {
90   my_off_t count;
91   union un_element {
92     struct st_nod {
93       HUFF_ELEMENT *left,*right;
94     } nod;
95     struct st_leaf {
96       HUFF_ELEMENT *null;
97       uint	element_nr;		/* Number of element */
98     } leaf;
99   } a;
100 };
101 
102 
103 typedef struct st_huff_tree {
104   HUFF_ELEMENT *root,*element_buffer;
105   HUFF_COUNTS *counts;
106   uint tree_number;
107   uint elements;
108   my_off_t bytes_packed;
109   uint tree_pack_length;
110   uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
111   ulonglong *code;
112   uchar *code_len;
113 } HUFF_TREE;
114 
115 
116 typedef struct st_isam_mrg {
117   MI_INFO **file,**current,**end;
118   uint free_file;
119   uint count;
120   uint	min_pack_length;		/* Theese is used by packed data */
121   uint	max_pack_length;
122   uint	ref_length;
123   uint	max_blob_length;
124   my_off_t records;
125   /* true if at least one source file has at least one disabled index */
126   my_bool src_file_has_indexes_disabled;
127 } PACK_MRG_INFO;
128 
129 
130 extern int main(int argc,char * *argv);
131 static void get_options(int *argc,char ***argv);
132 static MI_INFO *open_isam_file(char *name,int mode);
133 static my_bool open_isam_files(PACK_MRG_INFO *mrg,char **names,uint count);
134 static int compress(PACK_MRG_INFO *file,char *join_name);
135 static int create_dest_frm(char *source_table, char *dest_table);
136 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records);
137 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
138 					   uint trees,
139 					   HUFF_COUNTS *huff_counts,
140 					   uint fields);
141 static int compare_tree(void* cmp_arg MY_ATTRIBUTE((unused)),
142 			const uchar *s,const uchar *t);
143 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
144 static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
145 			 my_off_t records);
146 static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
147 			       uint max_space_length,my_off_t *space_counts,
148 			       my_off_t tot_space_count,
149 			       enum en_fieldtype field_type);
150 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
151 static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
152 static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
153 static int save_counts_in_queue(uchar *key,element_count count,
154 				    HUFF_TREE *tree);
155 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
156 static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
157 static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
158 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
159 				    HUFF_ELEMENT *element,uint size,
160 				    ulonglong code);
161 static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
162 			my_off_t tot_elements,my_off_t filelength);
163 static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
164 static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
165 static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
166 				       HUFF_ELEMENT *element,
167 				       uint *offset);
168 static uint max_bit(uint value);
169 static int compress_isam_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
170 static char *make_new_name(char *new_name,char *old_name);
171 static char *make_old_name(char *new_name,char *old_name);
172 static void init_file_buffer(File file,pbool read_buffer);
173 static int flush_buffer(ulong neaded_length);
174 static void end_file_buffer(void);
175 static void write_bits(ulonglong value, uint bits);
176 static void flush_bits(void);
177 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
178 		      ha_checksum crc);
179 static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,my_off_t new_length,
180 			  ha_checksum crc);
181 static int mrg_close(PACK_MRG_INFO *mrg);
182 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
183 static void mrg_reset(PACK_MRG_INFO *mrg);
184 #if !defined(DBUG_OFF)
185 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
186 static int fakecmp(my_off_t **count1, my_off_t **count2);
187 #endif
188 
189 
190 static int error_on_write=0,test_only=0,verbose=0,silent=0,
191 	   write_loop=0,force_pack=0, isamchk_neaded=0;
192 static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
193 static my_bool backup, opt_wait;
194 /*
195   tree_buff_length is somewhat arbitrary. The bigger it is the better
196   the chance to win in terms of compression factor. On the other hand,
197   this table becomes part of the compressed file header. And its length
198   is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
199 */
200 static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
201 static char tmp_dir[FN_REFLEN]={0},*join_table;
202 static my_off_t intervall_length;
203 static ha_checksum glob_crc;
204 static struct st_file_buffer file_buffer;
205 static QUEUE queue;
206 static HUFF_COUNTS *global_count;
207 static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
208 static const char *load_default_groups[]= { "myisampack",0 };
209 
210 	/* The main program */
211 
main(int argc,char ** argv)212 int main(int argc, char **argv)
213 {
214   int error,ok;
215   PACK_MRG_INFO merge;
216   char **default_argv;
217   MY_INIT(argv[0]);
218 
219   if (load_defaults("my",load_default_groups,&argc,&argv))
220     exit(1);
221 
222   default_argv= argv;
223   get_options(&argc,&argv);
224 
225   error=ok=isamchk_neaded=0;
226   if (join_table)
227   {
228     /*
229       Join files into one and create FRM file for the compressed table only if
230       the compression succeeds
231     */
232     if (open_isam_files(&merge,argv,(uint) argc) ||
233 	compress(&merge, join_table) || create_dest_frm(argv[0], join_table))
234       error=1;
235   }
236   else while (argc--)
237   {
238     MI_INFO *isam_file;
239     if (!(isam_file=open_isam_file(*argv++,O_RDWR)))
240       error=1;
241     else
242     {
243       merge.file= &isam_file;
244       merge.current=0;
245       merge.free_file=0;
246       merge.count=1;
247       if (compress(&merge,0))
248 	error=1;
249       else
250 	ok=1;
251     }
252   }
253   if (ok && isamchk_neaded && !silent)
254     puts("Remember to run myisamchk -rq on compressed tables");
255   (void) fflush(stdout);
256   (void) fflush(stderr);
257   free_defaults(default_argv);
258   my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
259   exit(error ? 2 : 0);
260 #ifndef _lint
261   return 0;					/* No compiler warning */
262 #endif
263 }
264 
265 enum options_mp {OPT_CHARSETS_DIR_MP=256};
266 
267 static struct my_option my_long_options[] =
268 {
269   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
270    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
271   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
272    "Directory where character sets are.", &charsets_dir,
273    &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
274   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
275    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
276   {"force", 'f',
277    "Force packing of table even if it gets bigger or if tempfile exists.",
278    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
279   {"join", 'j',
280    "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
281    &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
282    0, 0, 0},
283   {"help", '?', "Display this help and exit.",
284    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
285   {"silent", 's', "Be more silent.",
286    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
287   {"tmpdir", 'T', "Use temporary directory to store temporary table.",
288    0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
289   {"test", 't', "Don't pack table, only test packing it.",
290    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
291   {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
292    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
293   {"version", 'V', "Output version information and exit.",
294    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
295   {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
296    &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
297   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
298 };
299 
300 
print_version(void)301 static void print_version(void)
302 {
303   printf("%s Ver 1.23 for %s on %s\n",
304               my_progname, SYSTEM_TYPE, MACHINE_TYPE);
305 }
306 
307 
usage(void)308 static void usage(void)
309 {
310   print_version();
311   puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2002"));
312 
313   puts("Pack a MyISAM-table to take much less space.");
314   puts("Keys are not updated, you must run myisamchk -rq on the datafile");
315   puts("afterwards to update the keys.");
316   puts("You should give the .MYI file as the filename argument.");
317 
318   printf("\nUsage: %s [OPTIONS] filename...\n", my_progname);
319   my_print_help(my_long_options);
320   print_defaults("my", load_default_groups);
321   my_print_variables(my_long_options);
322 }
323 
324 
325 static my_bool
get_one_option(int optid,const struct my_option * opt MY_ATTRIBUTE ((unused)),char * argument)326 get_one_option(int optid, const struct my_option *opt MY_ATTRIBUTE((unused)),
327 	       char *argument)
328 {
329   uint length;
330 
331   switch(optid) {
332   case 'f':
333     force_pack= 1;
334     tmpfile_createflag= O_RDWR | O_TRUNC;
335     break;
336   case 's':
337     write_loop= verbose= 0;
338     silent= 1;
339     break;
340   case 't':
341     test_only= 1;
342     /* Avoid to reset 'verbose' if it was already set > 1. */
343     if (! verbose)
344       verbose= 1;
345     break;
346   case 'T':
347     length= (uint) (strmov(tmp_dir, argument) - tmp_dir);
348     if (length != dirname_length(tmp_dir))
349     {
350       tmp_dir[length]=FN_LIBCHAR;
351       tmp_dir[length+1]=0;
352     }
353     break;
354   case 'v':
355     verbose++; /* Allow for selecting the level of verbosity. */
356     silent= 0;
357     break;
358   case '#':
359     DBUG_PUSH(argument ? argument : "d:t:o");
360     break;
361   case 'V':
362     print_version();
363     exit(0);
364   case 'I':
365   case '?':
366     usage();
367     exit(0);
368   }
369   return 0;
370 }
371 
372 	/* reads options */
373 	/* Initiates DEBUG - but no debugging here ! */
374 
get_options(int * argc,char *** argv)375 static void get_options(int *argc,char ***argv)
376 {
377   int ho_error;
378 
379   my_progname= argv[0][0];
380   if (isatty(fileno(stdout)))
381     write_loop=1;
382 
383   if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
384     exit(ho_error);
385 
386   if (!*argc)
387   {
388     usage();
389     exit(1);
390   }
391   if (join_table)
392   {
393     backup=0;					/* Not needed */
394     tmp_dir[0]=0;
395   }
396   return;
397 }
398 
399 
open_isam_file(char * name,int mode)400 static MI_INFO *open_isam_file(char *name,int mode)
401 {
402   MI_INFO *isam_file;
403   MYISAM_SHARE *share;
404   DBUG_ENTER("open_isam_file");
405 
406   if (!(isam_file=mi_open(name,mode,
407 			  (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
408 			   HA_OPEN_ABORT_IF_LOCKED))))
409   {
410     (void) fprintf(stderr, "%s gave error %d on open\n", name, my_errno);
411     DBUG_RETURN(0);
412   }
413   share=isam_file->s;
414   if (share->options & HA_OPTION_COMPRESS_RECORD && !join_table)
415   {
416     if (!force_pack)
417     {
418       (void) fprintf(stderr, "%s is already compressed\n", name);
419       (void) mi_close(isam_file);
420       DBUG_RETURN(0);
421     }
422     if (verbose)
423       puts("Recompressing already compressed table");
424     share->options&= ~HA_OPTION_READ_ONLY_DATA; /* We are modifing it */
425   }
426   if (! force_pack && share->state.state.records != 0 &&
427       (share->state.state.records <= 1 ||
428        share->state.state.data_file_length < 1024))
429   {
430     (void) fprintf(stderr, "%s is too small to compress\n", name);
431     (void) mi_close(isam_file);
432     DBUG_RETURN(0);
433   }
434   (void) mi_lock_database(isam_file,F_WRLCK);
435   DBUG_RETURN(isam_file);
436 }
437 
438 
open_isam_files(PACK_MRG_INFO * mrg,char ** names,uint count)439 static my_bool open_isam_files(PACK_MRG_INFO *mrg, char **names, uint count)
440 {
441   uint i,j;
442   mrg->count=0;
443   mrg->current=0;
444   mrg->file=(MI_INFO**) my_malloc(sizeof(MI_INFO*)*count,MYF(MY_FAE));
445   mrg->free_file=1;
446   mrg->src_file_has_indexes_disabled= 0;
447   for (i=0; i < count ; i++)
448   {
449     if (!(mrg->file[i]=open_isam_file(names[i],O_RDONLY)))
450       goto error;
451 
452     mrg->src_file_has_indexes_disabled|=
453       ! mi_is_all_keys_active(mrg->file[i]->s->state.key_map,
454                               mrg->file[i]->s->base.keys);
455   }
456   /* Check that files are identical */
457   for (j=0 ; j < count-1 ; j++)
458   {
459     MI_COLUMNDEF *m1,*m2,*end;
460     if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
461 	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
462       goto diff_file;
463     m1=mrg->file[j]->s->rec;
464     end=m1+mrg->file[j]->s->base.fields;
465     m2=mrg->file[j+1]->s->rec;
466     for ( ; m1 != end ; m1++,m2++)
467     {
468       if (m1->type != m2->type || m1->length != m2->length)
469 	goto diff_file;
470     }
471   }
472   mrg->count=count;
473   return 0;
474 
475  diff_file:
476   (void) fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
477                my_progname, names[j], names[j+1]);
478  error:
479   while (i--)
480     mi_close(mrg->file[i]);
481   my_free(mrg->file);
482   return 1;
483 }
484 
485 
compress(PACK_MRG_INFO * mrg,char * result_table)486 static int compress(PACK_MRG_INFO *mrg,char *result_table)
487 {
488   int error;
489   File new_file,join_isam_file;
490   MI_INFO *isam_file;
491   MYISAM_SHARE *share;
492   char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
493   uint i,header_length,fields,trees,used_trees;
494   my_off_t old_length,new_length,tot_elements;
495   HUFF_COUNTS *huff_counts;
496   HUFF_TREE *huff_trees;
497   DBUG_ENTER("compress");
498 
499   isam_file=mrg->file[0];			/* Take this as an example */
500   share=isam_file->s;
501   new_file=join_isam_file= -1;
502   trees=fields=0;
503   huff_trees=0;
504   huff_counts=0;
505 
506   /* Create temporary or join file */
507 
508   if (backup)
509     (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2);
510   else
511     (void) fn_format(org_name,isam_file->filename,"",MI_NAME_DEXT,2+4+16);
512   if (!test_only && result_table)
513   {
514     /* Make a new indexfile based on first file in list */
515     uint length;
516     uchar *buff;
517     strmov(org_name,result_table);		/* Fix error messages */
518     (void) fn_format(new_name,result_table,"",MI_NAME_IEXT,2);
519     if ((join_isam_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
520 	< 0)
521       goto err;
522     length=(uint) share->base.keystart;
523     if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
524       goto err;
525     if (my_pread(share->kfile,buff,length,0L,MYF(MY_WME | MY_NABP)) ||
526 	my_write(join_isam_file,buff,length,
527 		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
528     {
529       my_free(buff);
530       goto err;
531     }
532     my_free(buff);
533     (void) fn_format(new_name,result_table,"",MI_NAME_DEXT,2);
534   }
535   else if (!tmp_dir[0])
536     (void) make_new_name(new_name,org_name);
537   else
538     (void) fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4);
539   if (!test_only &&
540       (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
541     goto err;
542 
543   /* Start calculating statistics */
544 
545   mrg->records=0;
546   for (i=0 ; i < mrg->count ; i++)
547     mrg->records+=mrg->file[i]->s->state.state.records;
548 
549   DBUG_PRINT("info", ("Compressing %s: (%lu records)",
550                       result_table ? new_name : org_name,
551                       (ulong) mrg->records));
552   if (write_loop || verbose)
553   {
554     printf("Compressing %s: (%lu records)\n",
555                 result_table ? new_name : org_name, (ulong) mrg->records);
556   }
557   trees=fields=share->base.fields;
558   huff_counts=init_huff_count(isam_file,mrg->records);
559 
560   /*
561     Read the whole data file(s) for statistics.
562   */
563   DBUG_PRINT("info", ("- Calculating statistics"));
564   if (write_loop || verbose)
565     printf("- Calculating statistics\n");
566   if (get_statistic(mrg,huff_counts))
567     goto err;
568 
569   old_length=0;
570   for (i=0; i < mrg->count ; i++)
571     old_length+= (mrg->file[i]->s->state.state.data_file_length -
572 		  mrg->file[i]->s->state.state.empty);
573 
574   /*
575     Create a global priority queue in preparation for making
576     temporary Huffman trees.
577   */
578   if (init_queue(&queue,256,0,0,compare_huff_elements,0))
579     goto err;
580 
581   /*
582     Check each column if we should use pre-space-compress, end-space-
583     compress, empty-field-compress or zero-field-compress.
584   */
585   check_counts(huff_counts,fields,mrg->records);
586 
587   /*
588     Build a Huffman tree for each column.
589   */
590   huff_trees=make_huff_trees(huff_counts,trees);
591 
592   /*
593     If the packed lengths of combined columns is less then the sum of
594     the non-combined columns, then create common Huffman trees for them.
595     We do this only for byte compressed columns, not for distinct values
596     compressed columns.
597   */
598   if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
599     goto err;
600 
601   /*
602     Assign codes to all byte or column values.
603   */
604   if (make_huff_decode_table(huff_trees,fields))
605     goto err;
606 
607   /* Prepare a file buffer. */
608   init_file_buffer(new_file,0);
609 
610   /*
611     Reserve space in the target file for the fixed compressed file header.
612   */
613   file_buffer.pos_in_file=HEAD_LENGTH;
614   if (! test_only)
615     my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0));
616 
617   /*
618     Write field infos: field type, pack type, length bits, tree number.
619   */
620   write_field_info(huff_counts,fields,used_trees);
621 
622   /*
623     Write decode trees.
624   */
625   if (!(tot_elements=write_huff_tree(huff_trees,trees)))
626     goto err;
627 
628   /*
629     Calculate the total length of the compression info header.
630     This includes the fixed compressed file header, the column compression
631     type descriptions, and the decode trees.
632   */
633   header_length=(uint) file_buffer.pos_in_file+
634     (uint) (file_buffer.pos-file_buffer.buffer);
635 
636   /*
637     Compress the source file into the target file.
638   */
639   DBUG_PRINT("info", ("- Compressing file"));
640   if (write_loop || verbose)
641     printf("- Compressing file\n");
642   error=compress_isam_file(mrg,huff_counts);
643   new_length=file_buffer.pos_in_file;
644   if (!error && !test_only)
645   {
646     uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
647     memset(buff, 0, sizeof(buff));
648     error=my_write(file_buffer.file,buff,sizeof(buff),
649 		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
650   }
651 
652   /*
653     Write the fixed compressed file header.
654   */
655   if (!error)
656     error=write_header(mrg,header_length,used_trees,tot_elements,
657 		       new_length);
658 
659   /* Flush the file buffer. */
660   end_file_buffer();
661 
662   /* Display statistics. */
663   DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
664                       "Mean total length: %6ld\n",
665                       mrg->min_pack_length, mrg->max_pack_length,
666                       (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
667   if (verbose && mrg->records)
668     printf("Min record length: %6d   Max length: %6d   "
669                 "Mean total length: %6ld\n", mrg->min_pack_length,
670                 mrg->max_pack_length, (ulong) (new_length/mrg->records));
671 
672   /* Close source and target file. */
673   if (!test_only)
674   {
675     error|=my_close(new_file,MYF(MY_WME));
676     if (!result_table)
677     {
678       error|=my_close(isam_file->dfile,MYF(MY_WME));
679       isam_file->dfile= -1;		/* Tell mi_close file is closed */
680     }
681   }
682 
683   /* Cleanup. */
684   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
685   if (! test_only && ! error)
686   {
687     if (result_table)
688     {
689       error=save_state_mrg(join_isam_file,mrg,new_length,glob_crc);
690     }
691     else
692     {
693       if (backup)
694       {
695 	if (my_rename(org_name,make_old_name(temp_name,isam_file->filename),
696 		      MYF(MY_WME)))
697 	  error=1;
698 	else
699 	{
700 	  if (tmp_dir[0])
701 	    error=my_copy(new_name,org_name,MYF(MY_WME));
702 	  else
703 	    error=my_rename(new_name,org_name,MYF(MY_WME));
704 	  if (!error)
705           {
706 	    (void) my_copystat(temp_name,org_name,MYF(MY_COPYTIME));
707             if (tmp_dir[0])
708               (void) my_delete(new_name,MYF(MY_WME));
709           }
710 	}
711       }
712       else
713       {
714 	if (tmp_dir[0])
715         {
716 	  error=my_copy(new_name,org_name,
717 			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
718           if (!error)
719             (void) my_delete(new_name,MYF(MY_WME));
720         }
721 	else
722 	  error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
723       }
724       if (! error)
725 	error=save_state(isam_file,mrg,new_length,glob_crc);
726     }
727   }
728   error|=mrg_close(mrg);
729   if (join_isam_file >= 0)
730     error|=my_close(join_isam_file,MYF(MY_WME));
731   if (error)
732   {
733     (void) fprintf(stderr, "Aborting: %s is not compressed\n", org_name);
734     (void) my_delete(new_name,MYF(MY_WME));
735     DBUG_RETURN(-1);
736   }
737   if (write_loop || verbose)
738   {
739     if (old_length)
740       printf("%.4g%%     \n",
741                   (((longlong) (old_length - new_length)) * 100.0 /
742                    (longlong) old_length));
743     else
744       puts("Empty file saved in compressed format");
745   }
746   DBUG_RETURN(0);
747 
748  err:
749   free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
750   if (new_file >= 0)
751     (void) my_close(new_file,MYF(0));
752   if (join_isam_file >= 0)
753     (void) my_close(join_isam_file,MYF(0));
754   mrg_close(mrg);
755   (void) fprintf(stderr, "Aborted: %s is not compressed\n", org_name);
756   DBUG_RETURN(-1);
757 }
758 
759 
760 /**
761   Create FRM for the destination table for --join operation
762   Copy the first table FRM as the destination table FRM file. Doing so
763   will help the mysql server to recognize the newly created table.
764   See Bug#36573.
765 
766   @param    source_table    Name of the source table
767   @param    dest_table      Name of the destination table
768   @retval   0               Successful copy operation
769 
770   @note  We always return 0 because we don't want myisampack to report error
771   even if the copy operation fails.
772 */
773 
create_dest_frm(char * source_table,char * dest_table)774 static int create_dest_frm(char *source_table, char *dest_table)
775 {
776   char source_name[FN_REFLEN], dest_name[FN_REFLEN];
777 
778   DBUG_ENTER("create_dest_frm");
779 
780   (void) fn_format(source_name, source_table,
781                    "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
782   (void) fn_format(dest_name, dest_table,
783                    "", FRM_EXT, MY_UNPACK_FILENAME | MY_RESOLVE_SYMLINKS);
784   /*
785     Error messages produced by my_copy() are suppressed as this
786     is not vital for --join operation. User shouldn't see any error messages
787     like "source file frm not found" and "unable to create destination frm
788     file. So we don't pass the flag MY_WME -Write Message on Error to
789     my_copy()
790   */
791   (void) my_copy(source_name, dest_name, MYF(MY_DONT_OVERWRITE_FILE));
792 
793   DBUG_RETURN(0);
794 }
795 
796 
797 	/* Init a huff_count-struct for each field and init it */
798 
init_huff_count(MI_INFO * info,my_off_t records)799 static HUFF_COUNTS *init_huff_count(MI_INFO *info,my_off_t records)
800 {
801   reg2 uint i;
802   reg1 HUFF_COUNTS *count;
803   if ((count = (HUFF_COUNTS*) my_malloc(info->s->base.fields*
804 					sizeof(HUFF_COUNTS),
805 					MYF(MY_ZEROFILL | MY_WME))))
806   {
807     for (i=0 ; i < info->s->base.fields ; i++)
808     {
809       enum en_fieldtype type;
810       count[i].field_length=info->s->rec[i].length;
811       type= count[i].field_type= (enum en_fieldtype) info->s->rec[i].type;
812       if (type == FIELD_INTERVALL ||
813 	  type == FIELD_CONSTANT ||
814 	  type == FIELD_ZERO)
815 	type = FIELD_NORMAL;
816       if (count[i].field_length <= 8 &&
817 	  (type == FIELD_NORMAL ||
818 	   type == FIELD_SKIP_ZERO))
819 	count[i].max_zero_fill= count[i].field_length;
820       /*
821         For every column initialize a tree, which is used to detect distinct
822         column values. 'int_tree' works together with 'tree_buff' and
823         'tree_pos'. It's keys are implemented by pointers into 'tree_buff'.
824         This is accomplished by '-1' as the element size.
825       */
826       init_tree(&count[i].int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
827 		NULL);
828       if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
829 	count[i].tree_pos=count[i].tree_buff =
830 	  my_malloc(count[i].field_length > 1 ? tree_buff_length : 2,
831 		    MYF(MY_WME));
832     }
833   }
834   return count;
835 }
836 
837 
838 	/* Free memory used by counts and trees */
839 
free_counts_and_tree_and_queue(HUFF_TREE * huff_trees,uint trees,HUFF_COUNTS * huff_counts,uint fields)840 static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
841 					   HUFF_COUNTS *huff_counts,
842 					   uint fields)
843 {
844   register uint i;
845 
846   if (huff_trees)
847   {
848     for (i=0 ; i < trees ; i++)
849     {
850       if (huff_trees[i].element_buffer)
851 	my_free(huff_trees[i].element_buffer);
852       if (huff_trees[i].code)
853 	my_free(huff_trees[i].code);
854     }
855     my_free(huff_trees);
856   }
857   if (huff_counts)
858   {
859     for (i=0 ; i < fields ; i++)
860     {
861       if (huff_counts[i].tree_buff)
862       {
863 	my_free(huff_counts[i].tree_buff);
864 	delete_tree(&huff_counts[i].int_tree);
865       }
866     }
867     my_free(huff_counts);
868   }
869   delete_queue(&queue);		/* This is safe to free */
870   return;
871 }
872 
873 	/* Read through old file and gather some statistics */
874 
get_statistic(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)875 static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
876 {
877   int error;
878   uint length;
879   ulong reclength,max_blob_length;
880   uchar *record,*pos,*next_pos,*end_pos,*start_pos;
881   ha_rows record_count;
882   my_bool static_row_size;
883   HUFF_COUNTS *count,*end_count;
884   TREE_ELEMENT *element;
885   DBUG_ENTER("get_statistic");
886 
887   reclength=mrg->file[0]->s->base.reclength;
888   record=(uchar*) my_alloca(reclength);
889   end_count=huff_counts+mrg->file[0]->s->base.fields;
890   record_count=0; glob_crc=0;
891   max_blob_length=0;
892 
893   /* Check how to calculate checksum */
894   static_row_size=1;
895   for (count=huff_counts ; count < end_count ; count++)
896   {
897     if (count->field_type == FIELD_BLOB ||
898         count->field_type == FIELD_VARCHAR)
899     {
900       static_row_size=0;
901       break;
902     }
903   }
904 
905   mrg_reset(mrg);
906   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
907   {
908     ulong tot_blob_length=0;
909     if (! error)
910     {
911       /* glob_crc is a checksum over all bytes of all records. */
912       if (static_row_size)
913 	glob_crc+=mi_static_checksum(mrg->file[0],record);
914       else
915 	glob_crc+=mi_checksum(mrg->file[0],record);
916 
917       /* Count the incidence of values separately for every column. */
918       for (pos=record,count=huff_counts ;
919 	   count < end_count ;
920 	   count++,
921 	   pos=next_pos)
922       {
923 	next_pos=end_pos=(start_pos=pos)+count->field_length;
924 
925 	/*
926           Put the whole column value in a tree if there is room for it.
927           'int_tree' is used to quickly check for duplicate values.
928           'tree_buff' collects as many distinct column values as
929           possible. If the field length is > 1, it is tree_buff_length,
930           else 2 bytes. Each value is 'field_length' bytes big. If there
931           are more distinct column values than fit into the buffer, we
932           give up with this tree. BLOBs and VARCHARs do not have a
933           tree_buff as it can only be used with fixed length columns.
934           For the special case of field length == 1, we handle only the
935           case that there is only one distinct value in the table(s).
936           Otherwise, we can have a maximum of 256 distinct values. This
937           is then handled by the normal Huffman tree build.
938 
939           Another limit for collecting distinct column values is the
940           number of values itself. Since we would need to build a
941           Huffman tree for the values, we are limited by the 'IS_OFFSET'
942           constant. This constant expresses a bit which is used to
943           determine if a tree element holds a final value or an offset
944           to a child element. Hence, all values and offsets need to be
945           smaller than 'IS_OFFSET'. A tree element is implemented with
946           two integer values, one for the left branch and one for the
947           right branch. For the extreme case that the first element
948           points to the last element, the number of integers in the tree
949           must be less or equal to IS_OFFSET. So the number of elements
950           must be less or equal to IS_OFFSET / 2.
951 
952           WARNING: At first, we insert a pointer into the record buffer
953           as the key for the tree. If we got a new distinct value, which
954           is really inserted into the tree, instead of being counted
955           only, we will copy the column value from the record buffer to
956           'tree_buff' and adjust the key pointer of the tree accordingly.
957         */
958 	if (count->tree_buff)
959 	{
960 	  global_count=count;
961 	  if (!(element=tree_insert(&count->int_tree,pos, 0,
962 				    count->int_tree.custom_arg)) ||
963 	      (element->count == 1 &&
964 	       (count->tree_buff + tree_buff_length <
965                 count->tree_pos + count->field_length)) ||
966               (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
967 	      (count->field_length == 1 &&
968 	       count->int_tree.elements_in_tree > 1))
969 	  {
970 	    delete_tree(&count->int_tree);
971 	    my_free(count->tree_buff);
972 	    count->tree_buff=0;
973 	  }
974 	  else
975 	  {
976             /*
977               If tree_insert() succeeds, it either creates a new element
978               or increments the counter of an existing element.
979             */
980 	    if (element->count == 1)
981 	    {
982               /* Copy the new column value into 'tree_buff'. */
983 	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
984               /* Adjust the key pointer in the tree. */
985 	      tree_set_pointer(element,count->tree_pos);
986               /* Point behind the last column value so far. */
987 	      count->tree_pos+=count->field_length;
988 	    }
989 	  }
990 	}
991 
992 	/* Save character counters and space-counts and zero-field-counts */
993 	if (count->field_type == FIELD_NORMAL ||
994 	    count->field_type == FIELD_SKIP_ENDSPACE)
995 	{
996           /* Ignore trailing space. */
997 	  for ( ; end_pos > pos ; end_pos--)
998 	    if (end_pos[-1] != ' ')
999 	      break;
1000           /* Empty fields are just counted. Go to the next record. */
1001 	  if (end_pos == pos)
1002 	  {
1003 	    count->empty_fields++;
1004 	    count->max_zero_fill=0;
1005 	    continue;
1006 	  }
1007           /*
1008             Count the total of all trailing spaces and the number of
1009             short trailing spaces. Remember the longest trailing space.
1010           */
1011 	  length= (uint) (next_pos-end_pos);
1012 	  count->tot_end_space+=length;
1013 	  if (length < 8)
1014 	    count->end_space[length]++;
1015 	  if (count->max_end_space < length)
1016 	    count->max_end_space = length;
1017 	}
1018 
1019 	if (count->field_type == FIELD_NORMAL ||
1020 	    count->field_type == FIELD_SKIP_PRESPACE)
1021 	{
1022           /* Ignore leading space. */
1023 	  for (pos=start_pos; pos < end_pos ; pos++)
1024 	    if (pos[0] != ' ')
1025 	      break;
1026           /* Empty fields are just counted. Go to the next record. */
1027 	  if (end_pos == pos)
1028 	  {
1029 	    count->empty_fields++;
1030 	    count->max_zero_fill=0;
1031 	    continue;
1032 	  }
1033           /*
1034             Count the total of all leading spaces and the number of
1035             short leading spaces. Remember the longest leading space.
1036           */
1037 	  length= (uint) (pos-start_pos);
1038 	  count->tot_pre_space+=length;
1039 	  if (length < 8)
1040 	    count->pre_space[length]++;
1041 	  if (count->max_pre_space < length)
1042 	    count->max_pre_space = length;
1043 	}
1044 
1045         /* Calculate pos, end_pos, and max_length for variable length fields. */
1046 	if (count->field_type == FIELD_BLOB)
1047 	{
1048 	  uint field_length=count->field_length -portable_sizeof_char_ptr;
1049 	  ulong blob_length= _mi_calc_blob_length(field_length, start_pos);
1050 	  memcpy(&pos, start_pos+field_length, sizeof(char*));
1051 	  end_pos=pos+blob_length;
1052 	  tot_blob_length+=blob_length;
1053 	  set_if_bigger(count->max_length,blob_length);
1054 	}
1055 	else if (count->field_type == FIELD_VARCHAR)
1056 	{
1057           uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
1058 	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
1059                    uint2korr(start_pos));
1060 	  pos= start_pos+pack_length;
1061 	  end_pos= pos+length;
1062 	  set_if_bigger(count->max_length,length);
1063 	}
1064 
1065         /* Evaluate 'max_zero_fill' for short fields. */
1066 	if (count->field_length <= 8 &&
1067 	    (count->field_type == FIELD_NORMAL ||
1068 	     count->field_type == FIELD_SKIP_ZERO))
1069 	{
1070 	  uint i;
1071           /* Zero fields are just counted. Go to the next record. */
1072 	  if (!memcmp((uchar*) start_pos,zero_string,count->field_length))
1073 	  {
1074 	    count->zero_fields++;
1075 	    continue;
1076 	  }
1077           /*
1078             max_zero_fill starts with field_length. It is decreased every
1079             time a shorter "zero trailer" is found. It is set to zero when
1080             an empty field is found (see above). This suggests that the
1081             variable should be called 'min_zero_fill'.
1082           */
1083 	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
1084 	       i++) ;
1085 	  if (i < count->max_zero_fill)
1086 	    count->max_zero_fill=i;
1087 	}
1088 
1089         /* Ignore zero fields and check fields. */
1090 	if (count->field_type == FIELD_ZERO ||
1091 	    count->field_type == FIELD_CHECK)
1092 	  continue;
1093 
1094         /*
1095           Count the incidence of every byte value in the
1096           significant field value.
1097         */
1098 	for ( ; pos < end_pos ; pos++)
1099 	  count->counts[(uchar) *pos]++;
1100 
1101         /* Step to next field. */
1102       }
1103 
1104       if (tot_blob_length > max_blob_length)
1105 	max_blob_length=tot_blob_length;
1106       record_count++;
1107       if (write_loop && record_count % WRITE_COUNT == 0)
1108       {
1109 	printf("%lu\r", (ulong) record_count);
1110         (void) fflush(stdout);
1111       }
1112     }
1113     else if (error != HA_ERR_RECORD_DELETED)
1114     {
1115       (void) fprintf(stderr, "Got error %d while reading rows", error);
1116       break;
1117     }
1118 
1119     /* Step to next record. */
1120   }
1121   if (write_loop)
1122   {
1123     printf("            \r");
1124     (void) fflush(stdout);
1125   }
1126 
1127   /*
1128     If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
1129     codes.
1130   */
1131   DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
1132 
1133   DBUG_PRINT("info", ("Found the following number of incidents "
1134                       "of the byte codes:"));
1135   if (verbose >= 2)
1136     printf("Found the following number of incidents "
1137                 "of the byte codes:\n");
1138   for (count= huff_counts ; count < end_count; count++)
1139   {
1140     uint      idx;
1141     my_off_t  total_count;
1142     char      llbuf[32];
1143 
1144     DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
1145     if (verbose >= 2)
1146       printf("column: %3u\n", (uint) (count - huff_counts + 1));
1147     if (count->tree_buff)
1148     {
1149       DBUG_PRINT("info", ("number of distinct values: %u",
1150                           (uint) ((count->tree_pos - count->tree_buff) /
1151                                   count->field_length)));
1152       if (verbose >= 2)
1153         printf("number of distinct values: %u\n",
1154                     (uint) ((count->tree_pos - count->tree_buff) /
1155                             count->field_length));
1156     }
1157     total_count= 0;
1158     for (idx= 0; idx < 256; idx++)
1159     {
1160       if (count->counts[idx])
1161       {
1162         total_count+= count->counts[idx];
1163         DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
1164                             llstr((longlong) count->counts[idx], llbuf)));
1165         if (verbose >= 2)
1166           printf("counts[0x%02x]: %12s\n", idx,
1167                       llstr((longlong) count->counts[idx], llbuf));
1168       }
1169     }
1170     DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
1171                                                     llbuf)));
1172     if ((verbose >= 2) && total_count)
1173     {
1174       printf("total:        %12s\n",
1175                   llstr((longlong) total_count, llbuf));
1176     }
1177   }
1178 
1179   mrg->records=record_count;
1180   mrg->max_blob_length=max_blob_length;
1181   my_afree((uchar*) record);
1182   DBUG_RETURN(error != HA_ERR_END_OF_FILE);
1183 }
1184 
compare_huff_elements(void * not_used MY_ATTRIBUTE ((unused)),uchar * a,uchar * b)1185 static int compare_huff_elements(void *not_used MY_ATTRIBUTE((unused)),
1186 				 uchar *a, uchar *b)
1187 {
1188   return *((my_off_t*) a) < *((my_off_t*) b) ? -1 :
1189     (*((my_off_t*) a) == *((my_off_t*) b)  ? 0 : 1);
1190 }
1191 
1192 	/* Check each tree if we should use pre-space-compress, end-space-
1193 	   compress, empty-field-compress or zero-field-compress */
1194 
check_counts(HUFF_COUNTS * huff_counts,uint trees,my_off_t records)1195 static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
1196 			 my_off_t records)
1197 {
1198   uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
1199   my_off_t old_length,new_length,length;
1200   DBUG_ENTER("check_counts");
1201 
1202   memset(field_count, 0, sizeof(field_count));
1203   space_fields=fill_zero_fields=0;
1204 
1205   for (; trees-- ; huff_counts++)
1206   {
1207     if (huff_counts->field_type == FIELD_BLOB)
1208     {
1209       huff_counts->length_bits=max_bit(huff_counts->max_length);
1210       goto found_pack;
1211     }
1212     else if (huff_counts->field_type == FIELD_VARCHAR)
1213     {
1214       huff_counts->length_bits=max_bit(huff_counts->max_length);
1215       goto found_pack;
1216     }
1217     else if (huff_counts->field_type == FIELD_CHECK)
1218     {
1219       huff_counts->bytes_packed=0;
1220       huff_counts->counts[0]=0;
1221       goto found_pack;
1222     }
1223 
1224     huff_counts->field_type=FIELD_NORMAL;
1225     huff_counts->pack_type=0;
1226 
1227     /* Check for zero-filled records (in this column), or zero records. */
1228     if (huff_counts->zero_fields || ! records)
1229     {
1230       my_off_t old_space_count;
1231       /*
1232         If there are only zero filled records (in this column),
1233         or no records at all, we are done.
1234       */
1235       if (huff_counts->zero_fields == records)
1236       {
1237 	huff_counts->field_type= FIELD_ZERO;
1238 	huff_counts->bytes_packed=0;
1239 	huff_counts->counts[0]=0;
1240 	goto found_pack;
1241       }
1242       /* Remeber the number of significant spaces. */
1243       old_space_count=huff_counts->counts[' '];
1244       /* Add all leading and trailing spaces. */
1245       huff_counts->counts[' ']+= (huff_counts->tot_end_space +
1246                                   huff_counts->tot_pre_space +
1247                                   huff_counts->empty_fields *
1248                                   huff_counts->field_length);
1249       /* Check, what the compressed length of this would be. */
1250       old_length=calc_packed_length(huff_counts,0)+records/8;
1251       /* Get the number of zero bytes. */
1252       length=huff_counts->zero_fields*huff_counts->field_length;
1253       /* Add it to the counts. */
1254       huff_counts->counts[0]+=length;
1255       /* Check, what the compressed length of this would be. */
1256       new_length=calc_packed_length(huff_counts,0);
1257       /* If the compression without the zeroes would be shorter, we are done. */
1258       if (old_length < new_length && huff_counts->field_length > 1)
1259       {
1260 	huff_counts->field_type=FIELD_SKIP_ZERO;
1261 	huff_counts->counts[0]-=length;
1262 	huff_counts->bytes_packed=old_length- records/8;
1263 	goto found_pack;
1264       }
1265       /* Remove the insignificant spaces, but keep the zeroes. */
1266       huff_counts->counts[' ']=old_space_count;
1267     }
1268     /* Check, what the compressed length of this column would be. */
1269     huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1270 
1271     /*
1272       If there are enough empty records (in this column),
1273       treating them specially may pay off.
1274     */
1275     if (huff_counts->empty_fields)
1276     {
1277       if (huff_counts->field_length > 2 &&
1278 	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
1279 	  (1+max_bit(MY_MAX(huff_counts->max_pre_space,
1280                             huff_counts->max_end_space))) <
1281 	  records * max_bit(huff_counts->field_length))
1282       {
1283 	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
1284       }
1285       else
1286       {
1287 	length=huff_counts->empty_fields*huff_counts->field_length;
1288 	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
1289 	{
1290 	  huff_counts->tot_end_space+=length;
1291 	  huff_counts->max_end_space=huff_counts->field_length;
1292 	  if (huff_counts->field_length < 8)
1293 	    huff_counts->end_space[huff_counts->field_length]+=
1294 	      huff_counts->empty_fields;
1295 	}
1296 	if (huff_counts->tot_pre_space)
1297 	{
1298 	  huff_counts->tot_pre_space+=length;
1299 	  huff_counts->max_pre_space=huff_counts->field_length;
1300 	  if (huff_counts->field_length < 8)
1301 	    huff_counts->pre_space[huff_counts->field_length]+=
1302 	      huff_counts->empty_fields;
1303 	}
1304       }
1305     }
1306 
1307     /*
1308       If there are enough trailing spaces (in this column),
1309       treating them specially may pay off.
1310     */
1311     if (huff_counts->tot_end_space)
1312     {
1313       huff_counts->counts[' ']+=huff_counts->tot_pre_space;
1314       if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
1315 			      huff_counts->end_space,
1316 			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
1317 	goto found_pack;
1318       huff_counts->counts[' ']-=huff_counts->tot_pre_space;
1319     }
1320 
1321     /*
1322       If there are enough leading spaces (in this column),
1323       treating them specially may pay off.
1324     */
1325     if (huff_counts->tot_pre_space)
1326     {
1327       if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
1328 			      huff_counts->pre_space,
1329 			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
1330 	goto found_pack;
1331     }
1332 
1333   found_pack:			/* Found field-packing */
1334 
1335     /* Test if we can use zero-fill */
1336 
1337     if (huff_counts->max_zero_fill &&
1338 	(huff_counts->field_type == FIELD_NORMAL ||
1339 	 huff_counts->field_type == FIELD_SKIP_ZERO))
1340     {
1341       huff_counts->counts[0]-=huff_counts->max_zero_fill*
1342 	(huff_counts->field_type == FIELD_SKIP_ZERO ?
1343 	 records - huff_counts->zero_fields : records);
1344       huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
1345       huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
1346     }
1347 
1348     /* Test if intervall-field is better */
1349 
1350     if (huff_counts->tree_buff)
1351     {
1352       HUFF_TREE tree;
1353 
1354       DBUG_EXECUTE_IF("forceintervall",
1355                       huff_counts->bytes_packed= ~ (my_off_t) 0;);
1356       tree.element_buffer=0;
1357       if (!make_huff_tree(&tree,huff_counts) &&
1358 	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
1359       {
1360 	if (tree.elements == 1)
1361 	  huff_counts->field_type=FIELD_CONSTANT;
1362 	else
1363 	  huff_counts->field_type=FIELD_INTERVALL;
1364 	huff_counts->pack_type=0;
1365       }
1366       else
1367       {
1368 	my_free(huff_counts->tree_buff);
1369 	delete_tree(&huff_counts->int_tree);
1370 	huff_counts->tree_buff=0;
1371       }
1372       if (tree.element_buffer)
1373 	my_free(tree.element_buffer);
1374     }
1375     if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
1376       space_fields++;
1377     if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
1378       fill_zero_fields++;
1379     field_count[huff_counts->field_type]++;
1380   }
1381   DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
1382                       "empty-zero:       %3d  empty-fill: %3d",
1383                       field_count[FIELD_NORMAL],space_fields,
1384                       field_count[FIELD_SKIP_ZERO],fill_zero_fields));
1385   DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
1386                       "intervall-fields: %3d  zero:       %3d",
1387                       field_count[FIELD_SKIP_PRESPACE],
1388                       field_count[FIELD_SKIP_ENDSPACE],
1389                       field_count[FIELD_INTERVALL],
1390                       field_count[FIELD_ZERO]));
1391   if (verbose)
1392     printf("\nnormal:    %3d  empty-space:     %3d  "
1393                 "empty-zero:       %3d  empty-fill: %3d\n"
1394                 "pre-space: %3d  end-space:       %3d  "
1395                 "intervall-fields: %3d  zero:       %3d\n",
1396                 field_count[FIELD_NORMAL],space_fields,
1397                 field_count[FIELD_SKIP_ZERO],fill_zero_fields,
1398                 field_count[FIELD_SKIP_PRESPACE],
1399                 field_count[FIELD_SKIP_ENDSPACE],
1400                 field_count[FIELD_INTERVALL],
1401                 field_count[FIELD_ZERO]);
1402   DBUG_VOID_RETURN;
1403 }
1404 
1405 	/* Test if we can use space-compression and empty-field-compression */
1406 
1407 static int
test_space_compress(HUFF_COUNTS * huff_counts,my_off_t records,uint max_space_length,my_off_t * space_counts,my_off_t tot_space_count,enum en_fieldtype field_type)1408 test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
1409 		    uint max_space_length, my_off_t *space_counts,
1410 		    my_off_t tot_space_count, enum en_fieldtype field_type)
1411 {
1412   int min_pos;
1413   uint length_bits,i;
1414   my_off_t space_count,min_space_count,min_pack,new_length,skip;
1415 
1416   length_bits=max_bit(max_space_length);
1417 
1418 		/* Default no end_space-packing */
1419   space_count=huff_counts->counts[(uint) ' '];
1420   min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
1421   min_pack=calc_packed_length(huff_counts,0);
1422   min_pos= -2;
1423   huff_counts->counts[(uint) ' ']=space_count;
1424 
1425 	/* Test with allways space-count */
1426   new_length=huff_counts->bytes_packed+length_bits*records/8;
1427   if (new_length+1 < min_pack)
1428   {
1429     min_pos= -1;
1430     min_pack=new_length;
1431     min_space_count=space_count;
1432   }
1433 	/* Test with length-flag */
1434   for (skip=0L, i=0 ; i < 8 ; i++)
1435   {
1436     if (space_counts[i])
1437     {
1438       if (i)
1439 	huff_counts->counts[(uint) ' ']+=space_counts[i];
1440       skip+=huff_counts->pre_space[i];
1441       new_length=calc_packed_length(huff_counts,0)+
1442 	(records+(records-skip)*(1+length_bits))/8;
1443       if (new_length < min_pack)
1444       {
1445 	min_pos=(int) i;
1446 	min_pack=new_length;
1447 	min_space_count=huff_counts->counts[(uint) ' '];
1448       }
1449     }
1450   }
1451 
1452   huff_counts->counts[(uint) ' ']=min_space_count;
1453   huff_counts->bytes_packed=min_pack;
1454   switch (min_pos) {
1455   case -2:
1456     return(0);				/* No space-compress */
1457   case -1:				/* Always space-count */
1458     huff_counts->field_type=field_type;
1459     huff_counts->min_space=0;
1460     huff_counts->length_bits=max_bit(max_space_length);
1461     break;
1462   default:
1463     huff_counts->field_type=field_type;
1464     huff_counts->min_space=(uint) min_pos;
1465     huff_counts->pack_type|=PACK_TYPE_SELECTED;
1466     huff_counts->length_bits=max_bit(max_space_length);
1467     break;
1468   }
1469   return(1);				/* Using space-compress */
1470 }
1471 
1472 
1473 	/* Make a huff_tree of each huff_count */
1474 
make_huff_trees(HUFF_COUNTS * huff_counts,uint trees)1475 static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
1476 {
1477   uint tree;
1478   HUFF_TREE *huff_tree;
1479   DBUG_ENTER("make_huff_trees");
1480 
1481   if (!(huff_tree=(HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
1482 					 MYF(MY_WME | MY_ZEROFILL))))
1483     DBUG_RETURN(0);
1484 
1485   for (tree=0 ; tree < trees ; tree++)
1486   {
1487     if (make_huff_tree(huff_tree+tree,huff_counts+tree))
1488     {
1489       while (tree--)
1490 	my_free(huff_tree[tree].element_buffer);
1491       my_free(huff_tree);
1492       DBUG_RETURN(0);
1493     }
1494   }
1495   DBUG_RETURN(huff_tree);
1496 }
1497 
1498 /*
1499   Build a Huffman tree.
1500 
1501   SYNOPSIS
1502     make_huff_tree()
1503     huff_tree                   The Huffman tree.
1504     huff_counts                 The counts.
1505 
1506   DESCRIPTION
1507     Build a Huffman tree according to huff_counts->counts or
1508     huff_counts->tree_buff. tree_buff, if non-NULL contains up to
1509     tree_buff_length of distinct column values. In that case, whole
1510     values can be Huffman encoded instead of single bytes.
1511 
1512   RETURN
1513     0           OK
1514     != 0        Error
1515 */
1516 
make_huff_tree(HUFF_TREE * huff_tree,HUFF_COUNTS * huff_counts)1517 static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
1518 {
1519   uint i,found,bits_packed,first,last;
1520   my_off_t bytes_packed;
1521   HUFF_ELEMENT *a,*b,*new_huff_el;
1522 
1523   first=last=0;
1524   if (huff_counts->tree_buff)
1525   {
1526     /* Calculate the number of distinct values in tree_buff. */
1527     found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
1528       huff_counts->field_length;
1529     first=0; last=found-1;
1530   }
1531   else
1532   {
1533     /* Count the number of byte codes found in the column. */
1534     for (i=found=0 ; i < 256 ; i++)
1535     {
1536       if (huff_counts->counts[i])
1537       {
1538 	if (! found++)
1539 	  first=i;
1540 	last=i;
1541       }
1542     }
1543     if (found < 2)
1544       found=2;
1545   }
1546 
1547   /* When using 'tree_buff' we can have more that 256 values. */
1548   if (queue.max_elements < found)
1549   {
1550     delete_queue(&queue);
1551     if (init_queue(&queue,found,0,0,compare_huff_elements,0))
1552       return -1;
1553   }
1554 
1555   /* Allocate or reallocate an element buffer for the Huffman tree. */
1556   if (!huff_tree->element_buffer)
1557   {
1558     if (!(huff_tree->element_buffer=
1559 	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
1560       return 1;
1561   }
1562   else
1563   {
1564     HUFF_ELEMENT *temp;
1565     if (!(temp=
1566 	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
1567 				     found*2*sizeof(HUFF_ELEMENT),
1568 				     MYF(MY_WME))))
1569       return 1;
1570     huff_tree->element_buffer=temp;
1571   }
1572 
1573   huff_counts->tree=huff_tree;
1574   huff_tree->counts=huff_counts;
1575   huff_tree->min_chr=first;
1576   huff_tree->max_chr=last;
1577   huff_tree->char_bits=max_bit(last-first);
1578   huff_tree->offset_bits=max_bit(found-1)+1;
1579 
1580   if (huff_counts->tree_buff)
1581   {
1582     huff_tree->elements=0;
1583     huff_tree->tree_pack_length=(1+15+16+5+5+
1584 				 (huff_tree->char_bits+1)*found+
1585 				 (huff_tree->offset_bits+1)*
1586 				 (found-2)+7)/8 +
1587 				   (uint) (huff_tree->counts->tree_pos-
1588 					   huff_tree->counts->tree_buff);
1589     /*
1590       Put a HUFF_ELEMENT into the queue for every distinct column value.
1591 
1592       tree_walk() calls save_counts_in_queue() for every element in
1593       'int_tree'. This takes elements from the target trees element
1594       buffer and places references to them into the buffer of the
1595       priority queue. We insert in column value order, but the order is
1596       in fact irrelevant here. We will establish the correct order
1597       later.
1598     */
1599     tree_walk(&huff_counts->int_tree,
1600 	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
1601 	      (uchar*) huff_tree, left_root_right);
1602   }
1603   else
1604   {
1605     huff_tree->elements=found;
1606     huff_tree->tree_pack_length=(9+9+5+5+
1607 				 (huff_tree->char_bits+1)*found+
1608 				 (huff_tree->offset_bits+1)*
1609 				 (found-2)+7)/8;
1610     /*
1611       Put a HUFF_ELEMENT into the queue for every byte code found in the column.
1612 
1613       The elements are taken from the target trees element buffer.
1614       Instead of using queue_insert(), we just place references to the
1615       elements into the buffer of the priority queue. We insert in byte
1616       value order, but the order is in fact irrelevant here. We will
1617       establish the correct order later.
1618     */
1619     for (i=first, found=0 ; i <= last ; i++)
1620     {
1621       if (huff_counts->counts[i])
1622       {
1623 	new_huff_el=huff_tree->element_buffer+(found++);
1624 	new_huff_el->count=huff_counts->counts[i];
1625 	new_huff_el->a.leaf.null=0;
1626 	new_huff_el->a.leaf.element_nr=i;
1627 	queue.root[found]=(uchar*) new_huff_el;
1628       }
1629     }
1630     /*
1631       If there is only a single byte value in this field in all records,
1632       add a second element with zero incidence. This is required to enter
1633       the loop, which builds the Huffman tree.
1634     */
1635     while (found < 2)
1636     {
1637       new_huff_el=huff_tree->element_buffer+(found++);
1638       new_huff_el->count=0;
1639       new_huff_el->a.leaf.null=0;
1640       if (last)
1641 	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
1642       else
1643 	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
1644       queue.root[found]=(uchar*) new_huff_el;
1645     }
1646   }
1647 
1648   /* Make a queue from the queue buffer. */
1649   queue.elements=found;
1650 
1651   /*
1652     Make a priority queue from the queue. Construct its index so that we
1653     have a partially ordered tree.
1654   */
1655   for (i=found/2 ; i > 0 ; i--)
1656     _downheap(&queue,i);
1657 
1658   /* The Huffman algorithm. */
1659   bytes_packed=0; bits_packed=0;
1660   for (i=1 ; i < found ; i++)
1661   {
1662     /*
1663       Pop the top element from the queue (the one with the least incidence).
1664       Popping from a priority queue includes a re-ordering of the queue,
1665       to get the next least incidence element to the top.
1666     */
1667     a=(HUFF_ELEMENT*) queue_remove(&queue,0);
1668     /*
1669       Copy the next least incidence element. The queue implementation
1670       reserves root[0] for temporary purposes. root[1] is the top.
1671     */
1672     b=(HUFF_ELEMENT*) queue.root[1];
1673     /* Get a new element from the element buffer. */
1674     new_huff_el=huff_tree->element_buffer+found+i;
1675     /* The new element gets the sum of the two least incidence elements. */
1676     new_huff_el->count=a->count+b->count;
1677     /*
1678       The Huffman algorithm assigns another bit to the code for a byte
1679       every time that bytes incidence is combined (directly or indirectly)
1680       to a new element as one of the two least incidence elements.
1681       This means that one more bit per incidence of that byte is required
1682       in the resulting file. So we add the new combined incidence as the
1683       number of bits by which the result grows.
1684     */
1685     bits_packed+=(uint) (new_huff_el->count & 7);
1686     bytes_packed+=new_huff_el->count/8;
1687     /* The new element points to its children, lesser in left.  */
1688     new_huff_el->a.nod.left=a;
1689     new_huff_el->a.nod.right=b;
1690     /*
1691       Replace the copied top element by the new element and re-order the
1692       queue.
1693     */
1694     queue.root[1]=(uchar*) new_huff_el;
1695     queue_replaced(&queue);
1696   }
1697   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
1698   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
1699   return 0;
1700 }
1701 
compare_tree(void * cmp_arg MY_ATTRIBUTE ((unused)),register const uchar * s,register const uchar * t)1702 static int compare_tree(void* cmp_arg MY_ATTRIBUTE((unused)),
1703 			register const uchar *s, register const uchar *t)
1704 {
1705   uint length;
1706   for (length=global_count->field_length; length-- ;)
1707     if (*s++ != *t++)
1708       return (int) s[-1] - (int) t[-1];
1709   return 0;
1710 }
1711 
1712 /*
1713   Organize distinct column values and their incidences into a priority queue.
1714 
1715   SYNOPSIS
1716     save_counts_in_queue()
1717     key                         The column value.
1718     count                       The incidence of this value.
1719     tree                        The Huffman tree to be built later.
1720 
1721   DESCRIPTION
1722     We use the element buffer of the targeted tree. The distinct column
1723     values are organized in a priority queue first. The Huffman
1724     algorithm will later organize the elements into a Huffman tree. For
1725     the time being, we just place references to the elements into the
1726     queue buffer. The buffer will later be organized into a priority
1727     queue.
1728 
1729   RETURN
1730     0
1731  */
1732 
save_counts_in_queue(uchar * key,element_count count,HUFF_TREE * tree)1733 static int save_counts_in_queue(uchar *key, element_count count,
1734 				HUFF_TREE *tree)
1735 {
1736   HUFF_ELEMENT *new_huff_el;
1737 
1738   new_huff_el=tree->element_buffer+(tree->elements++);
1739   new_huff_el->count=count;
1740   new_huff_el->a.leaf.null=0;
1741   new_huff_el->a.leaf.element_nr= (uint) (key- tree->counts->tree_buff) /
1742     tree->counts->field_length;
1743   queue.root[tree->elements]=(uchar*) new_huff_el;
1744   return 0;
1745 }
1746 
1747 
1748 /*
1749   Calculate length of file if given counts should be used.
1750 
1751   SYNOPSIS
1752     calc_packed_length()
1753     huff_counts                 The counts for a column of the table(s).
1754     add_tree_lenght             If the decode tree length should be added.
1755 
1756   DESCRIPTION
1757     We need to follow the Huffman algorithm until we know, how many bits
1758     are required for each byte code. But we do not need the resulting
1759     Huffman tree. Hence, we can leave out some steps which are essential
1760     in make_huff_tree().
1761 
1762   RETURN
1763     Number of bytes required to compress this table column.
1764 */
1765 
calc_packed_length(HUFF_COUNTS * huff_counts,uint add_tree_lenght)1766 static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
1767 				   uint add_tree_lenght)
1768 {
1769   uint i,found,bits_packed,first,last;
1770   my_off_t bytes_packed;
1771   HUFF_ELEMENT element_buffer[256];
1772   DBUG_ENTER("calc_packed_length");
1773 
1774   /*
1775     WARNING: We use a small hack for efficiency: Instead of placing
1776     references to HUFF_ELEMENTs into the queue, we just insert
1777     references to the counts of the byte codes which appeared in this
1778     table column. During the Huffman algorithm they are successively
1779     replaced by references to HUFF_ELEMENTs. This works, because
1780     HUFF_ELEMENTs have the incidence count at their beginning.
1781     Regardless, wether the queue array contains references to counts of
1782     type my_off_t or references to HUFF_ELEMENTs which have the count of
1783     type my_off_t at their beginning, it always points to a count of the
1784     same type.
1785 
1786     Instead of using queue_insert(), we just copy the references into
1787     the buffer of the priority queue. We insert in byte value order, but
1788     the order is in fact irrelevant here. We will establish the correct
1789     order later.
1790   */
1791   first=last=0;
1792   for (i=found=0 ; i < 256 ; i++)
1793   {
1794     if (huff_counts->counts[i])
1795     {
1796       if (! found++)
1797 	first=i;
1798       last=i;
1799       /* We start with root[1], which is the queues top element. */
1800       queue.root[found]=(uchar*) &huff_counts->counts[i];
1801     }
1802   }
1803   if (!found)
1804     DBUG_RETURN(0);			/* Empty tree */
1805   /*
1806     If there is only a single byte value in this field in all records,
1807     add a second element with zero incidence. This is required to enter
1808     the loop, which follows the Huffman algorithm.
1809   */
1810   if (found < 2)
1811     queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
1812 
1813   /* Make a queue from the queue buffer. */
1814   queue.elements=found;
1815 
1816   bytes_packed=0; bits_packed=0;
1817   /* Add the length of the coding table, which would become part of the file. */
1818   if (add_tree_lenght)
1819     bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
1820 		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
1821 
1822   /*
1823     Make a priority queue from the queue. Construct its index so that we
1824     have a partially ordered tree.
1825   */
1826   for (i=(found+1)/2 ; i > 0 ; i--)
1827     _downheap(&queue,i);
1828 
1829   /* The Huffman algorithm. */
1830   for (i=0 ; i < found-1 ; i++)
1831   {
1832     my_off_t        *a;
1833     my_off_t        *b;
1834     HUFF_ELEMENT    *new_huff_el;
1835 
1836     /*
1837       Pop the top element from the queue (the one with the least
1838       incidence). Popping from a priority queue includes a re-ordering
1839       of the queue, to get the next least incidence element to the top.
1840     */
1841     a= (my_off_t*) queue_remove(&queue, 0);
1842     /*
1843       Copy the next least incidence element. The queue implementation
1844       reserves root[0] for temporary purposes. root[1] is the top.
1845     */
1846     b= (my_off_t*) queue.root[1];
1847     /* Create a new element in a local (automatic) buffer. */
1848     new_huff_el= element_buffer + i;
1849     /* The new element gets the sum of the two least incidence elements. */
1850     new_huff_el->count= *a + *b;
1851     /*
1852       The Huffman algorithm assigns another bit to the code for a byte
1853       every time that bytes incidence is combined (directly or indirectly)
1854       to a new element as one of the two least incidence elements.
1855       This means that one more bit per incidence of that byte is required
1856       in the resulting file. So we add the new combined incidence as the
1857       number of bits by which the result grows.
1858     */
1859     bits_packed+=(uint) (new_huff_el->count & 7);
1860     bytes_packed+=new_huff_el->count/8;
1861     /*
1862       Replace the copied top element by the new element and re-order the
1863       queue. This successively replaces the references to counts by
1864       references to HUFF_ELEMENTs.
1865     */
1866     queue.root[1]=(uchar*) new_huff_el;
1867     queue_replaced(&queue);
1868   }
1869   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
1870 }
1871 
1872 
1873 	/* Remove trees that don't give any compression */
1874 
join_same_trees(HUFF_COUNTS * huff_counts,uint trees)1875 static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
1876 {
1877   uint k,tree_number;
1878   HUFF_COUNTS count,*i,*j,*last_count;
1879 
1880   last_count=huff_counts+trees;
1881   for (tree_number=0, i=huff_counts ; i < last_count ; i++)
1882   {
1883     if (!i->tree->tree_number)
1884     {
1885       i->tree->tree_number= ++tree_number;
1886       if (i->tree_buff)
1887 	continue;			/* Don't join intervall */
1888       for (j=i+1 ; j < last_count ; j++)
1889       {
1890 	if (! j->tree->tree_number && ! j->tree_buff)
1891 	{
1892 	  for (k=0 ; k < 256 ; k++)
1893 	    count.counts[k]=i->counts[k]+j->counts[k];
1894 	  if (calc_packed_length(&count,1) <=
1895 	      i->tree->bytes_packed + j->tree->bytes_packed+
1896 	      i->tree->tree_pack_length+j->tree->tree_pack_length+
1897 	      ALLOWED_JOIN_DIFF)
1898 	  {
1899 	    memcpy(i->counts, count.counts,
1900 			 sizeof(count.counts[0])*256);
1901 	    my_free(j->tree->element_buffer);
1902 	    j->tree->element_buffer=0;
1903 	    j->tree=i->tree;
1904 	    bmove((uchar*) i->counts,(uchar*) count.counts,
1905 		  sizeof(count.counts[0])*256);
1906 	    if (make_huff_tree(i->tree,i))
1907 	      return (uint) -1;
1908 	  }
1909 	}
1910       }
1911     }
1912   }
1913   DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
1914                       trees, tree_number));
1915   if (verbose)
1916     printf("Original trees:  %d  After join: %d\n", trees, tree_number);
1917   return tree_number;			/* Return trees left */
1918 }
1919 
1920 
1921 /*
1922   Fill in huff_tree encode tables.
1923 
1924   SYNOPSIS
1925     make_huff_decode_table()
1926     huff_tree               An array of HUFF_TREE which are to be encoded.
1927     trees                   The number of HUFF_TREE in the array.
1928 
1929   RETURN
1930     0           success
1931     != 0        error
1932 */
1933 
make_huff_decode_table(HUFF_TREE * huff_tree,uint trees)1934 static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
1935 {
1936   uint elements;
1937   for ( ; trees-- ; huff_tree++)
1938   {
1939     if (huff_tree->tree_number > 0)
1940     {
1941       elements=huff_tree->counts->tree_buff ? huff_tree->elements : 256;
1942       if (!(huff_tree->code =
1943             (ulonglong*) my_malloc(elements*
1944                                    (sizeof(ulonglong) + sizeof(uchar)),
1945                                    MYF(MY_WME | MY_ZEROFILL))))
1946 	return 1;
1947       huff_tree->code_len=(uchar*) (huff_tree->code+elements);
1948       make_traverse_code_tree(huff_tree, huff_tree->root,
1949                               8 * sizeof(ulonglong), LL(0));
1950     }
1951   }
1952   return 0;
1953 }
1954 
1955 
make_traverse_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint size,ulonglong code)1956 static void make_traverse_code_tree(HUFF_TREE *huff_tree,
1957 				    HUFF_ELEMENT *element,
1958 				    uint size, ulonglong code)
1959 {
1960   uint chr;
1961   if (!element->a.leaf.null)
1962   {
1963     chr=element->a.leaf.element_nr;
1964     huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
1965     huff_tree->code[chr]= (code >> size);
1966     if (huff_tree->height < 8 * sizeof(ulonglong) - size)
1967         huff_tree->height= 8 * sizeof(ulonglong) - size;
1968   }
1969   else
1970   {
1971     size--;
1972     make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
1973     make_traverse_code_tree(huff_tree, element->a.nod.right, size,
1974 			    code + (((ulonglong) 1) << size));
1975   }
1976   return;
1977 }
1978 
1979 
1980 /*
1981   Convert a value into binary digits.
1982 
1983   SYNOPSIS
1984     bindigits()
1985     value                       The value.
1986     length                      The number of low order bits to convert.
1987 
1988   NOTE
1989     The result string is in static storage. It is reused on every call.
1990     So you cannot use it twice in one expression.
1991 
1992   RETURN
1993     A pointer to a static NUL-terminated string.
1994  */
1995 
bindigits(ulonglong value,uint bits)1996 static char *bindigits(ulonglong value, uint bits)
1997 {
1998   static char digits[72];
1999   char *ptr= digits;
2000   uint idx= bits;
2001 
2002   DBUG_ASSERT(idx < sizeof(digits));
2003   while (idx)
2004     *(ptr++)= '0' + ((char) (value >> (--idx)) & (char) 1);
2005   *ptr= '\0';
2006   return digits;
2007 }
2008 
2009 
2010 /*
2011   Convert a value into hexadecimal digits.
2012 
2013   SYNOPSIS
2014     hexdigits()
2015     value                       The value.
2016 
2017   NOTE
2018     The result string is in static storage. It is reused on every call.
2019     So you cannot use it twice in one expression.
2020 
2021   RETURN
2022     A pointer to a static NUL-terminated string.
2023  */
2024 
hexdigits(ulonglong value)2025 static char *hexdigits(ulonglong value)
2026 {
2027   static char digits[20];
2028   char *ptr= digits;
2029   uint idx= 2 * sizeof(value); /* Two hex digits per byte. */
2030 
2031   DBUG_ASSERT(idx < sizeof(digits));
2032   while (idx)
2033   {
2034     if ((*(ptr++)= '0' + ((char) (value >> (4 * (--idx))) & (char) 0xf)) > '9')
2035       *(ptr - 1)+= 'a' - '9' - 1;
2036   }
2037   *ptr= '\0';
2038   return digits;
2039 }
2040 
2041 
2042 	/* Write header to new packed data file */
2043 
write_header(PACK_MRG_INFO * mrg,uint head_length,uint trees,my_off_t tot_elements,my_off_t filelength)2044 static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
2045 			my_off_t tot_elements,my_off_t filelength)
2046 {
2047   uchar *buff= (uchar*) file_buffer.pos;
2048 
2049   memset(buff, 0, HEAD_LENGTH);
2050   memcpy(buff,myisam_pack_file_magic,4);
2051   int4store(buff+4,head_length);
2052   int4store(buff+8, mrg->min_pack_length);
2053   int4store(buff+12,mrg->max_pack_length);
2054   int4store(buff+16,tot_elements);
2055   int4store(buff+20,intervall_length);
2056   int2store(buff+24,trees);
2057   buff[26]=(char) mrg->ref_length;
2058 	/* Save record pointer length */
2059   buff[27]= (uchar) mi_get_pointer_length((ulonglong) filelength,2);
2060   if (test_only)
2061     return 0;
2062   my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0));
2063   return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
2064 		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
2065 }
2066 
2067 	/* Write fieldinfo to new packed file */
2068 
write_field_info(HUFF_COUNTS * counts,uint fields,uint trees)2069 static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
2070 {
2071   reg1 uint i;
2072   uint huff_tree_bits;
2073   huff_tree_bits=max_bit(trees ? trees-1 : 0);
2074 
2075   DBUG_PRINT("info", (" "));
2076   DBUG_PRINT("info", ("column types:"));
2077   DBUG_PRINT("info", ("FIELD_NORMAL          0"));
2078   DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
2079   DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
2080   DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
2081   DBUG_PRINT("info", ("FIELD_BLOB            4"));
2082   DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
2083   DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
2084   DBUG_PRINT("info", ("FIELD_ZERO            7"));
2085   DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
2086   DBUG_PRINT("info", ("FIELD_CHECK           9"));
2087   DBUG_PRINT("info", (" "));
2088   DBUG_PRINT("info", ("pack type as a set of flags:"));
2089   DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
2090   DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
2091   DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
2092   DBUG_PRINT("info", (" "));
2093   if (verbose >= 2)
2094   {
2095     printf("\n");
2096     printf("column types:\n");
2097     printf("FIELD_NORMAL          0\n");
2098     printf("FIELD_SKIP_ENDSPACE   1\n");
2099     printf("FIELD_SKIP_PRESPACE   2\n");
2100     printf("FIELD_SKIP_ZERO       3\n");
2101     printf("FIELD_BLOB            4\n");
2102     printf("FIELD_CONSTANT        5\n");
2103     printf("FIELD_INTERVALL       6\n");
2104     printf("FIELD_ZERO            7\n");
2105     printf("FIELD_VARCHAR         8\n");
2106     printf("FIELD_CHECK           9\n");
2107     printf("\n");
2108     printf("pack type as a set of flags:\n");
2109     printf("PACK_TYPE_SELECTED      1\n");
2110     printf("PACK_TYPE_SPACE_FIELDS  2\n");
2111     printf("PACK_TYPE_ZERO_FILL     4\n");
2112     printf("\n");
2113   }
2114   for (i=0 ; i++ < fields ; counts++)
2115   {
2116     write_bits((ulonglong) (int) counts->field_type, 5);
2117     write_bits(counts->pack_type,6);
2118     if (counts->pack_type & PACK_TYPE_ZERO_FILL)
2119       write_bits(counts->max_zero_fill,5);
2120     else
2121       write_bits(counts->length_bits,5);
2122     write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
2123     DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
2124                         "lbits: %2u  tree: %2u  length: %4u",
2125                         i , counts->field_type, counts->pack_type,
2126                         counts->max_zero_fill, counts->length_bits,
2127                         counts->tree->tree_number, counts->field_length));
2128     if (verbose >= 2)
2129       printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
2130                   "tree: %2u  length: %4u\n", i , counts->field_type,
2131                   counts->pack_type, counts->max_zero_fill, counts->length_bits,
2132                   counts->tree->tree_number, counts->field_length);
2133   }
2134   flush_bits();
2135   return;
2136 }
2137 
2138 	/* Write all huff_trees to new datafile. Return tot count of
2139 	   elements in all trees
2140 	   Returns 0 on error */
2141 
write_huff_tree(HUFF_TREE * huff_tree,uint trees)2142 static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
2143 {
2144   uint i,int_length;
2145   uint tree_no;
2146   uint codes;
2147   uint errors= 0;
2148   uint *packed_tree,*offset,length;
2149   my_off_t elements;
2150 
2151   /* Find the highest number of elements in the trees. */
2152   for (i=length=0 ; i < trees ; i++)
2153     if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
2154       length=huff_tree[i].elements;
2155   /*
2156     Allocate a buffer for packing a decode tree. Two numbers per element
2157     (left child and right child).
2158   */
2159   if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
2160   {
2161     my_error(EE_OUTOFMEMORY, MYF(ME_BELL+ME_FATALERROR),
2162              sizeof(uint)*length*2);
2163     return 0;
2164   }
2165 
2166   DBUG_PRINT("info", (" "));
2167   if (verbose >= 2)
2168     printf("\n");
2169   tree_no= 0;
2170   intervall_length=0;
2171   for (elements=0; trees-- ; huff_tree++)
2172   {
2173     /* Skip columns that have been joined with other columns. */
2174     if (huff_tree->tree_number == 0)
2175       continue;				/* Deleted tree */
2176     tree_no++;
2177     DBUG_PRINT("info", (" "));
2178     if (verbose >= 3)
2179       printf("\n");
2180     /* Count the total number of elements (byte codes or column values). */
2181     elements+=huff_tree->elements;
2182     huff_tree->max_offset=2;
2183     /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
2184     if (huff_tree->elements <= 1)
2185       offset=packed_tree;
2186     else
2187       offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
2188 
2189     /* This should be the same as 'length' above. */
2190     huff_tree->offset_bits=max_bit(huff_tree->max_offset);
2191 
2192     /*
2193       Since we check this during collecting the distinct column values,
2194       this should never happen.
2195     */
2196     if (huff_tree->max_offset >= IS_OFFSET)
2197     {				/* This should be impossible */
2198       (void) fprintf(stderr, "Tree offset got too big: %d, aborted\n",
2199                    huff_tree->max_offset);
2200       my_afree((uchar*) packed_tree);
2201       return 0;
2202     }
2203 
2204     DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
2205                         "char_bits: %u\n",
2206                         (ulong) (file_buffer.pos - file_buffer.buffer),
2207                         huff_tree->elements, (ulong) (offset - packed_tree),
2208                         huff_tree->char_bits));
2209     if (!huff_tree->counts->tree_buff)
2210     {
2211       /* We do a byte compression on this column. Mark with bit 0. */
2212       write_bits(0,1);
2213       write_bits(huff_tree->min_chr,8);
2214       write_bits(huff_tree->elements,9);
2215       write_bits(huff_tree->char_bits,5);
2216       write_bits(huff_tree->offset_bits,5);
2217       int_length=0;
2218     }
2219     else
2220     {
2221       int_length=(uint) (huff_tree->counts->tree_pos -
2222 			 huff_tree->counts->tree_buff);
2223       /* We have distinct column values for this column. Mark with bit 1. */
2224       write_bits(1,1);
2225       write_bits(huff_tree->elements,15);
2226       write_bits(int_length,16);
2227       write_bits(huff_tree->char_bits,5);
2228       write_bits(huff_tree->offset_bits,5);
2229       intervall_length+=int_length;
2230     }
2231     DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
2232                         "offset_bits: %2u  %s: %5u  codelen: %2u",
2233                         tree_no, huff_tree->elements, huff_tree->char_bits,
2234                         huff_tree->offset_bits, huff_tree->counts->tree_buff ?
2235                         "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
2236                         int_length : huff_tree->min_chr, huff_tree->height));
2237     if (verbose >= 2)
2238       printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
2239                   "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
2240                   huff_tree->char_bits, huff_tree->offset_bits,
2241                   huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
2242                   huff_tree->counts->tree_buff ? int_length :
2243                   huff_tree->min_chr, huff_tree->height);
2244 
2245     /* Check that the code tree length matches the element count. */
2246     length=(uint) (offset-packed_tree);
2247     if (length != huff_tree->elements*2-2)
2248     {
2249       (void) fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
2250                    length, huff_tree->elements * 2 - 2);
2251       errors++;
2252       break;
2253     }
2254 
2255     for (i=0 ; i < length ; i++)
2256     {
2257       if (packed_tree[i] & IS_OFFSET)
2258 	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
2259 		   huff_tree->offset_bits+1);
2260       else
2261 	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
2262       DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
2263                           i, (packed_tree[i] & IS_OFFSET) ?
2264                           " -> " : "", (packed_tree[i] & IS_OFFSET) ?
2265                           packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
2266       if (verbose >= 3)
2267         printf("tree[0x%04x]: %s0x%04x\n",
2268                     i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
2269                     (packed_tree[i] & IS_OFFSET) ?
2270                     packed_tree[i] - IS_OFFSET + i : packed_tree[i]);
2271     }
2272     flush_bits();
2273 
2274     /*
2275       Display coding tables and check their correctness.
2276     */
2277     codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
2278     for (i= 0; i < codes; i++)
2279     {
2280       ulonglong code;
2281       uint bits;
2282       uint len;
2283       uint idx;
2284 
2285       if (! (len= huff_tree->code_len[i]))
2286         continue;
2287       DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
2288                           hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2289                           bindigits(huff_tree->code[i],
2290                                     huff_tree->code_len[i])));
2291       if (verbose >= 3)
2292         printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
2293                     hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
2294                     bindigits(huff_tree->code[i], huff_tree->code_len[i]));
2295 
2296       /* Check that the encode table decodes correctly. */
2297       code= 0;
2298       bits= 0;
2299       idx= 0;
2300       DBUG_EXECUTE_IF("forcechkerr1", len--;);
2301       DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
2302       DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
2303       for (;;)
2304       {
2305         if (! len)
2306         {
2307           (void) fflush(stdout);
2308           (void) fprintf(stderr, "error: code 0x%s with %u bits not found\n",
2309                        hexdigits(huff_tree->code[i]), huff_tree->code_len[i]);
2310           errors++;
2311           break;
2312         }
2313         code<<= 1;
2314         code|= (huff_tree->code[i] >> (--len)) & 1;
2315         bits++;
2316         if (bits > 8 * sizeof(code))
2317         {
2318           (void) fflush(stdout);
2319           (void) fprintf(stderr, "error: Huffman code too long: %u/%u\n",
2320                        bits, (uint) (8 * sizeof(code)));
2321           errors++;
2322           break;
2323         }
2324         idx+= (uint) code & 1;
2325         if (idx >= length)
2326         {
2327           (void) fflush(stdout);
2328           (void) fprintf(stderr, "error: illegal tree offset: %u/%u\n",
2329                        idx, length);
2330           errors++;
2331           break;
2332         }
2333         if (packed_tree[idx] & IS_OFFSET)
2334           idx+= packed_tree[idx] & ~IS_OFFSET;
2335         else
2336           break; /* Hit a leaf. This contains the result value. */
2337       }
2338       if (errors)
2339         break;
2340 
2341       DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
2342       if (packed_tree[idx] != i)
2343       {
2344         (void) fflush(stdout);
2345         (void) fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
2346                      packed_tree[idx], i);
2347         errors++;
2348         break;
2349       }
2350     } /*end for (codes)*/
2351     if (errors)
2352       break;
2353 
2354     /* Write column values in case of distinct column value compression. */
2355     if (huff_tree->counts->tree_buff)
2356     {
2357       for (i=0 ; i < int_length ; i++)
2358       {
2359  	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
2360         DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
2361                             i, (uchar) huff_tree->counts->tree_buff[i]));
2362         if (verbose >= 3)
2363           printf("column_values[0x%04x]: 0x%02x\n",
2364                       i, (uchar) huff_tree->counts->tree_buff[i]);
2365       }
2366     }
2367     flush_bits();
2368   }
2369   DBUG_PRINT("info", (" "));
2370   if (verbose >= 2)
2371     printf("\n");
2372   my_afree((uchar*) packed_tree);
2373   if (errors)
2374   {
2375     (void) fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n");
2376     return 0;
2377   }
2378   return elements;
2379 }
2380 
2381 
make_offset_code_tree(HUFF_TREE * huff_tree,HUFF_ELEMENT * element,uint * offset)2382 static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
2383 				   uint *offset)
2384 {
2385   uint *prev_offset;
2386 
2387   prev_offset= offset;
2388   /*
2389     'a.leaf.null' takes the same place as 'a.nod.left'. If this is null,
2390     then there is no left child and, hence no right child either. This
2391     is a property of a binary tree. An element is either a node with two
2392     childs, or a leaf without childs.
2393 
2394     The current element is always a node with two childs. Go left first.
2395   */
2396   if (!element->a.nod.left->a.leaf.null)
2397   {
2398     /* Store the byte code or the index of the column value. */
2399     prev_offset[0] =(uint) element->a.nod.left->a.leaf.element_nr;
2400     offset+=2;
2401   }
2402   else
2403   {
2404     /*
2405       Recursively traverse the tree to the left. Mark it as an offset to
2406       another tree node (in contrast to a byte code or column value index).
2407     */
2408     prev_offset[0]= IS_OFFSET+2;
2409     offset=make_offset_code_tree(huff_tree,element->a.nod.left,offset+2);
2410   }
2411 
2412   /* Now, check the right child. */
2413   if (!element->a.nod.right->a.leaf.null)
2414   {
2415     /* Store the byte code or the index of the column value. */
2416     prev_offset[1]=element->a.nod.right->a.leaf.element_nr;
2417     return offset;
2418   }
2419   else
2420   {
2421     /*
2422       Recursively traverse the tree to the right. Mark it as an offset to
2423       another tree node (in contrast to a byte code or column value index).
2424     */
2425     uint temp=(uint) (offset-prev_offset-1);
2426     prev_offset[1]= IS_OFFSET+ temp;
2427     if (huff_tree->max_offset < temp)
2428       huff_tree->max_offset = temp;
2429     return make_offset_code_tree(huff_tree,element->a.nod.right,offset);
2430   }
2431 }
2432 
2433 	/* Get number of bits neaded to represent value */
2434 
max_bit(register uint value)2435 static uint max_bit(register uint value)
2436 {
2437   reg2 uint power=1;
2438 
2439   while ((value>>=1))
2440     power++;
2441   return (power);
2442 }
2443 
2444 
compress_isam_file(PACK_MRG_INFO * mrg,HUFF_COUNTS * huff_counts)2445 static int compress_isam_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
2446 {
2447   int error;
2448   uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length,
2449     intervall,field_length,max_pack_length,pack_blob_length;
2450   my_off_t record_count;
2451   char llbuf[32];
2452   ulong length,pack_length;
2453   uchar *record,*pos,*end_pos,*record_pos,*start_pos;
2454   HUFF_COUNTS *count,*end_count;
2455   HUFF_TREE *tree;
2456   MI_INFO *isam_file=mrg->file[0];
2457   uint pack_version= (uint) isam_file->s->pack.version;
2458   DBUG_ENTER("compress_isam_file");
2459 
2460   /* Allocate a buffer for the records (excluding blobs). */
2461   if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
2462     return -1;
2463 
2464   end_count=huff_counts+isam_file->s->base.fields;
2465   min_record_length= (uint) ~0;
2466   max_record_length=0;
2467 
2468   /*
2469     Calculate the maximum number of bits required to pack the records.
2470     Remember to understand 'max_zero_fill' as 'min_zero_fill'.
2471     The tree height determines the maximum number of bits per value.
2472     Some fields skip leading or trailing spaces or zeroes. The skipped
2473     number of bytes is encoded by 'length_bits' bits.
2474     Empty blobs and varchar are encoded with a single 1 bit. Other blobs
2475     and varchar get a leading 0 bit.
2476   */
2477   for (i=max_calc_length=0 ; i < isam_file->s->base.fields ; i++)
2478   {
2479     if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
2480       huff_counts[i].max_zero_fill=0;
2481     if (huff_counts[i].field_type == FIELD_CONSTANT ||
2482 	huff_counts[i].field_type == FIELD_ZERO ||
2483 	huff_counts[i].field_type == FIELD_CHECK)
2484       continue;
2485     if (huff_counts[i].field_type == FIELD_INTERVALL)
2486       max_calc_length+=huff_counts[i].tree->height;
2487     else if (huff_counts[i].field_type == FIELD_BLOB ||
2488 	     huff_counts[i].field_type == FIELD_VARCHAR)
2489       max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
2490     else
2491       max_calc_length+=
2492 	(huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
2493 	  huff_counts[i].tree->height+huff_counts[i].length_bits;
2494   }
2495   max_calc_length= (max_calc_length + 7) / 8;
2496   pack_ref_length= calc_pack_length(pack_version, max_calc_length);
2497   record_count=0;
2498   /* 'max_blob_length' is the max length of all blobs of a record. */
2499   pack_blob_length= isam_file->s->base.blobs ?
2500                     calc_pack_length(pack_version, mrg->max_blob_length) : 0;
2501   max_pack_length=pack_ref_length+pack_blob_length;
2502 
2503   DBUG_PRINT("fields", ("==="));
2504   mrg_reset(mrg);
2505   while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
2506   {
2507     ulong tot_blob_length=0;
2508     if (! error)
2509     {
2510       if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length))
2511 	break;
2512       record_pos= (uchar*) file_buffer.pos;
2513       file_buffer.pos+=max_pack_length;
2514       for (start_pos=record, count= huff_counts; count < end_count ; count++)
2515       {
2516 	end_pos=start_pos+(field_length=count->field_length);
2517 	tree=count->tree;
2518 
2519         DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
2520                               "lbits: %2u  tree: %2u  length: %4u",
2521                               (ulong) (count - huff_counts + 1),
2522                               count->field_type,
2523                               count->pack_type, count->max_zero_fill,
2524                               count->length_bits, count->tree->tree_number,
2525                               count->field_length));
2526 
2527         /* Check if the column contains spaces only. */
2528 	if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
2529 	{
2530 	  for (pos=start_pos ; *pos == ' ' && pos < end_pos; pos++) ;
2531 	  if (pos == end_pos)
2532 	  {
2533             DBUG_PRINT("fields",
2534                        ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
2535             DBUG_PRINT("fields", ("---"));
2536 	    write_bits(1,1);
2537 	    start_pos=end_pos;
2538 	    continue;
2539 	  }
2540           DBUG_PRINT("fields",
2541                      ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
2542 	  write_bits(0,1);
2543 	}
2544 	end_pos-=count->max_zero_fill;
2545 	field_length-=count->max_zero_fill;
2546 
2547 	switch (count->field_type) {
2548 	case FIELD_SKIP_ZERO:
2549 	  if (!memcmp((uchar*) start_pos,zero_string,field_length))
2550 	  {
2551             DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
2552 	    write_bits(1,1);
2553 	    start_pos=end_pos;
2554 	    break;
2555 	  }
2556           DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
2557 	  write_bits(0,1);
2558 	  /* Fall through */
2559 	case FIELD_NORMAL:
2560           DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
2561                                 (ulong) (end_pos - start_pos)));
2562 	  for ( ; start_pos < end_pos ; start_pos++)
2563           {
2564             DBUG_PRINT("fields",
2565                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2566                         (uchar) *start_pos,
2567                         hexdigits(tree->code[(uchar) *start_pos]),
2568                         (uint) tree->code_len[(uchar) *start_pos],
2569                         bindigits(tree->code[(uchar) *start_pos],
2570                                   (uint) tree->code_len[(uchar) *start_pos])));
2571 	    write_bits(tree->code[(uchar) *start_pos],
2572 		       (uint) tree->code_len[(uchar) *start_pos]);
2573           }
2574 	  break;
2575 	case FIELD_SKIP_ENDSPACE:
2576 	  for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
2577 	  length= (ulong) (end_pos - pos);
2578 	  if (count->pack_type & PACK_TYPE_SELECTED)
2579 	  {
2580 	    if (length > count->min_space)
2581 	    {
2582               DBUG_PRINT("fields",
2583                          ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
2584               DBUG_PRINT("fields",
2585                          ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2586                           length, field_length, count->length_bits));
2587 	      write_bits(1,1);
2588 	      write_bits(length,count->length_bits);
2589 	    }
2590 	    else
2591 	    {
2592               DBUG_PRINT("fields",
2593                          ("FIELD_SKIP_ENDSPACE not more than min_space, "
2594                           "bits:  1"));
2595 	      write_bits(0,1);
2596 	      pos=end_pos;
2597 	    }
2598 	  }
2599 	  else
2600           {
2601             DBUG_PRINT("fields",
2602                        ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
2603                         length, field_length, count->length_bits));
2604 	    write_bits(length,count->length_bits);
2605           }
2606           /* Encode all significant bytes. */
2607           DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
2608                                 (ulong) (pos - start_pos)));
2609 	  for ( ; start_pos < pos ; start_pos++)
2610           {
2611             DBUG_PRINT("fields",
2612                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2613                         (uchar) *start_pos,
2614                         hexdigits(tree->code[(uchar) *start_pos]),
2615                         (uint) tree->code_len[(uchar) *start_pos],
2616                         bindigits(tree->code[(uchar) *start_pos],
2617                                   (uint) tree->code_len[(uchar) *start_pos])));
2618 	    write_bits(tree->code[(uchar) *start_pos],
2619 		       (uint) tree->code_len[(uchar) *start_pos]);
2620           }
2621 	  start_pos=end_pos;
2622 	  break;
2623 	case FIELD_SKIP_PRESPACE:
2624 	  for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
2625           length= (ulong) (pos - start_pos);
2626 	  if (count->pack_type & PACK_TYPE_SELECTED)
2627 	  {
2628 	    if (length > count->min_space)
2629 	    {
2630               DBUG_PRINT("fields",
2631                          ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
2632               DBUG_PRINT("fields",
2633                          ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2634                           length, field_length, count->length_bits));
2635 	      write_bits(1,1);
2636 	      write_bits(length,count->length_bits);
2637 	    }
2638 	    else
2639 	    {
2640               DBUG_PRINT("fields",
2641                          ("FIELD_SKIP_PRESPACE not more than min_space, "
2642                           "bits:  1"));
2643 	      pos=start_pos;
2644 	      write_bits(0,1);
2645 	    }
2646 	  }
2647 	  else
2648           {
2649             DBUG_PRINT("fields",
2650                        ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
2651                         length, field_length, count->length_bits));
2652 	    write_bits(length,count->length_bits);
2653           }
2654           /* Encode all significant bytes. */
2655           DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
2656                                 (ulong) (end_pos - start_pos)));
2657 	  for (start_pos=pos ; start_pos < end_pos ; start_pos++)
2658           {
2659             DBUG_PRINT("fields",
2660                        ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2661                         (uchar) *start_pos,
2662                         hexdigits(tree->code[(uchar) *start_pos]),
2663                         (uint) tree->code_len[(uchar) *start_pos],
2664                         bindigits(tree->code[(uchar) *start_pos],
2665                                   (uint) tree->code_len[(uchar) *start_pos])));
2666 	    write_bits(tree->code[(uchar) *start_pos],
2667 		       (uint) tree->code_len[(uchar) *start_pos]);
2668           }
2669 	  break;
2670 	case FIELD_CONSTANT:
2671 	case FIELD_ZERO:
2672 	case FIELD_CHECK:
2673           DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
2674 	  start_pos=end_pos;
2675 	  break;
2676 	case FIELD_INTERVALL:
2677 	  global_count=count;
2678 	  pos=(uchar*) tree_search(&count->int_tree, start_pos,
2679 				  count->int_tree.custom_arg);
2680 	  intervall=(uint) (pos - count->tree_buff)/field_length;
2681           DBUG_PRINT("fields", ("FIELD_INTERVALL"));
2682           DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
2683                                 intervall, hexdigits(tree->code[intervall]),
2684                                 (uint) tree->code_len[intervall]));
2685 	  write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
2686 	  start_pos=end_pos;
2687 	  break;
2688 	case FIELD_BLOB:
2689 	{
2690 	  ulong blob_length=_mi_calc_blob_length(field_length-
2691 						 portable_sizeof_char_ptr,
2692 						 start_pos);
2693           /* Empty blobs are encoded with a single 1 bit. */
2694 	  if (!blob_length)
2695 	  {
2696             DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
2697             write_bits(1,1);
2698 	  }
2699 	  else
2700 	  {
2701 	    uchar *blob,*blob_end;
2702             DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
2703 	    write_bits(0,1);
2704             /* Write the blob length. */
2705             DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
2706                                   blob_length, count->length_bits));
2707 	    write_bits(blob_length,count->length_bits);
2708 	    memcpy(&blob, end_pos-portable_sizeof_char_ptr, sizeof(char*));
2709 	    blob_end=blob+blob_length;
2710             /* Encode the blob bytes. */
2711 	    for ( ; blob < blob_end ; blob++)
2712             {
2713               DBUG_PRINT("fields",
2714                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2715                           (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
2716                           (uint) tree->code_len[(uchar) *blob],
2717                           bindigits(tree->code[(uchar) *start_pos],
2718                                     (uint)tree->code_len[(uchar) *start_pos])));
2719 	      write_bits(tree->code[(uchar) *blob],
2720 			 (uint) tree->code_len[(uchar) *blob]);
2721             }
2722 	    tot_blob_length+=blob_length;
2723 	  }
2724 	  start_pos= end_pos;
2725 	  break;
2726 	}
2727 	case FIELD_VARCHAR:
2728 	{
2729           uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
2730           ulong col_length= (var_pack_length == 1 ?
2731                              (uint) *(uchar*) start_pos :
2732                              uint2korr(start_pos));
2733           /* Empty varchar are encoded with a single 1 bit. */
2734 	  if (!col_length)
2735 	  {
2736             DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
2737 	    write_bits(1,1);			/* Empty varchar */
2738 	  }
2739 	  else
2740 	  {
2741 	    uchar *end= start_pos + var_pack_length + col_length;
2742             DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
2743 	    write_bits(0,1);
2744             /* Write the varchar length. */
2745             DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
2746                                   col_length, count->length_bits));
2747 	    write_bits(col_length,count->length_bits);
2748             /* Encode the varchar bytes. */
2749 	    for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
2750             {
2751               DBUG_PRINT("fields",
2752                          ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
2753                           (uchar) *start_pos,
2754                           hexdigits(tree->code[(uchar) *start_pos]),
2755                           (uint) tree->code_len[(uchar) *start_pos],
2756                           bindigits(tree->code[(uchar) *start_pos],
2757                                     (uint)tree->code_len[(uchar) *start_pos])));
2758 	      write_bits(tree->code[(uchar) *start_pos],
2759 			 (uint) tree->code_len[(uchar) *start_pos]);
2760             }
2761 	  }
2762 	  start_pos= end_pos;
2763 	  break;
2764 	}
2765 	case FIELD_LAST:
2766         case FIELD_enum_val_count:
2767 	  abort();				/* Impossible */
2768 	}
2769 	start_pos+=count->max_zero_fill;
2770         DBUG_PRINT("fields", ("---"));
2771       }
2772       flush_bits();
2773       length=(ulong) ((uchar*) file_buffer.pos - record_pos) - max_pack_length;
2774       pack_length= save_pack_length(pack_version, record_pos, length);
2775       if (pack_blob_length)
2776 	pack_length+= save_pack_length(pack_version, record_pos + pack_length,
2777 	                               tot_blob_length);
2778       DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
2779                             "length-bytes: %lu", (ulong) record_count, length,
2780                             tot_blob_length, pack_length));
2781       DBUG_PRINT("fields", ("==="));
2782 
2783       /* Correct file buffer if the header was smaller */
2784       if (pack_length != max_pack_length)
2785       {
2786 	bmove(record_pos+pack_length,record_pos+max_pack_length,length);
2787 	file_buffer.pos-= (max_pack_length-pack_length);
2788       }
2789       if (length < (ulong) min_record_length)
2790 	min_record_length=(uint) length;
2791       if (length > (ulong) max_record_length)
2792 	max_record_length=(uint) length;
2793       record_count++;
2794       if (write_loop && record_count % WRITE_COUNT == 0)
2795       {
2796 	printf("%lu\r", (ulong) record_count);
2797         (void) fflush(stdout);
2798       }
2799     }
2800     else if (error != HA_ERR_RECORD_DELETED)
2801       break;
2802   }
2803   if (error == HA_ERR_END_OF_FILE)
2804     error=0;
2805   else
2806   {
2807     (void) fprintf(stderr, "%s: Got error %d reading records\n",
2808                  my_progname, error);
2809   }
2810   if (verbose >= 2)
2811     printf("wrote %s records.\n", llstr((longlong) record_count, llbuf));
2812 
2813   my_afree((uchar*) record);
2814   mrg->ref_length=max_pack_length;
2815   mrg->min_pack_length=max_record_length ? min_record_length : 0;
2816   mrg->max_pack_length=max_record_length;
2817   DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
2818 }
2819 
2820 
make_new_name(char * new_name,char * old_name)2821 static char *make_new_name(char *new_name, char *old_name)
2822 {
2823   return fn_format(new_name,old_name,"",DATA_TMP_EXT,2+4);
2824 }
2825 
make_old_name(char * new_name,char * old_name)2826 static char *make_old_name(char *new_name, char *old_name)
2827 {
2828   return fn_format(new_name,old_name,"",OLD_EXT,2+4);
2829 }
2830 
2831 	/* rutines for bit writing buffer */
2832 
init_file_buffer(File file,pbool read_buffer)2833 static void init_file_buffer(File file, pbool read_buffer)
2834 {
2835   file_buffer.file=file;
2836   file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
2837 					 MYF(MY_WME));
2838   file_buffer.end=file_buffer.buffer+ALIGN_SIZE(RECORD_CACHE_SIZE)-8;
2839   file_buffer.pos_in_file=0;
2840   error_on_write=0;
2841   if (read_buffer)
2842   {
2843 
2844     file_buffer.pos=file_buffer.end;
2845     file_buffer.bits=0;
2846   }
2847   else
2848   {
2849     file_buffer.pos=file_buffer.buffer;
2850     file_buffer.bits=BITS_SAVED;
2851   }
2852   file_buffer.bitbucket= 0;
2853 }
2854 
2855 
flush_buffer(ulong neaded_length)2856 static int flush_buffer(ulong neaded_length)
2857 {
2858   ulong length;
2859 
2860   /*
2861     file_buffer.end is 8 bytes lower than the real end of the buffer.
2862     This is done so that the end-of-buffer condition does not need to be
2863     checked for every byte (see write_bits()). Consequently,
2864     file_buffer.pos can become greater than file_buffer.end. The
2865     algorithms in the other functions ensure that there will never be
2866     more than 8 bytes written to the buffer without an end-of-buffer
2867     check. So the buffer cannot be overrun. But we need to check for the
2868     near-to-buffer-end condition to avoid a negative result, which is
2869     casted to unsigned and thus becomes giant.
2870   */
2871   if ((file_buffer.pos < file_buffer.end) &&
2872       ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
2873     return 0;
2874   length=(ulong) (file_buffer.pos-file_buffer.buffer);
2875   file_buffer.pos=file_buffer.buffer;
2876   file_buffer.pos_in_file+=length;
2877   if (test_only)
2878     return 0;
2879   if (error_on_write|| my_write(file_buffer.file,
2880 				(const uchar*) file_buffer.buffer,
2881 				length,
2882 				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
2883   {
2884     error_on_write=1;
2885     return 1;
2886   }
2887 
2888   if (neaded_length != ~(ulong) 0 &&
2889       (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
2890   {
2891     char *tmp;
2892     neaded_length+=256;				/* some margin */
2893     tmp= my_realloc((char*) file_buffer.buffer, neaded_length,MYF(MY_WME));
2894     if (!tmp)
2895       return 1;
2896     file_buffer.pos= ((uchar*) tmp +
2897                       (ulong) (file_buffer.pos - file_buffer.buffer));
2898     file_buffer.buffer= (uchar*) tmp;
2899     file_buffer.end= (uchar*) (tmp+neaded_length-8);
2900   }
2901   return 0;
2902 }
2903 
2904 
end_file_buffer(void)2905 static void end_file_buffer(void)
2906 {
2907   my_free(file_buffer.buffer);
2908 }
2909 
2910 	/* output `bits` low bits of `value' */
2911 
write_bits(register ulonglong value,register uint bits)2912 static void write_bits(register ulonglong value, register uint bits)
2913 {
2914   DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
2915               (bits == 8 * sizeof(value)));
2916 
2917   if ((file_buffer.bits-= (int) bits) >= 0)
2918   {
2919     file_buffer.bitbucket|= value << file_buffer.bits;
2920   }
2921   else
2922   {
2923     reg3 ulonglong bit_buffer;
2924     bits= (uint) -file_buffer.bits;
2925     bit_buffer= (file_buffer.bitbucket |
2926                  ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
2927 #if BITS_SAVED == 64
2928     *file_buffer.pos++= (uchar) (bit_buffer >> 56);
2929     *file_buffer.pos++= (uchar) (bit_buffer >> 48);
2930     *file_buffer.pos++= (uchar) (bit_buffer >> 40);
2931     *file_buffer.pos++= (uchar) (bit_buffer >> 32);
2932 #endif
2933     *file_buffer.pos++= (uchar) (bit_buffer >> 24);
2934     *file_buffer.pos++= (uchar) (bit_buffer >> 16);
2935     *file_buffer.pos++= (uchar) (bit_buffer >> 8);
2936     *file_buffer.pos++= (uchar) (bit_buffer);
2937 
2938     if (bits != 8 * sizeof(value))
2939       value&= (((ulonglong) 1) << bits) - 1;
2940     if (file_buffer.pos >= file_buffer.end)
2941       (void) flush_buffer(~ (ulong) 0);
2942     file_buffer.bits=(int) (BITS_SAVED - bits);
2943     file_buffer.bitbucket= value << (BITS_SAVED - bits);
2944   }
2945   return;
2946 }
2947 
2948 	/* Flush bits in bit_buffer to buffer */
2949 
flush_bits(void)2950 static void flush_bits(void)
2951 {
2952   int bits;
2953   ulonglong bit_buffer;
2954 
2955   bits= file_buffer.bits & ~7;
2956   bit_buffer= file_buffer.bitbucket >> bits;
2957   bits= BITS_SAVED - bits;
2958   while (bits > 0)
2959   {
2960     bits-= 8;
2961     *file_buffer.pos++= (uchar) (bit_buffer >> bits);
2962   }
2963   if (file_buffer.pos >= file_buffer.end)
2964     (void) flush_buffer(~ (ulong) 0);
2965   file_buffer.bits= BITS_SAVED;
2966   file_buffer.bitbucket= 0;
2967 }
2968 
2969 
2970 /****************************************************************************
2971 ** functions to handle the joined files
2972 ****************************************************************************/
2973 
save_state(MI_INFO * isam_file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)2974 static int save_state(MI_INFO *isam_file,PACK_MRG_INFO *mrg,my_off_t new_length,
2975 		      ha_checksum crc)
2976 {
2977   MYISAM_SHARE *share=isam_file->s;
2978   uint options=mi_uint2korr(share->state.header.options);
2979   uint key;
2980   DBUG_ENTER("save_state");
2981 
2982   options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
2983   mi_int2store(share->state.header.options,options);
2984 
2985   share->state.state.data_file_length=new_length;
2986   share->state.state.del=0;
2987   share->state.state.empty=0;
2988   share->state.dellink= HA_OFFSET_ERROR;
2989   share->state.split=(ha_rows) mrg->records;
2990   share->state.version=(ulong) time((time_t*) 0);
2991   if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
2992   {
2993     /*
2994       Some indexes are disabled, cannot use current key_file_length value
2995       as an estimate of upper bound of index file size. Use packed data file
2996       size instead.
2997     */
2998     share->state.state.key_file_length= new_length;
2999   }
3000   /*
3001     If there are no disabled indexes, keep key_file_length value from
3002     original file so "myisamchk -rq" can use this value (this is necessary
3003     because index size cannot be easily calculated for fulltext keys)
3004   */
3005   mi_clear_all_keys_active(share->state.key_map);
3006   for (key=0 ; key < share->base.keys ; key++)
3007     share->state.key_root[key]= HA_OFFSET_ERROR;
3008   for (key=0 ; key < share->state.header.max_block_size_index ; key++)
3009     share->state.key_del[key]= HA_OFFSET_ERROR;
3010   isam_file->state->checksum=crc;       /* Save crc here */
3011   share->changed=1;			/* Force write of header */
3012   share->state.open_count=0;
3013   share->global_changed=0;
3014   (void) my_chsize(share->kfile, share->base.keystart, 0, MYF(0));
3015   if (share->base.keys)
3016     isamchk_neaded=1;
3017   DBUG_RETURN(mi_state_info_write(share->kfile,&share->state,1+2));
3018 }
3019 
3020 
save_state_mrg(File file,PACK_MRG_INFO * mrg,my_off_t new_length,ha_checksum crc)3021 static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
3022 			  ha_checksum crc)
3023 {
3024   MI_STATE_INFO state;
3025   MI_INFO *isam_file=mrg->file[0];
3026   uint options;
3027   DBUG_ENTER("save_state_mrg");
3028 
3029   state= isam_file->s->state;
3030   options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
3031 	    HA_OPTION_READ_ONLY_DATA);
3032   mi_int2store(state.header.options,options);
3033   state.state.data_file_length=new_length;
3034   state.state.del=0;
3035   state.state.empty=0;
3036   state.state.records=state.split=(ha_rows) mrg->records;
3037   /* See comment above in save_state about key_file_length handling. */
3038   if (mrg->src_file_has_indexes_disabled)
3039   {
3040     isam_file->s->state.state.key_file_length=
3041       MY_MAX(isam_file->s->state.state.key_file_length, new_length);
3042   }
3043   state.dellink= HA_OFFSET_ERROR;
3044   state.version=(ulong) time((time_t*) 0);
3045   mi_clear_all_keys_active(state.key_map);
3046   state.state.checksum=crc;
3047   if (isam_file->s->base.keys)
3048     isamchk_neaded=1;
3049   state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
3050   DBUG_RETURN (mi_state_info_write(file,&state,1+2));
3051 }
3052 
3053 
3054 /* reset for mrg_rrnd */
3055 
mrg_reset(PACK_MRG_INFO * mrg)3056 static void mrg_reset(PACK_MRG_INFO *mrg)
3057 {
3058   if (mrg->current)
3059   {
3060     mi_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
3061     mrg->current=0;
3062   }
3063 }
3064 
mrg_rrnd(PACK_MRG_INFO * info,uchar * buf)3065 static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
3066 {
3067   int error;
3068   MI_INFO *isam_info;
3069   my_off_t filepos;
3070 
3071   if (!info->current)
3072   {
3073     isam_info= *(info->current=info->file);
3074     info->end=info->current+info->count;
3075     mi_reset(isam_info);
3076     mi_extra(isam_info, HA_EXTRA_CACHE, 0);
3077     filepos=isam_info->s->pack.header_length;
3078   }
3079   else
3080   {
3081     isam_info= *info->current;
3082     filepos= isam_info->nextpos;
3083   }
3084 
3085   for (;;)
3086   {
3087     isam_info->update&= HA_STATE_CHANGED;
3088     if (!(error=(*isam_info->s->read_rnd)(isam_info,(uchar*) buf,
3089 					  filepos, 1)) ||
3090 	error != HA_ERR_END_OF_FILE)
3091       return (error);
3092     mi_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
3093     if (info->current+1 == info->end)
3094       return(HA_ERR_END_OF_FILE);
3095     info->current++;
3096     isam_info= *info->current;
3097     filepos=isam_info->s->pack.header_length;
3098     mi_reset(isam_info);
3099     mi_extra(isam_info,HA_EXTRA_CACHE, 0);
3100   }
3101 }
3102 
3103 
mrg_close(PACK_MRG_INFO * mrg)3104 static int mrg_close(PACK_MRG_INFO *mrg)
3105 {
3106   uint i;
3107   int error=0;
3108   for (i=0 ; i < mrg->count ; i++)
3109     error|=mi_close(mrg->file[i]);
3110   if (mrg->free_file)
3111     my_free(mrg->file);
3112   return error;
3113 }
3114 
3115 
3116 #if !defined(DBUG_OFF)
3117 /*
3118   Fake the counts to get big Huffman codes.
3119 
3120   SYNOPSIS
3121     fakebigcodes()
3122     huff_counts                 A pointer to the counts array.
3123     end_count                   A pointer past the counts array.
3124 
3125   DESCRIPTION
3126 
3127     Huffman coding works by removing the two least frequent values from
3128     the list of values and add a new value with the sum of their
3129     incidences in a loop until only one value is left. Every time a
3130     value is reused for a new value, it gets one more bit for its
3131     encoding. Hence, the least frequent values get the longest codes.
3132 
3133     To get a maximum code length for a value, two of the values must
3134     have an incidence of 1. As their sum is 2, the next infrequent value
3135     must have at least an incidence of 2, then 4, 8, 16 and so on. This
3136     means that one needs 2**n bytes (values) for a code length of n
3137     bits. However, using more distinct values forces the use of longer
3138     codes, or reaching the code length with less total bytes (values).
3139 
3140     To get 64(32)-bit codes, I sort the counts by decreasing incidence.
3141     I assign counts of 1 to the two most frequent values, a count of 2
3142     for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
3143     the remaining values get 1. That way every possible byte has an
3144     assigned code, though not all codes are used if not all byte values
3145     are present in the column.
3146 
3147     This strategy would work with distinct column values too, but
3148     requires that at least 64(32) values are present. To make things
3149     easier here, I cancel all distinct column values and force byte
3150     compression for all columns.
3151 
3152   RETURN
3153     void
3154 */
3155 
fakebigcodes(HUFF_COUNTS * huff_counts,HUFF_COUNTS * end_count)3156 static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
3157 {
3158   HUFF_COUNTS   *count;
3159   my_off_t      *cur_count_p;
3160   my_off_t      *end_count_p;
3161   my_off_t      **cur_sort_p;
3162   my_off_t      **end_sort_p;
3163   my_off_t      *sort_counts[256];
3164   my_off_t      total;
3165   DBUG_ENTER("fakebigcodes");
3166 
3167   for (count= huff_counts; count < end_count; count++)
3168   {
3169     /*
3170       Remove distinct column values.
3171     */
3172     if (huff_counts->tree_buff)
3173     {
3174       my_free(huff_counts->tree_buff);
3175       delete_tree(&huff_counts->int_tree);
3176       huff_counts->tree_buff= NULL;
3177       DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
3178     }
3179 
3180     /*
3181       Sort counts by decreasing incidence.
3182     */
3183     cur_count_p= count->counts;
3184     end_count_p= cur_count_p + 256;
3185     cur_sort_p= sort_counts;
3186     while (cur_count_p < end_count_p)
3187       *(cur_sort_p++)= cur_count_p++;
3188     (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
3189 
3190     /*
3191       Assign faked counts.
3192     */
3193     cur_sort_p= sort_counts;
3194 #if SIZEOF_LONG_LONG > 4
3195     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
3196 #else
3197     end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
3198 #endif
3199     /* Most frequent value gets a faked count of 1. */
3200     **(cur_sort_p++)= 1;
3201     total= 1;
3202     while (cur_sort_p < end_sort_p)
3203     {
3204       **(cur_sort_p++)= total;
3205       total<<= 1;
3206     }
3207     /* Set the last value. */
3208     **(cur_sort_p++)= --total;
3209     /*
3210       Set the remaining counts.
3211     */
3212     end_sort_p= sort_counts + 256;
3213     while (cur_sort_p < end_sort_p)
3214       **(cur_sort_p++)= 1;
3215   }
3216   DBUG_VOID_RETURN;
3217 }
3218 
3219 
3220 /*
3221   Compare two counts for reverse sorting.
3222 
3223   SYNOPSIS
3224     fakecmp()
3225     count1              One count.
3226     count2              Another count.
3227 
3228   RETURN
3229     1                   count1  < count2
3230     0                   count1 == count2
3231     -1                  count1 >  count2
3232 */
3233 
fakecmp(my_off_t ** count1,my_off_t ** count2)3234 static int fakecmp(my_off_t **count1, my_off_t **count2)
3235 {
3236   return ((**count1 < **count2) ? 1 :
3237           (**count1 > **count2) ? -1 : 0);
3238 }
3239 #endif
3240 
3241 #include "mi_extrafunc.h"
3242