1 /*
2    Copyright (c) 2000, 2021, Oracle and/or its affiliates.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License, version 2.0,
6    as published by the Free Software Foundation.
7 
8    This program is also distributed with certain software (including
9    but not limited to OpenSSL) that is licensed under separate terms,
10    as designated in a particular file or component or in included license
11    documentation.  The authors of MySQL hereby grant you an additional
12    permission to link the program and your derivative works with the
13    separately licensed software that they have included with MySQL.
14 
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License, version 2.0, for more details.
19 
20    You should have received a copy of the GNU General Public License
21    along with this program; if not, write to the Free Software Foundation,
22    51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA */
23 
24 /**
25   @file
26 
27   @details
28 @verbatim
29 The idea of presented algorithm see in
30 "The Art of Computer Programming" by Donald E. Knuth
31 Volume 3 "Sorting and searching"
32 (chapter 6.3 "Digital searching" - name and number of chapter
33    is back translation from Russian edition :))
34 
35 as illustration of data structures, imagine next table:
36 
37 static SYMBOL symbols[] = {
38   { "ADD",              SYM(ADD),0,0},
39   { "AND",              SYM(AND),0,0},
40   { "DAY",              SYM(DAY_SYM),0,0},
41 };
42 
43 for this structure, presented program generate next searching-structure:
44 
45 +-----------+-+-+-+
46 |       len |1|2|3|
47 +-----------+-+-+-+
48 |first_char |0|0|a|
49 |last_char  |0|0|d|
50 |link       |0|0|+|
51                  |
52                  V
53        +----------+-+-+-+--+
54        |    1 char|a|b|c|d |
55        +----------+-+-+-+--+
56        |first_char|d|0|0|0 |
57        |last_char |n|0|0|-1|
58        |link      |+|0|0|+ |
59                    |     |
60                    |     V
61                    |  symbols[2] ( "DAY" )
62                    V
63 +----------+--+-+-+-+-+-+-+-+-+-+--+
64 |    2 char|d |e|f|j|h|i|j|k|l|m|n |
65 +----------+--+-+-+-+-+-+-+-+-+-+--+
66 |first_char|0 |0|0|0|0|0|0|0|0|0|0 |
67 |last_char |-1|0|0|0|0|0|0|0|0|0|-1|
68 |link      |+ |0|0|0|0|0|0|0|0|0|+ |
69             |                    |
70             V                    V
71          symbols[0] ( "ADD" )  symbols[1] ( "AND" )
72 
73 for optimization, link is the 16-bit index in 'symbols'
74 or search-array..
75 
76 So, we can read full search-structure as 32-bit word
77 @endverbatim
78 
79 @todo
80     use instead to_upper_lex, special array
81     (substitute chars) without skip codes..
82 @todo
83     try use reverse order of comparing..
84 
85 */
86 
87 #define NO_YACC_SYMBOLS
88 #include <my_global.h>
89 #include "mysql_version.h"
90 #include "lex.h"
91 #include <stdlib.h>
92 #include <stdio.h>
93 #include <string.h>
94 #include <set>
95 
96 #include <welcome_copyright_notice.h> /* ORACLE_WELCOME_COPYRIGHT_NOTICE */
97 
98 
99 static bool check_duplicates(uint group_mask);
100 
101 
102 struct hash_lex_struct
103 {
104   int first_char;
105   char last_char;
106   // union value is undefined if first_char == 0
107   union{
108     hash_lex_struct *char_tails; // if first_char > 0
109     int iresult;                 // if first_char == -1
110   };
111   int ithis;
112 
destroyhash_lex_struct113   void destroy()
114   {
115     if (first_char <= 0)
116       return;
117     for (int i= 0, size= static_cast<uchar>(last_char) - first_char + 1;
118          i < size; i++)
119       char_tails[i].destroy();
120     free(char_tails);
121   }
122 };
123 
124 
125 class hash_map_info
126 {
127 public:
128   hash_lex_struct *root_by_len;
129   int max_len;
130 
131   char *hash_map;
132   int size_hash_map;
133 
hash_map_info()134   hash_map_info()
135   : root_by_len(NULL), max_len(0), hash_map(NULL), size_hash_map(0)
136   {}
137 
~hash_map_info()138   ~hash_map_info()
139   {
140     for (int i= 0; i < max_len; i++)
141       root_by_len[i].destroy();
142     free(root_by_len);
143     free(hash_map);
144   }
145 
146   hash_lex_struct *get_hash_struct_by_len(int len);
147   void insert_symbols(int group_mask);
148   void add_structs_to_map(hash_lex_struct *st, int len);
149   void add_struct_to_map(hash_lex_struct *st);
150   void set_links(hash_lex_struct *st, int len);
151   bool print_hash_map(const char *name, uint group_mask);
152 };
153 
154 
get_hash_struct_by_len(int len)155 hash_lex_struct *hash_map_info::get_hash_struct_by_len(int len)
156 {
157   if (max_len < len)
158   {
159     root_by_len= (hash_lex_struct *) realloc((char*) root_by_len,
160                                              sizeof(hash_lex_struct) * len);
161     hash_lex_struct *cur, *end= root_by_len + len;
162     for (cur= root_by_len + max_len; cur < end; cur++)
163       cur->first_char= 0;
164     max_len= len;
165   }
166   return root_by_len + len - 1;
167 }
168 
insert_into_hash(hash_lex_struct * root,const char * name,int len_from_begin,int index)169 void insert_into_hash(hash_lex_struct *root, const char *name,
170 		      int len_from_begin, int index)
171 {
172   hash_lex_struct *end, *cur, *tails;
173 
174   if (!root->first_char)
175   {
176     root->first_char= -1;
177     root->iresult= index;
178     return;
179   }
180 
181   if (root->first_char == -1)
182   {
183     int index2= root->iresult;
184     const char *name2= symbols[index2].name + len_from_begin;
185     root->first_char= (int) (uchar) name2[0];
186     root->last_char= (char) root->first_char;
187     tails= (hash_lex_struct*)malloc(sizeof(hash_lex_struct));
188     root->char_tails= tails;
189     tails->first_char= -1;
190     tails->iresult= index2;
191   }
192 
193   size_t real_size= (root->last_char-root->first_char+1);
194 
195   if (root->first_char>(*name))
196   {
197     size_t new_size= root->last_char-(*name)+1;
198     if (new_size<real_size) printf("error!!!!\n");
199     tails= root->char_tails;
200     tails= (hash_lex_struct*)realloc((char*)tails,
201 				       sizeof(hash_lex_struct)*new_size);
202     root->char_tails= tails;
203     memmove(tails+(new_size-real_size),tails,real_size*sizeof(hash_lex_struct));
204     end= tails + new_size - real_size;
205     for (cur= tails; cur<end; cur++)
206       cur->first_char= 0;
207     root->first_char= (int) (uchar) *name;
208   }
209 
210   if (root->last_char<(*name))
211   {
212     size_t new_size= (*name)-root->first_char+1;
213     if (new_size<real_size) printf("error!!!!\n");
214     tails= root->char_tails;
215     tails= (hash_lex_struct*)realloc((char*)tails,
216 				    sizeof(hash_lex_struct)*new_size);
217     root->char_tails= tails;
218     end= tails + new_size;
219     for (cur= tails+real_size; cur<end; cur++)
220       cur->first_char= 0;
221     root->last_char= (*name);
222   }
223 
224   insert_into_hash(root->char_tails+(*name)-root->first_char,
225 		   name+1,len_from_begin+1,index);
226 }
227 
228 
insert_symbols(int group_mask)229 void hash_map_info::insert_symbols(int group_mask)
230 {
231   size_t i= 0;
232   const SYMBOL *cur;
233   for (cur= symbols; i < array_elements(symbols); cur++, i++)
234   {
235     if (!(cur->group & group_mask))
236       continue;
237     hash_lex_struct *root= get_hash_struct_by_len(cur->length);
238     insert_into_hash(root, cur->name, 0, i);
239   }
240 }
241 
242 
add_struct_to_map(hash_lex_struct * st)243 void hash_map_info::add_struct_to_map(hash_lex_struct *st)
244 {
245   st->ithis= size_hash_map/4;
246   size_hash_map+= 4;
247   hash_map= (char*)realloc(hash_map,size_hash_map);
248   hash_map[size_hash_map-4]= (char) (st->first_char == -1 ? 0 :
249 				     st->first_char);
250   hash_map[size_hash_map-3]= (char) (st->first_char == -1 ||
251 				     st->first_char == 0 ? 0 : st->last_char);
252   if (st->first_char == -1)
253   {
254     hash_map[size_hash_map-2]= ((unsigned int)(int16)st->iresult)&255;
255     hash_map[size_hash_map-1]= ((unsigned int)(int16)st->iresult)>>8;
256   }
257   else if (st->first_char == 0)
258   {
259     hash_map[size_hash_map-2]= ((unsigned int)(int16)array_elements(symbols))&255;
260     hash_map[size_hash_map-1]= ((unsigned int)(int16)array_elements(symbols))>>8;
261   }
262 }
263 
264 
add_structs_to_map(hash_lex_struct * st,int len)265 void hash_map_info::add_structs_to_map(hash_lex_struct *st, int len)
266 {
267   hash_lex_struct *cur, *end= st+len;
268   for (cur= st; cur<end; cur++)
269     add_struct_to_map(cur);
270   for (cur= st; cur<end; cur++)
271   {
272     if (cur->first_char && cur->first_char != -1)
273       add_structs_to_map(cur->char_tails,cur->last_char-cur->first_char+1);
274   }
275 }
276 
277 
set_links(hash_lex_struct * st,int len)278 void hash_map_info::set_links(hash_lex_struct *st, int len)
279 {
280   hash_lex_struct *cur, *end= st+len;
281   for (cur= st; cur<end; cur++)
282   {
283     if (cur->first_char != 0 && cur->first_char != -1)
284     {
285       int ilink= cur->char_tails->ithis;
286       hash_map[cur->ithis*4+2]= ilink%256;
287       hash_map[cur->ithis*4+3]= ilink/256;
288       set_links(cur->char_tails,cur->last_char-cur->first_char+1);
289     }
290   }
291 }
292 
293 
print_hash_map(const char * name,uint group_mask)294 bool hash_map_info::print_hash_map(const char *name, uint group_mask)
295 {
296   if (check_duplicates(group_mask))
297     return true;
298   insert_symbols(group_mask);
299   add_structs_to_map(root_by_len, max_len);
300   set_links(root_by_len, max_len);
301 
302   printf("static const unsigned char %s_map[%d]= {\n", name, size_hash_map);
303 
304   char *cur;
305   int i;
306   for (i=0, cur= hash_map; i < size_hash_map; i++, cur++)
307   {
308     switch(i%4){
309     case 0: case 1:
310       if (!*cur)
311 	printf("0,   ");
312       else
313 	printf("\'%c\', ",*cur);
314       break;
315     case 2: printf("%u, ",(uint)(uchar)*cur); break;
316     case 3: printf("%u,\n",(uint)(uchar)*cur); break;
317     }
318   }
319   printf("};\n\n");
320   printf("const unsigned int %s_max_len=%d;\n", name, max_len);
321   return false;
322 }
323 
324 
check_duplicates(uint group_mask)325 bool check_duplicates(uint group_mask)
326 {
327   std::set<const char *> names;
328 
329   size_t i= 0;
330   const SYMBOL *cur;
331   for (cur= symbols; i < array_elements(symbols); cur++, i++)
332   {
333     if (!(cur->group & group_mask))
334       continue;
335     if (!names.insert(cur->name).second)
336     {
337       const char *err_tmpl= "\ngen_lex_hash fatal error : "
338         "Unfortunately gen_lex_hash can not generate a hash,\n since "
339         "your lex.h has duplicate definition for a symbol \"%s\"\n\n";
340       printf (err_tmpl, cur->name);
341       fprintf (stderr, err_tmpl, cur->name);
342       return true;
343     }
344   }
345   return false;
346 }
347 
348 
main(int argc,char ** argv)349 int main(int argc,char **argv)
350 {
351 
352 
353   /* Broken up to indicate that it's not advice to you, gentle reader. */
354   printf("/*\n\n  Do " "not " "edit " "this " "file " "directly!\n\n*/\n");
355 
356   puts("/*");
357   puts(ORACLE_WELCOME_COPYRIGHT_NOTICE("2000"));
358   puts("*/");
359 
360   /* Broken up to indicate that it's not advice to you, gentle reader. */
361   printf("/* Do " "not " "edit " "this " "file!  This is generated by "
362          "gen_lex_hash.cc\nthat seeks for a perfect hash function */\n\n");
363 
364   /* Print sql_keywords_and_funcs_map[] and sql_keyword_and_funcs_max_len: */
365   if (hash_map_info().print_hash_map("sql_keywords_and_funcs", SG_MAIN_PARSER))
366     exit(1);
367 
368   printf("\n");
369 
370   /* Print sql_keywords_map[] and sql_keywords_max_len values: */
371   if (hash_map_info().print_hash_map("sql_keywords",
372                                      SG_KEYWORDS | SG_HINTABLE_KEYWORDS))
373     exit(1);
374 
375   printf("\n");
376 
377   /* Print hint_keywords_map[] and hint_keywords_max_len values: */
378   if (hash_map_info().print_hash_map("hint_keywords", SG_HINTS))
379     exit(1);
380 
381   exit(0);
382 }
383 
384