1 /* Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved.
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License, version 2.0,
5 as published by the Free Software Foundation.
6
7 This program is also distributed with certain software (including
8 but not limited to OpenSSL) that is licensed under separate terms,
9 as designated in a particular file or component or in included license
10 documentation. The authors of MySQL hereby grant you an additional
11 permission to link the program and your derivative works with the
12 separately licensed software that they have included with MySQL.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License, version 2.0, for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
22
23 /* This header file contains type declarations used by UCA code. */
24
25 #ifndef STR_UCA_TYPE_H
26 #define STR_UCA_TYPE_H
27
28 #include <vector>
29
30 #include "my_inttypes.h"
31
/*
  Capacity of Reorder_param::reorder_grp, i.e. the maximum number of
  character groups one collation can list for reordering.

  So far only the Croatian collation needs to reorder groups of characters
  (Latin and Cyrillic). More may be added in the future.
*/
#define UCA_MAX_CHAR_GRP 4
/* Identifies the UCA version of a collation (see MY_UCA_INFO::version). */
enum enum_uca_ver {
  UCA_V400,
  UCA_V520,
  UCA_V900
};
38
/*
  Character groups that can be named in Reorder_param::reorder_grp.
  The enumerators keep their implicit values, starting at 0 for CHARGRP_NONE.
*/
enum enum_char_grp {
  CHARGRP_NONE,
  CHARGRP_CORE,
  CHARGRP_LATIN,
  CHARGRP_CYRILLIC,
  CHARGRP_ARAB,
  CHARGRP_KANA,
  CHARGRP_OTHERS
};
48
49 struct Weight_boundary {
50 uint16 begin;
51 uint16 end;
52 };
53
54 struct Reorder_wt_rec {
55 struct Weight_boundary old_wt_bdy;
56 struct Weight_boundary new_wt_bdy;
57 };
58
59 struct Reorder_param {
60 enum enum_char_grp reorder_grp[UCA_MAX_CHAR_GRP];
61 struct Reorder_wt_rec wt_rec[2 * UCA_MAX_CHAR_GRP];
62 int wt_rec_num;
63 uint16 max_weight;
64 };
65
/* Possible values of the case-first setting (see Coll_param::case_first). */
enum enum_case_first {
  CASE_FIRST_OFF,
  CASE_FIRST_UPPER,
  CASE_FIRST_LOWER
};
67
68 struct Coll_param {
69 struct Reorder_param *reorder_param;
70 bool norm_enabled; // false = normalization off, default;
71 // true = on
72 enum enum_case_first case_first;
73 };
74
/*
  NOTE: If you change MY_UCA_MAX_CONTRACTION, be sure to update the comment
  on MY_UCA_CNT_MID1 as well, since a larger value might cause us to run out
  of bits in the per-character byte flag. MY_UCA_CNT_MID1 is defined in this
  header, next to the other contraction flag bits; this note historically
  pointed to strings/uca_data.h.
*/
#define MY_UCA_MAX_CONTRACTION 6

/* Number of uint16 slots in MY_CONTRACTION::weight. */
#define MY_UCA_MAX_WEIGHT_SIZE 25

#define MY_UCA_WEIGHT_LEVELS 1
83
84 /*
85 We store all the contractions in a trie, indexed on the codepoints they
86 consist of. The trie is organized as:
87 1. Each node stores one code point (ch) of contraction, and a list of nodes
88 (child_nodes) store all possible following code points.
89 2. The vector in MY_UCA_INFO stores a list of nodes which store the first
90 code points of all contractions.
91 3. Each node has a boolean value (is_contraction_tail) which shows
92 whether the code point stored in the node is the end of a contraction.
93 This is necessary because even if one code point is the end of a
94 contraction, there might be longer contraction contains all the
95 code points in the path (e.g., for Hungarian, both 'DZ' and 'DZS' are
96 contractions).
97 4. A contraction is formed by all the code points in the path until the
98 end of the contraction.
99 5. If it is the end of a contraction (is_contraction_tail == true), the
100 weight of this contraction is stored in array weight.
101 6. If it is the end of a contraction (is_contraction_tail == true),
102 with_context shows whether it is common contraction (with_context ==
103 false), or previous context contraction (with_context == true).
104 7. If it is the end of a contraction (is_contraction_tail == true),
105 contraction_len shows how many code points this contraction consists of.
106 */
107 struct MY_CONTRACTION {
108 my_wc_t ch;
109 // Lists of following nodes.
110 std::vector<MY_CONTRACTION> child_nodes;
111 std::vector<MY_CONTRACTION> child_nodes_context;
112
113 // weight and with_context are only useful when is_contraction_tail is true.
114 uint16 weight[MY_UCA_MAX_WEIGHT_SIZE]; /* Its weight string, 0-terminated */
115 bool is_contraction_tail;
116 size_t contraction_len;
117 };
118
119 struct MY_UCA_INFO {
120 enum enum_uca_ver version;
121
122 // Collation weights.
123 my_wc_t maxchar;
124 uchar *lengths;
125 uint16 **weights;
126 bool have_contractions;
127 std::vector<MY_CONTRACTION> *contraction_nodes;
128 /*
129 contraction_flags is only used when a collation has contraction rule.
130 UCA collation supports at least 65535 characters, but only a few of
131 them can be part of contraction, it is huge waste of time to find out
132 whether one character is in contraction list for every character.
133 contraction_flags points to memory which is allocated when a collation
134 has contraction rule. For a character in contraction, its corresponding
135 byte (contraction_flags[ch & 0x1000]) will be set to a certain value
136 according to the position (head, tail or middle) of this character in
137 contraction. This byte will be used to quick check whether one character
138 can be part of contraction.
139 */
140 char *contraction_flags;
141
142 /* Logical positions */
143 my_wc_t first_non_ignorable;
144 my_wc_t last_non_ignorable;
145 my_wc_t first_primary_ignorable;
146 my_wc_t last_primary_ignorable;
147 my_wc_t first_secondary_ignorable;
148 my_wc_t last_secondary_ignorable;
149 my_wc_t first_tertiary_ignorable;
150 my_wc_t last_tertiary_ignorable;
151 my_wc_t first_trailing;
152 my_wc_t last_trailing;
153 my_wc_t first_variable;
154 my_wc_t last_variable;
155 /*
156 extra_ce_pri_base, extra_ce_sec_base and extra_ce_ter_base are only used for
157 the UCA collations whose UCA version is not smaller than UCA_V900. For why
158 we need this extra CE, please see the comment in my_char_weight_put_900()
159 and apply_primary_shift_900().
160
161 The value of these three variables is set by the definition of my_uca_v900.
162 The value of extra_ce_pri_base is usually 0x54A4 (which is the maximum
163 regular weight value pluses one, 0x54A3 + 1 = 0x54A4). But for the Chinese
164 collation, the extra_ce_pri_base needs to change. This is because 0x54A4 has
165 been occupied to do reordering. There might be weight conflict if we still
166 use 0x54A4. Please also see the comment on modify_all_zh_pages().
167 */
168 uint16 extra_ce_pri_base; // Primary weight of extra CE
169 uint16 extra_ce_sec_base; // Secondary weight of extra CE
170 uint16 extra_ce_ter_base; // Tertiary weight of extra CE
171 };
172
/*
  MY_UCA_CNT_FLAG_MASK (MY_UCA_CNT_FLAG_SIZE - 1) reduces a code point to an
  index into the contraction flag data, so every index is below
  MY_UCA_CNT_FLAG_SIZE.
*/
#define MY_UCA_CNT_FLAG_SIZE 4096
#define MY_UCA_CNT_FLAG_MASK 4095
175
/** Flag bit: the given character can be the first in any contraction. */
#define MY_UCA_CNT_HEAD 1

/** Flag bit: the given character can be the last in any contraction. */
#define MY_UCA_CNT_TAIL 2

/**
  Flag bit: the given character can be the second in any contraction.

  The flags for the following middle positions are not spelled out; they are
  defined implicitly through shifting MY_UCA_CNT_MID1:

    \#define MY_UCA_CNT_MID2 8
    \#define MY_UCA_CNT_MID3 16
    \#define MY_UCA_CNT_MID4 32

  There's no need for MY_UCA_CNT_MID5 (which would cause us to run out of
  bits) since MY_UCA_MAX_CONTRACTION is 6 (so head, four in the middle,
  and then tail).
*/
#define MY_UCA_CNT_MID1 4
196
/**
  Flag bit: the given character is the first part of a context-sensitive
  contraction.

  Context-sensitive contractions are like normal contractions, except that
  for performance reasons, they trigger on the _last_ character instead of
  the first. The case given in Unicode TR35 is that in some scripts (such as
  katakana in Japanese), "a-" should sort as "aa" (except on the tertiary
  level), "e-" should sort as "ee" and so on. Adding regular contractions on
  "a" and "e" would cause undue performance loss, so instead we add a
  special "context-sensitive" contraction on "-" that then looks at the
  _previous_ character.

  Context-sensitive contractions longer than two characters are not
  supported at the moment, since none exist in CLDR. Thus, there is no
  MY_UCA_PREVIOUS_CONTEXT_MID1 and so on.
*/
#define MY_UCA_PREVIOUS_CONTEXT_HEAD 64

/** Similar to MY_UCA_PREVIOUS_CONTEXT_HEAD, just for the tail. */
#define MY_UCA_PREVIOUS_CONTEXT_TAIL 128

/*
  NOTE(review): not documented in this header; presumably the shift that
  splits a code point into a page number and an offset within the page when
  indexing the weight tables. Confirm against the users of this macro.
*/
#define MY_UCA_PSHIFT 8
218
219 /**
220 Check if a code point can be contraction head
221
222 @param flags Pointer to UCA contraction flag data
223 @param wc Code point
224
225 @retval 0 - cannot be contraction head
226 @retval 1 - can be contraction head
227 */
228
my_uca_can_be_contraction_head(const char * flags,my_wc_t wc)229 inline bool my_uca_can_be_contraction_head(const char *flags, my_wc_t wc) {
230 return flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_HEAD;
231 }
232
233 /**
234 Check if a code point can be contraction tail
235
236 @param flags Pointer to UCA contraction flag data
237 @param wc Code point
238
239 @retval 0 - cannot be contraction tail
240 @retval 1 - can be contraction tail
241 */
242
my_uca_can_be_contraction_tail(const char * flags,my_wc_t wc)243 inline bool my_uca_can_be_contraction_tail(const char *flags, my_wc_t wc) {
244 return flags[wc & MY_UCA_CNT_FLAG_MASK] & MY_UCA_CNT_TAIL;
245 }
246
247 const uint16 *my_uca_contraction2_weight(
248 const std::vector<MY_CONTRACTION> *cont_nodes, my_wc_t wc1, my_wc_t wc2);
249 #endif
250