1 /*
2 * Unicode sort key generation
3 *
4 * Copyright 2003 Dmitry Timoshkov
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19 */
20 #include "wine/unicode.h"
21
22 extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen );
23 extern const unsigned int collation_table[];
24
25 /*
26 * flags - normalization NORM_* flags
27 *
28 * FIXME: 'variable' flag not handled
29 */
wine_get_sortkey(int flags,const WCHAR * src,int srclen,char * dst,int dstlen)30 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
31 {
32 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
33 int key_len[4];
34 char *key_ptr[4];
35 const WCHAR *src_save = src;
36 int srclen_save = srclen;
37
38 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
39 for (; srclen; srclen--, src++)
40 {
41 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
42 dummy[0] = *src;
43 if (decomposed_len)
44 {
45 for (i = 0; i < decomposed_len; i++)
46 {
47 WCHAR wch = dummy[i];
48 unsigned int ce;
49
50 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51 * and skips white space and punctuation characters for
52 * NORM_IGNORESYMBOLS.
53 */
54 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
55 continue;
56
57 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
58
59 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
60 if (ce != (unsigned int)-1)
61 {
62 if (ce >> 16) key_len[0] += 2;
63 if ((ce >> 8) & 0xff) key_len[1]++;
64 if ((ce >> 4) & 0x0f) key_len[2]++;
65 if (ce & 1)
66 {
67 if (wch >> 8) key_len[3]++;
68 key_len[3]++;
69 }
70 }
71 else
72 {
73 key_len[0] += 2;
74 if (wch >> 8) key_len[0]++;
75 if (wch & 0xff) key_len[0]++;
76 }
77 }
78 }
79 }
80
81 if (!dstlen) /* compute length */
82 /* 4 * '\1' + key length */
83 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
84
85 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
86 return 0; /* overflow */
87
88 src = src_save;
89 srclen = srclen_save;
90
91 key_ptr[0] = dst;
92 key_ptr[1] = key_ptr[0] + key_len[0] + 1;
93 key_ptr[2] = key_ptr[1] + key_len[1] + 1;
94 key_ptr[3] = key_ptr[2] + key_len[2] + 1;
95
96 for (; srclen; srclen--, src++)
97 {
98 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
99 dummy[0] = *src;
100 if (decomposed_len)
101 {
102 for (i = 0; i < decomposed_len; i++)
103 {
104 WCHAR wch = dummy[i];
105 unsigned int ce;
106
107 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108 * and skips white space and punctuation characters for
109 * NORM_IGNORESYMBOLS.
110 */
111 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
112 continue;
113
114 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
115
116 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
117 if (ce != (unsigned int)-1)
118 {
119 WCHAR key;
120 if ((key = ce >> 16))
121 {
122 *key_ptr[0]++ = key >> 8;
123 *key_ptr[0]++ = key & 0xff;
124 }
125 /* make key 1 start from 2 */
126 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
127 /* make key 2 start from 2 */
128 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
129 /* key 3 is always a character code */
130 if (ce & 1)
131 {
132 if (wch >> 8) *key_ptr[3]++ = wch >> 8;
133 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
134 }
135 }
136 else
137 {
138 *key_ptr[0]++ = 0xff;
139 *key_ptr[0]++ = 0xfe;
140 if (wch >> 8) *key_ptr[0]++ = wch >> 8;
141 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
142 }
143 }
144 }
145 }
146
147 *key_ptr[0] = '\1';
148 *key_ptr[1] = '\1';
149 *key_ptr[2] = '\1';
150 *key_ptr[3]++ = '\1';
151 *key_ptr[3] = 0;
152
153 return key_ptr[3] - dst;
154 }
155
156 enum weight
157 {
158 UNICODE_WEIGHT,
159 DIACRITIC_WEIGHT,
160 CASE_WEIGHT
161 };
162
get_weight(WCHAR ch,enum weight type)163 static unsigned int get_weight(WCHAR ch, enum weight type)
164 {
165 unsigned int ret;
166
167 ret = collation_table[collation_table[ch >> 8] + (ch & 0xff)];
168 if (ret == (unsigned int)-1)
169 return ch;
170
171 switch(type)
172 {
173 case UNICODE_WEIGHT:
174 return ret >> 16;
175 case DIACRITIC_WEIGHT:
176 return (ret >> 8) & 0xff;
177 case CASE_WEIGHT:
178 default:
179 return (ret >> 4) & 0x0f;
180 }
181 }
182
inc_str_pos(const WCHAR ** str,int * len,int * dpos,int * dlen)183 static void inc_str_pos(const WCHAR **str, int *len, int *dpos, int *dlen)
184 {
185 (*dpos)++;
186 if (*dpos == *dlen)
187 {
188 *dpos = *dlen = 0;
189 (*str)++;
190 (*len)--;
191 }
192 }
193
compare_weights(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2,enum weight type)194 static inline int compare_weights(int flags, const WCHAR *str1, int len1,
195 const WCHAR *str2, int len2, enum weight type)
196 {
197 int dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0;
198 WCHAR dstr1[4], dstr2[4];
199 unsigned int ce1, ce2;
200
201 /* 32-bit collation element table format:
202 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
203 * case weight - high 4 bit of low 8 bit.
204 */
205 while (len1 > 0 && len2 > 0)
206 {
207 if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
208 if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
209
210 if (flags & NORM_IGNORESYMBOLS)
211 {
212 int skip = 0;
213 /* FIXME: not tested */
214 if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE))
215 {
216 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
217 skip = 1;
218 }
219 if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE))
220 {
221 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
222 skip = 1;
223 }
224 if (skip) continue;
225 }
226
227 /* hyphen and apostrophe are treated differently depending on
228 * whether SORT_STRINGSORT specified or not
229 */
230 if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT))
231 {
232 if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'')
233 {
234 if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'')
235 {
236 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
237 continue;
238 }
239 }
240 else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'')
241 {
242 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
243 continue;
244 }
245 }
246
247 ce1 = get_weight(dstr1[dpos1], type);
248 if (!ce1)
249 {
250 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
251 continue;
252 }
253 ce2 = get_weight(dstr2[dpos2], type);
254 if (!ce2)
255 {
256 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
257 continue;
258 }
259
260 if (ce1 - ce2) return ce1 - ce2;
261
262 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
263 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
264 }
265 while (len1)
266 {
267 if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
268
269 ce1 = get_weight(dstr1[dpos1], type);
270 if (ce1) break;
271 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
272 }
273 while (len2)
274 {
275 if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
276
277 ce2 = get_weight(dstr2[dpos2], type);
278 if (ce2) break;
279 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
280 }
281 return len1 - len2;
282 }
283
wine_compare_string(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2)284 int wine_compare_string(int flags, const WCHAR *str1, int len1,
285 const WCHAR *str2, int len2)
286 {
287 int ret;
288
289 ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT);
290 if (!ret)
291 {
292 if (!(flags & NORM_IGNORENONSPACE))
293 ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT);
294 if (!ret && !(flags & NORM_IGNORECASE))
295 ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT);
296 }
297 return ret;
298 }
299