1c2c66affSColin Finck /*
2c2c66affSColin Finck * Unicode sort key generation
3c2c66affSColin Finck *
4c2c66affSColin Finck * Copyright 2003 Dmitry Timoshkov
5c2c66affSColin Finck *
6c2c66affSColin Finck * This library is free software; you can redistribute it and/or
7c2c66affSColin Finck * modify it under the terms of the GNU Lesser General Public
8c2c66affSColin Finck * License as published by the Free Software Foundation; either
9c2c66affSColin Finck * version 2.1 of the License, or (at your option) any later version.
10c2c66affSColin Finck *
11c2c66affSColin Finck * This library is distributed in the hope that it will be useful,
12c2c66affSColin Finck * but WITHOUT ANY WARRANTY; without even the implied warranty of
13c2c66affSColin Finck * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14c2c66affSColin Finck * Lesser General Public License for more details.
15c2c66affSColin Finck *
16c2c66affSColin Finck * You should have received a copy of the GNU Lesser General Public
17c2c66affSColin Finck * License along with this library; if not, write to the Free Software
18c2c66affSColin Finck * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19c2c66affSColin Finck */
20c2c66affSColin Finck #include "wine/unicode.h"
21c2c66affSColin Finck
22*1a472d94SAmine Khaldi extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen );
23c2c66affSColin Finck extern const unsigned int collation_table[];
24c2c66affSColin Finck
25c2c66affSColin Finck /*
26c2c66affSColin Finck * flags - normalization NORM_* flags
27c2c66affSColin Finck *
28c2c66affSColin Finck * FIXME: 'variable' flag not handled
29c2c66affSColin Finck */
wine_get_sortkey(int flags,const WCHAR * src,int srclen,char * dst,int dstlen)30c2c66affSColin Finck int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
31c2c66affSColin Finck {
32c2c66affSColin Finck WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
33c2c66affSColin Finck int key_len[4];
34c2c66affSColin Finck char *key_ptr[4];
35c2c66affSColin Finck const WCHAR *src_save = src;
36c2c66affSColin Finck int srclen_save = srclen;
37c2c66affSColin Finck
38c2c66affSColin Finck key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
39c2c66affSColin Finck for (; srclen; srclen--, src++)
40c2c66affSColin Finck {
41c2c66affSColin Finck unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
42c2c66affSColin Finck dummy[0] = *src;
43c2c66affSColin Finck if (decomposed_len)
44c2c66affSColin Finck {
45c2c66affSColin Finck for (i = 0; i < decomposed_len; i++)
46c2c66affSColin Finck {
47c2c66affSColin Finck WCHAR wch = dummy[i];
48c2c66affSColin Finck unsigned int ce;
49c2c66affSColin Finck
50c2c66affSColin Finck /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51c2c66affSColin Finck * and skips white space and punctuation characters for
52c2c66affSColin Finck * NORM_IGNORESYMBOLS.
53c2c66affSColin Finck */
54c2c66affSColin Finck if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
55c2c66affSColin Finck continue;
56c2c66affSColin Finck
57c2c66affSColin Finck if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
58c2c66affSColin Finck
59c2c66affSColin Finck ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
60c2c66affSColin Finck if (ce != (unsigned int)-1)
61c2c66affSColin Finck {
62c2c66affSColin Finck if (ce >> 16) key_len[0] += 2;
63c2c66affSColin Finck if ((ce >> 8) & 0xff) key_len[1]++;
64c2c66affSColin Finck if ((ce >> 4) & 0x0f) key_len[2]++;
65c2c66affSColin Finck if (ce & 1)
66c2c66affSColin Finck {
67c2c66affSColin Finck if (wch >> 8) key_len[3]++;
68c2c66affSColin Finck key_len[3]++;
69c2c66affSColin Finck }
70c2c66affSColin Finck }
71c2c66affSColin Finck else
72c2c66affSColin Finck {
73c2c66affSColin Finck key_len[0] += 2;
74c2c66affSColin Finck if (wch >> 8) key_len[0]++;
75c2c66affSColin Finck if (wch & 0xff) key_len[0]++;
76c2c66affSColin Finck }
77c2c66affSColin Finck }
78c2c66affSColin Finck }
79c2c66affSColin Finck }
80c2c66affSColin Finck
81c2c66affSColin Finck if (!dstlen) /* compute length */
82*1a472d94SAmine Khaldi /* 4 * '\1' + key length */
83*1a472d94SAmine Khaldi return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
84c2c66affSColin Finck
85c2c66affSColin Finck if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
86c2c66affSColin Finck return 0; /* overflow */
87c2c66affSColin Finck
88c2c66affSColin Finck src = src_save;
89c2c66affSColin Finck srclen = srclen_save;
90c2c66affSColin Finck
91c2c66affSColin Finck key_ptr[0] = dst;
92c2c66affSColin Finck key_ptr[1] = key_ptr[0] + key_len[0] + 1;
93c2c66affSColin Finck key_ptr[2] = key_ptr[1] + key_len[1] + 1;
94c2c66affSColin Finck key_ptr[3] = key_ptr[2] + key_len[2] + 1;
95c2c66affSColin Finck
96c2c66affSColin Finck for (; srclen; srclen--, src++)
97c2c66affSColin Finck {
98c2c66affSColin Finck unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
99c2c66affSColin Finck dummy[0] = *src;
100c2c66affSColin Finck if (decomposed_len)
101c2c66affSColin Finck {
102c2c66affSColin Finck for (i = 0; i < decomposed_len; i++)
103c2c66affSColin Finck {
104c2c66affSColin Finck WCHAR wch = dummy[i];
105c2c66affSColin Finck unsigned int ce;
106c2c66affSColin Finck
107c2c66affSColin Finck /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108c2c66affSColin Finck * and skips white space and punctuation characters for
109c2c66affSColin Finck * NORM_IGNORESYMBOLS.
110c2c66affSColin Finck */
111c2c66affSColin Finck if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
112c2c66affSColin Finck continue;
113c2c66affSColin Finck
114c2c66affSColin Finck if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
115c2c66affSColin Finck
116c2c66affSColin Finck ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
117c2c66affSColin Finck if (ce != (unsigned int)-1)
118c2c66affSColin Finck {
119c2c66affSColin Finck WCHAR key;
120c2c66affSColin Finck if ((key = ce >> 16))
121c2c66affSColin Finck {
122c2c66affSColin Finck *key_ptr[0]++ = key >> 8;
123c2c66affSColin Finck *key_ptr[0]++ = key & 0xff;
124c2c66affSColin Finck }
125c2c66affSColin Finck /* make key 1 start from 2 */
126c2c66affSColin Finck if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
127c2c66affSColin Finck /* make key 2 start from 2 */
128c2c66affSColin Finck if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
129c2c66affSColin Finck /* key 3 is always a character code */
130c2c66affSColin Finck if (ce & 1)
131c2c66affSColin Finck {
132c2c66affSColin Finck if (wch >> 8) *key_ptr[3]++ = wch >> 8;
133c2c66affSColin Finck if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
134c2c66affSColin Finck }
135c2c66affSColin Finck }
136c2c66affSColin Finck else
137c2c66affSColin Finck {
138c2c66affSColin Finck *key_ptr[0]++ = 0xff;
139c2c66affSColin Finck *key_ptr[0]++ = 0xfe;
140c2c66affSColin Finck if (wch >> 8) *key_ptr[0]++ = wch >> 8;
141c2c66affSColin Finck if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
142c2c66affSColin Finck }
143c2c66affSColin Finck }
144c2c66affSColin Finck }
145c2c66affSColin Finck }
146c2c66affSColin Finck
147c2c66affSColin Finck *key_ptr[0] = '\1';
148c2c66affSColin Finck *key_ptr[1] = '\1';
149c2c66affSColin Finck *key_ptr[2] = '\1';
150c2c66affSColin Finck *key_ptr[3]++ = '\1';
151c2c66affSColin Finck *key_ptr[3] = 0;
152c2c66affSColin Finck
153c2c66affSColin Finck return key_ptr[3] - dst;
154c2c66affSColin Finck }
155c2c66affSColin Finck
156*1a472d94SAmine Khaldi enum weight
157c2c66affSColin Finck {
158*1a472d94SAmine Khaldi UNICODE_WEIGHT,
159*1a472d94SAmine Khaldi DIACRITIC_WEIGHT,
160*1a472d94SAmine Khaldi CASE_WEIGHT
161*1a472d94SAmine Khaldi };
162*1a472d94SAmine Khaldi
get_weight(WCHAR ch,enum weight type)163*1a472d94SAmine Khaldi static unsigned int get_weight(WCHAR ch, enum weight type)
164*1a472d94SAmine Khaldi {
165*1a472d94SAmine Khaldi unsigned int ret;
166*1a472d94SAmine Khaldi
167*1a472d94SAmine Khaldi ret = collation_table[collation_table[ch >> 8] + (ch & 0xff)];
168*1a472d94SAmine Khaldi if (ret == (unsigned int)-1)
169*1a472d94SAmine Khaldi return ch;
170*1a472d94SAmine Khaldi
171*1a472d94SAmine Khaldi switch(type)
172*1a472d94SAmine Khaldi {
173*1a472d94SAmine Khaldi case UNICODE_WEIGHT:
174*1a472d94SAmine Khaldi return ret >> 16;
175*1a472d94SAmine Khaldi case DIACRITIC_WEIGHT:
176*1a472d94SAmine Khaldi return (ret >> 8) & 0xff;
177*1a472d94SAmine Khaldi case CASE_WEIGHT:
178*1a472d94SAmine Khaldi default:
179*1a472d94SAmine Khaldi return (ret >> 4) & 0x0f;
180*1a472d94SAmine Khaldi }
181*1a472d94SAmine Khaldi }
182*1a472d94SAmine Khaldi
inc_str_pos(const WCHAR ** str,int * len,int * dpos,int * dlen)183*1a472d94SAmine Khaldi static void inc_str_pos(const WCHAR **str, int *len, int *dpos, int *dlen)
184*1a472d94SAmine Khaldi {
185*1a472d94SAmine Khaldi (*dpos)++;
186*1a472d94SAmine Khaldi if (*dpos == *dlen)
187*1a472d94SAmine Khaldi {
188*1a472d94SAmine Khaldi *dpos = *dlen = 0;
189*1a472d94SAmine Khaldi (*str)++;
190*1a472d94SAmine Khaldi (*len)--;
191*1a472d94SAmine Khaldi }
192*1a472d94SAmine Khaldi }
193*1a472d94SAmine Khaldi
compare_weights(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2,enum weight type)194*1a472d94SAmine Khaldi static inline int compare_weights(int flags, const WCHAR *str1, int len1,
195*1a472d94SAmine Khaldi const WCHAR *str2, int len2, enum weight type)
196*1a472d94SAmine Khaldi {
197*1a472d94SAmine Khaldi int dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0;
198*1a472d94SAmine Khaldi WCHAR dstr1[4], dstr2[4];
199c2c66affSColin Finck unsigned int ce1, ce2;
200c2c66affSColin Finck
201c2c66affSColin Finck /* 32-bit collation element table format:
202c2c66affSColin Finck * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
203c2c66affSColin Finck * case weight - high 4 bit of low 8 bit.
204c2c66affSColin Finck */
205c2c66affSColin Finck while (len1 > 0 && len2 > 0)
206c2c66affSColin Finck {
207*1a472d94SAmine Khaldi if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
208*1a472d94SAmine Khaldi if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
209*1a472d94SAmine Khaldi
210c2c66affSColin Finck if (flags & NORM_IGNORESYMBOLS)
211c2c66affSColin Finck {
212c2c66affSColin Finck int skip = 0;
213c2c66affSColin Finck /* FIXME: not tested */
214*1a472d94SAmine Khaldi if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE))
215c2c66affSColin Finck {
216*1a472d94SAmine Khaldi inc_str_pos(&str1, &len1, &dpos1, &dlen1);
217c2c66affSColin Finck skip = 1;
218c2c66affSColin Finck }
219*1a472d94SAmine Khaldi if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE))
220c2c66affSColin Finck {
221*1a472d94SAmine Khaldi inc_str_pos(&str2, &len2, &dpos2, &dlen2);
222c2c66affSColin Finck skip = 1;
223c2c66affSColin Finck }
224c2c66affSColin Finck if (skip) continue;
225c2c66affSColin Finck }
226c2c66affSColin Finck
227c2c66affSColin Finck /* hyphen and apostrophe are treated differently depending on
228c2c66affSColin Finck * whether SORT_STRINGSORT specified or not
229c2c66affSColin Finck */
230*1a472d94SAmine Khaldi if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT))
231c2c66affSColin Finck {
232*1a472d94SAmine Khaldi if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'')
233c2c66affSColin Finck {
234*1a472d94SAmine Khaldi if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'')
235c2c66affSColin Finck {
236*1a472d94SAmine Khaldi inc_str_pos(&str1, &len1, &dpos1, &dlen1);
237c2c66affSColin Finck continue;
238c2c66affSColin Finck }
239c2c66affSColin Finck }
240*1a472d94SAmine Khaldi else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'')
241c2c66affSColin Finck {
242*1a472d94SAmine Khaldi inc_str_pos(&str2, &len2, &dpos2, &dlen2);
243c2c66affSColin Finck continue;
244c2c66affSColin Finck }
245c2c66affSColin Finck }
246c2c66affSColin Finck
247*1a472d94SAmine Khaldi ce1 = get_weight(dstr1[dpos1], type);
248*1a472d94SAmine Khaldi if (!ce1)
249c2c66affSColin Finck {
250*1a472d94SAmine Khaldi inc_str_pos(&str1, &len1, &dpos1, &dlen1);
251*1a472d94SAmine Khaldi continue;
252c2c66affSColin Finck }
253*1a472d94SAmine Khaldi ce2 = get_weight(dstr2[dpos2], type);
254*1a472d94SAmine Khaldi if (!ce2)
255c2c66affSColin Finck {
256*1a472d94SAmine Khaldi inc_str_pos(&str2, &len2, &dpos2, &dlen2);
257*1a472d94SAmine Khaldi continue;
258c2c66affSColin Finck }
259c2c66affSColin Finck
260*1a472d94SAmine Khaldi if (ce1 - ce2) return ce1 - ce2;
261c2c66affSColin Finck
262*1a472d94SAmine Khaldi inc_str_pos(&str1, &len1, &dpos1, &dlen1);
263*1a472d94SAmine Khaldi inc_str_pos(&str2, &len2, &dpos2, &dlen2);
264c2c66affSColin Finck }
265*1a472d94SAmine Khaldi while (len1)
266c2c66affSColin Finck {
267*1a472d94SAmine Khaldi if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
268c2c66affSColin Finck
269*1a472d94SAmine Khaldi ce1 = get_weight(dstr1[dpos1], type);
270*1a472d94SAmine Khaldi if (ce1) break;
271*1a472d94SAmine Khaldi inc_str_pos(&str1, &len1, &dpos1, &dlen1);
272c2c66affSColin Finck }
273*1a472d94SAmine Khaldi while (len2)
274c2c66affSColin Finck {
275*1a472d94SAmine Khaldi if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
276c2c66affSColin Finck
277*1a472d94SAmine Khaldi ce2 = get_weight(dstr2[dpos2], type);
278*1a472d94SAmine Khaldi if (ce2) break;
279*1a472d94SAmine Khaldi inc_str_pos(&str2, &len2, &dpos2, &dlen2);
280c2c66affSColin Finck }
281c2c66affSColin Finck return len1 - len2;
282c2c66affSColin Finck }
283c2c66affSColin Finck
wine_compare_string(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2)284c2c66affSColin Finck int wine_compare_string(int flags, const WCHAR *str1, int len1,
285c2c66affSColin Finck const WCHAR *str2, int len2)
286c2c66affSColin Finck {
287c2c66affSColin Finck int ret;
288c2c66affSColin Finck
289*1a472d94SAmine Khaldi ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT);
290c2c66affSColin Finck if (!ret)
291c2c66affSColin Finck {
292c2c66affSColin Finck if (!(flags & NORM_IGNORENONSPACE))
293*1a472d94SAmine Khaldi ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT);
294c2c66affSColin Finck if (!ret && !(flags & NORM_IGNORECASE))
295*1a472d94SAmine Khaldi ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT);
296c2c66affSColin Finck }
297c2c66affSColin Finck return ret;
298c2c66affSColin Finck }
299