xref: /reactos/sdk/tools/unicode/sortkey.c (revision 1a472d94)
1c2c66affSColin Finck /*
2c2c66affSColin Finck  * Unicode sort key generation
3c2c66affSColin Finck  *
4c2c66affSColin Finck  * Copyright 2003 Dmitry Timoshkov
5c2c66affSColin Finck  *
6c2c66affSColin Finck  * This library is free software; you can redistribute it and/or
7c2c66affSColin Finck  * modify it under the terms of the GNU Lesser General Public
8c2c66affSColin Finck  * License as published by the Free Software Foundation; either
9c2c66affSColin Finck  * version 2.1 of the License, or (at your option) any later version.
10c2c66affSColin Finck  *
11c2c66affSColin Finck  * This library is distributed in the hope that it will be useful,
12c2c66affSColin Finck  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13c2c66affSColin Finck  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14c2c66affSColin Finck  * Lesser General Public License for more details.
15c2c66affSColin Finck  *
16c2c66affSColin Finck  * You should have received a copy of the GNU Lesser General Public
17c2c66affSColin Finck  * License along with this library; if not, write to the Free Software
18c2c66affSColin Finck  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19c2c66affSColin Finck  */
20c2c66affSColin Finck #include "wine/unicode.h"
21c2c66affSColin Finck 
22*1a472d94SAmine Khaldi extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen );
23c2c66affSColin Finck extern const unsigned int collation_table[];
24c2c66affSColin Finck 
25c2c66affSColin Finck /*
26c2c66affSColin Finck  * flags - normalization NORM_* flags
27c2c66affSColin Finck  *
28c2c66affSColin Finck  * FIXME: 'variable' flag not handled
29c2c66affSColin Finck  */
wine_get_sortkey(int flags,const WCHAR * src,int srclen,char * dst,int dstlen)30c2c66affSColin Finck int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
31c2c66affSColin Finck {
32c2c66affSColin Finck     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
33c2c66affSColin Finck     int key_len[4];
34c2c66affSColin Finck     char *key_ptr[4];
35c2c66affSColin Finck     const WCHAR *src_save = src;
36c2c66affSColin Finck     int srclen_save = srclen;
37c2c66affSColin Finck 
38c2c66affSColin Finck     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
39c2c66affSColin Finck     for (; srclen; srclen--, src++)
40c2c66affSColin Finck     {
41c2c66affSColin Finck         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
42c2c66affSColin Finck         dummy[0] = *src;
43c2c66affSColin Finck         if (decomposed_len)
44c2c66affSColin Finck         {
45c2c66affSColin Finck             for (i = 0; i < decomposed_len; i++)
46c2c66affSColin Finck             {
47c2c66affSColin Finck                 WCHAR wch = dummy[i];
48c2c66affSColin Finck                 unsigned int ce;
49c2c66affSColin Finck 
50c2c66affSColin Finck                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51c2c66affSColin Finck                  * and skips white space and punctuation characters for
52c2c66affSColin Finck                  * NORM_IGNORESYMBOLS.
53c2c66affSColin Finck                  */
54c2c66affSColin Finck                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
55c2c66affSColin Finck                     continue;
56c2c66affSColin Finck 
57c2c66affSColin Finck                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
58c2c66affSColin Finck 
59c2c66affSColin Finck                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
60c2c66affSColin Finck                 if (ce != (unsigned int)-1)
61c2c66affSColin Finck                 {
62c2c66affSColin Finck                     if (ce >> 16) key_len[0] += 2;
63c2c66affSColin Finck                     if ((ce >> 8) & 0xff) key_len[1]++;
64c2c66affSColin Finck                     if ((ce >> 4) & 0x0f) key_len[2]++;
65c2c66affSColin Finck                     if (ce & 1)
66c2c66affSColin Finck                     {
67c2c66affSColin Finck                         if (wch >> 8) key_len[3]++;
68c2c66affSColin Finck                         key_len[3]++;
69c2c66affSColin Finck                     }
70c2c66affSColin Finck                 }
71c2c66affSColin Finck                 else
72c2c66affSColin Finck                 {
73c2c66affSColin Finck                     key_len[0] += 2;
74c2c66affSColin Finck                     if (wch >> 8) key_len[0]++;
75c2c66affSColin Finck                     if (wch & 0xff) key_len[0]++;
76c2c66affSColin Finck 		}
77c2c66affSColin Finck             }
78c2c66affSColin Finck         }
79c2c66affSColin Finck     }
80c2c66affSColin Finck 
81c2c66affSColin Finck     if (!dstlen) /* compute length */
82*1a472d94SAmine Khaldi         /* 4 * '\1' + key length */
83*1a472d94SAmine Khaldi         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
84c2c66affSColin Finck 
85c2c66affSColin Finck     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
86c2c66affSColin Finck         return 0; /* overflow */
87c2c66affSColin Finck 
88c2c66affSColin Finck     src = src_save;
89c2c66affSColin Finck     srclen = srclen_save;
90c2c66affSColin Finck 
91c2c66affSColin Finck     key_ptr[0] = dst;
92c2c66affSColin Finck     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
93c2c66affSColin Finck     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
94c2c66affSColin Finck     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
95c2c66affSColin Finck 
96c2c66affSColin Finck     for (; srclen; srclen--, src++)
97c2c66affSColin Finck     {
98c2c66affSColin Finck         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
99c2c66affSColin Finck         dummy[0] = *src;
100c2c66affSColin Finck         if (decomposed_len)
101c2c66affSColin Finck         {
102c2c66affSColin Finck             for (i = 0; i < decomposed_len; i++)
103c2c66affSColin Finck             {
104c2c66affSColin Finck                 WCHAR wch = dummy[i];
105c2c66affSColin Finck                 unsigned int ce;
106c2c66affSColin Finck 
107c2c66affSColin Finck                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108c2c66affSColin Finck                  * and skips white space and punctuation characters for
109c2c66affSColin Finck                  * NORM_IGNORESYMBOLS.
110c2c66affSColin Finck                  */
111c2c66affSColin Finck                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
112c2c66affSColin Finck                     continue;
113c2c66affSColin Finck 
114c2c66affSColin Finck                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
115c2c66affSColin Finck 
116c2c66affSColin Finck                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
117c2c66affSColin Finck                 if (ce != (unsigned int)-1)
118c2c66affSColin Finck                 {
119c2c66affSColin Finck                     WCHAR key;
120c2c66affSColin Finck                     if ((key = ce >> 16))
121c2c66affSColin Finck                     {
122c2c66affSColin Finck                         *key_ptr[0]++ = key >> 8;
123c2c66affSColin Finck                         *key_ptr[0]++ = key & 0xff;
124c2c66affSColin Finck                     }
125c2c66affSColin Finck                     /* make key 1 start from 2 */
126c2c66affSColin Finck                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
127c2c66affSColin Finck                     /* make key 2 start from 2 */
128c2c66affSColin Finck                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
129c2c66affSColin Finck                     /* key 3 is always a character code */
130c2c66affSColin Finck                     if (ce & 1)
131c2c66affSColin Finck                     {
132c2c66affSColin Finck                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
133c2c66affSColin Finck                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
134c2c66affSColin Finck                     }
135c2c66affSColin Finck                 }
136c2c66affSColin Finck                 else
137c2c66affSColin Finck                 {
138c2c66affSColin Finck                     *key_ptr[0]++ = 0xff;
139c2c66affSColin Finck                     *key_ptr[0]++ = 0xfe;
140c2c66affSColin Finck                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
141c2c66affSColin Finck                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
142c2c66affSColin Finck                 }
143c2c66affSColin Finck             }
144c2c66affSColin Finck         }
145c2c66affSColin Finck     }
146c2c66affSColin Finck 
147c2c66affSColin Finck     *key_ptr[0] = '\1';
148c2c66affSColin Finck     *key_ptr[1] = '\1';
149c2c66affSColin Finck     *key_ptr[2] = '\1';
150c2c66affSColin Finck     *key_ptr[3]++ = '\1';
151c2c66affSColin Finck     *key_ptr[3] = 0;
152c2c66affSColin Finck 
153c2c66affSColin Finck     return key_ptr[3] - dst;
154c2c66affSColin Finck }
155c2c66affSColin Finck 
156*1a472d94SAmine Khaldi enum weight
157c2c66affSColin Finck {
158*1a472d94SAmine Khaldi     UNICODE_WEIGHT,
159*1a472d94SAmine Khaldi     DIACRITIC_WEIGHT,
160*1a472d94SAmine Khaldi     CASE_WEIGHT
161*1a472d94SAmine Khaldi };
162*1a472d94SAmine Khaldi 
get_weight(WCHAR ch,enum weight type)163*1a472d94SAmine Khaldi static unsigned int get_weight(WCHAR ch, enum weight type)
164*1a472d94SAmine Khaldi {
165*1a472d94SAmine Khaldi     unsigned int ret;
166*1a472d94SAmine Khaldi 
167*1a472d94SAmine Khaldi     ret = collation_table[collation_table[ch >> 8] + (ch & 0xff)];
168*1a472d94SAmine Khaldi     if (ret == (unsigned int)-1)
169*1a472d94SAmine Khaldi         return ch;
170*1a472d94SAmine Khaldi 
171*1a472d94SAmine Khaldi     switch(type)
172*1a472d94SAmine Khaldi     {
173*1a472d94SAmine Khaldi     case UNICODE_WEIGHT:
174*1a472d94SAmine Khaldi         return ret >> 16;
175*1a472d94SAmine Khaldi     case DIACRITIC_WEIGHT:
176*1a472d94SAmine Khaldi         return (ret >> 8) & 0xff;
177*1a472d94SAmine Khaldi     case CASE_WEIGHT:
178*1a472d94SAmine Khaldi     default:
179*1a472d94SAmine Khaldi         return (ret >> 4) & 0x0f;
180*1a472d94SAmine Khaldi     }
181*1a472d94SAmine Khaldi }
182*1a472d94SAmine Khaldi 
inc_str_pos(const WCHAR ** str,int * len,int * dpos,int * dlen)183*1a472d94SAmine Khaldi static void inc_str_pos(const WCHAR **str, int *len, int *dpos, int *dlen)
184*1a472d94SAmine Khaldi {
185*1a472d94SAmine Khaldi     (*dpos)++;
186*1a472d94SAmine Khaldi     if (*dpos == *dlen)
187*1a472d94SAmine Khaldi     {
188*1a472d94SAmine Khaldi         *dpos = *dlen = 0;
189*1a472d94SAmine Khaldi         (*str)++;
190*1a472d94SAmine Khaldi         (*len)--;
191*1a472d94SAmine Khaldi     }
192*1a472d94SAmine Khaldi }
193*1a472d94SAmine Khaldi 
compare_weights(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2,enum weight type)194*1a472d94SAmine Khaldi static inline int compare_weights(int flags, const WCHAR *str1, int len1,
195*1a472d94SAmine Khaldi                                   const WCHAR *str2, int len2, enum weight type)
196*1a472d94SAmine Khaldi {
197*1a472d94SAmine Khaldi     int dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0;
198*1a472d94SAmine Khaldi     WCHAR dstr1[4], dstr2[4];
199c2c66affSColin Finck     unsigned int ce1, ce2;
200c2c66affSColin Finck 
201c2c66affSColin Finck     /* 32-bit collation element table format:
202c2c66affSColin Finck      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
203c2c66affSColin Finck      * case weight - high 4 bit of low 8 bit.
204c2c66affSColin Finck      */
205c2c66affSColin Finck     while (len1 > 0 && len2 > 0)
206c2c66affSColin Finck     {
207*1a472d94SAmine Khaldi         if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
208*1a472d94SAmine Khaldi         if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
209*1a472d94SAmine Khaldi 
210c2c66affSColin Finck         if (flags & NORM_IGNORESYMBOLS)
211c2c66affSColin Finck         {
212c2c66affSColin Finck             int skip = 0;
213c2c66affSColin Finck             /* FIXME: not tested */
214*1a472d94SAmine Khaldi             if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE))
215c2c66affSColin Finck             {
216*1a472d94SAmine Khaldi                 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
217c2c66affSColin Finck                 skip = 1;
218c2c66affSColin Finck             }
219*1a472d94SAmine Khaldi             if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE))
220c2c66affSColin Finck             {
221*1a472d94SAmine Khaldi                 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
222c2c66affSColin Finck                 skip = 1;
223c2c66affSColin Finck             }
224c2c66affSColin Finck             if (skip) continue;
225c2c66affSColin Finck         }
226c2c66affSColin Finck 
227c2c66affSColin Finck        /* hyphen and apostrophe are treated differently depending on
228c2c66affSColin Finck         * whether SORT_STRINGSORT specified or not
229c2c66affSColin Finck         */
230*1a472d94SAmine Khaldi         if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT))
231c2c66affSColin Finck         {
232*1a472d94SAmine Khaldi             if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'')
233c2c66affSColin Finck             {
234*1a472d94SAmine Khaldi                 if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'')
235c2c66affSColin Finck                 {
236*1a472d94SAmine Khaldi                     inc_str_pos(&str1, &len1, &dpos1, &dlen1);
237c2c66affSColin Finck                     continue;
238c2c66affSColin Finck                 }
239c2c66affSColin Finck             }
240*1a472d94SAmine Khaldi             else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'')
241c2c66affSColin Finck             {
242*1a472d94SAmine Khaldi                 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
243c2c66affSColin Finck                 continue;
244c2c66affSColin Finck             }
245c2c66affSColin Finck         }
246c2c66affSColin Finck 
247*1a472d94SAmine Khaldi         ce1 = get_weight(dstr1[dpos1], type);
248*1a472d94SAmine Khaldi         if (!ce1)
249c2c66affSColin Finck         {
250*1a472d94SAmine Khaldi             inc_str_pos(&str1, &len1, &dpos1, &dlen1);
251*1a472d94SAmine Khaldi             continue;
252c2c66affSColin Finck         }
253*1a472d94SAmine Khaldi         ce2 = get_weight(dstr2[dpos2], type);
254*1a472d94SAmine Khaldi         if (!ce2)
255c2c66affSColin Finck         {
256*1a472d94SAmine Khaldi             inc_str_pos(&str2, &len2, &dpos2, &dlen2);
257*1a472d94SAmine Khaldi             continue;
258c2c66affSColin Finck         }
259c2c66affSColin Finck 
260*1a472d94SAmine Khaldi         if (ce1 - ce2) return ce1 - ce2;
261c2c66affSColin Finck 
262*1a472d94SAmine Khaldi         inc_str_pos(&str1, &len1, &dpos1, &dlen1);
263*1a472d94SAmine Khaldi         inc_str_pos(&str2, &len2, &dpos2, &dlen2);
264c2c66affSColin Finck     }
265*1a472d94SAmine Khaldi     while (len1)
266c2c66affSColin Finck     {
267*1a472d94SAmine Khaldi         if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
268c2c66affSColin Finck 
269*1a472d94SAmine Khaldi         ce1 = get_weight(dstr1[dpos1], type);
270*1a472d94SAmine Khaldi         if (ce1) break;
271*1a472d94SAmine Khaldi         inc_str_pos(&str1, &len1, &dpos1, &dlen1);
272c2c66affSColin Finck     }
273*1a472d94SAmine Khaldi     while (len2)
274c2c66affSColin Finck     {
275*1a472d94SAmine Khaldi         if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
276c2c66affSColin Finck 
277*1a472d94SAmine Khaldi         ce2 = get_weight(dstr2[dpos2], type);
278*1a472d94SAmine Khaldi         if (ce2) break;
279*1a472d94SAmine Khaldi         inc_str_pos(&str2, &len2, &dpos2, &dlen2);
280c2c66affSColin Finck     }
281c2c66affSColin Finck     return len1 - len2;
282c2c66affSColin Finck }
283c2c66affSColin Finck 
wine_compare_string(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2)284c2c66affSColin Finck int wine_compare_string(int flags, const WCHAR *str1, int len1,
285c2c66affSColin Finck                         const WCHAR *str2, int len2)
286c2c66affSColin Finck {
287c2c66affSColin Finck     int ret;
288c2c66affSColin Finck 
289*1a472d94SAmine Khaldi     ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT);
290c2c66affSColin Finck     if (!ret)
291c2c66affSColin Finck     {
292c2c66affSColin Finck         if (!(flags & NORM_IGNORENONSPACE))
293*1a472d94SAmine Khaldi             ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT);
294c2c66affSColin Finck         if (!ret && !(flags & NORM_IGNORECASE))
295*1a472d94SAmine Khaldi             ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT);
296c2c66affSColin Finck     }
297c2c66affSColin Finck     return ret;
298c2c66affSColin Finck }
299