xref: /reactos/sdk/tools/unicode/sortkey.c (revision 1a472d94)
1 /*
2  * Unicode sort key generation
3  *
4  * Copyright 2003 Dmitry Timoshkov
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 #include "wine/unicode.h"
21 
22 extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen );
23 extern const unsigned int collation_table[];
24 
25 /*
26  * flags - normalization NORM_* flags
27  *
28  * FIXME: 'variable' flag not handled
29  */
wine_get_sortkey(int flags,const WCHAR * src,int srclen,char * dst,int dstlen)30 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
31 {
32     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
33     int key_len[4];
34     char *key_ptr[4];
35     const WCHAR *src_save = src;
36     int srclen_save = srclen;
37 
38     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
39     for (; srclen; srclen--, src++)
40     {
41         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
42         dummy[0] = *src;
43         if (decomposed_len)
44         {
45             for (i = 0; i < decomposed_len; i++)
46             {
47                 WCHAR wch = dummy[i];
48                 unsigned int ce;
49 
50                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
51                  * and skips white space and punctuation characters for
52                  * NORM_IGNORESYMBOLS.
53                  */
54                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
55                     continue;
56 
57                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
58 
59                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
60                 if (ce != (unsigned int)-1)
61                 {
62                     if (ce >> 16) key_len[0] += 2;
63                     if ((ce >> 8) & 0xff) key_len[1]++;
64                     if ((ce >> 4) & 0x0f) key_len[2]++;
65                     if (ce & 1)
66                     {
67                         if (wch >> 8) key_len[3]++;
68                         key_len[3]++;
69                     }
70                 }
71                 else
72                 {
73                     key_len[0] += 2;
74                     if (wch >> 8) key_len[0]++;
75                     if (wch & 0xff) key_len[0]++;
76 		}
77             }
78         }
79     }
80 
81     if (!dstlen) /* compute length */
82         /* 4 * '\1' + key length */
83         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4;
84 
85     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
86         return 0; /* overflow */
87 
88     src = src_save;
89     srclen = srclen_save;
90 
91     key_ptr[0] = dst;
92     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
93     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
94     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
95 
96     for (; srclen; srclen--, src++)
97     {
98         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
99         dummy[0] = *src;
100         if (decomposed_len)
101         {
102             for (i = 0; i < decomposed_len; i++)
103             {
104                 WCHAR wch = dummy[i];
105                 unsigned int ce;
106 
107                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
108                  * and skips white space and punctuation characters for
109                  * NORM_IGNORESYMBOLS.
110                  */
111                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
112                     continue;
113 
114                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
115 
116                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
117                 if (ce != (unsigned int)-1)
118                 {
119                     WCHAR key;
120                     if ((key = ce >> 16))
121                     {
122                         *key_ptr[0]++ = key >> 8;
123                         *key_ptr[0]++ = key & 0xff;
124                     }
125                     /* make key 1 start from 2 */
126                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
127                     /* make key 2 start from 2 */
128                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
129                     /* key 3 is always a character code */
130                     if (ce & 1)
131                     {
132                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
133                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
134                     }
135                 }
136                 else
137                 {
138                     *key_ptr[0]++ = 0xff;
139                     *key_ptr[0]++ = 0xfe;
140                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
141                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
142                 }
143             }
144         }
145     }
146 
147     *key_ptr[0] = '\1';
148     *key_ptr[1] = '\1';
149     *key_ptr[2] = '\1';
150     *key_ptr[3]++ = '\1';
151     *key_ptr[3] = 0;
152 
153     return key_ptr[3] - dst;
154 }
155 
156 enum weight
157 {
158     UNICODE_WEIGHT,
159     DIACRITIC_WEIGHT,
160     CASE_WEIGHT
161 };
162 
get_weight(WCHAR ch,enum weight type)163 static unsigned int get_weight(WCHAR ch, enum weight type)
164 {
165     unsigned int ret;
166 
167     ret = collation_table[collation_table[ch >> 8] + (ch & 0xff)];
168     if (ret == (unsigned int)-1)
169         return ch;
170 
171     switch(type)
172     {
173     case UNICODE_WEIGHT:
174         return ret >> 16;
175     case DIACRITIC_WEIGHT:
176         return (ret >> 8) & 0xff;
177     case CASE_WEIGHT:
178     default:
179         return (ret >> 4) & 0x0f;
180     }
181 }
182 
inc_str_pos(const WCHAR ** str,int * len,int * dpos,int * dlen)183 static void inc_str_pos(const WCHAR **str, int *len, int *dpos, int *dlen)
184 {
185     (*dpos)++;
186     if (*dpos == *dlen)
187     {
188         *dpos = *dlen = 0;
189         (*str)++;
190         (*len)--;
191     }
192 }
193 
compare_weights(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2,enum weight type)194 static inline int compare_weights(int flags, const WCHAR *str1, int len1,
195                                   const WCHAR *str2, int len2, enum weight type)
196 {
197     int dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0;
198     WCHAR dstr1[4], dstr2[4];
199     unsigned int ce1, ce2;
200 
201     /* 32-bit collation element table format:
202      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
203      * case weight - high 4 bit of low 8 bit.
204      */
205     while (len1 > 0 && len2 > 0)
206     {
207         if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
208         if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
209 
210         if (flags & NORM_IGNORESYMBOLS)
211         {
212             int skip = 0;
213             /* FIXME: not tested */
214             if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE))
215             {
216                 inc_str_pos(&str1, &len1, &dpos1, &dlen1);
217                 skip = 1;
218             }
219             if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE))
220             {
221                 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
222                 skip = 1;
223             }
224             if (skip) continue;
225         }
226 
227        /* hyphen and apostrophe are treated differently depending on
228         * whether SORT_STRINGSORT specified or not
229         */
230         if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT))
231         {
232             if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'')
233             {
234                 if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'')
235                 {
236                     inc_str_pos(&str1, &len1, &dpos1, &dlen1);
237                     continue;
238                 }
239             }
240             else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'')
241             {
242                 inc_str_pos(&str2, &len2, &dpos2, &dlen2);
243                 continue;
244             }
245         }
246 
247         ce1 = get_weight(dstr1[dpos1], type);
248         if (!ce1)
249         {
250             inc_str_pos(&str1, &len1, &dpos1, &dlen1);
251             continue;
252         }
253         ce2 = get_weight(dstr2[dpos2], type);
254         if (!ce2)
255         {
256             inc_str_pos(&str2, &len2, &dpos2, &dlen2);
257             continue;
258         }
259 
260         if (ce1 - ce2) return ce1 - ce2;
261 
262         inc_str_pos(&str1, &len1, &dpos1, &dlen1);
263         inc_str_pos(&str2, &len2, &dpos2, &dlen2);
264     }
265     while (len1)
266     {
267         if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4);
268 
269         ce1 = get_weight(dstr1[dpos1], type);
270         if (ce1) break;
271         inc_str_pos(&str1, &len1, &dpos1, &dlen1);
272     }
273     while (len2)
274     {
275         if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4);
276 
277         ce2 = get_weight(dstr2[dpos2], type);
278         if (ce2) break;
279         inc_str_pos(&str2, &len2, &dpos2, &dlen2);
280     }
281     return len1 - len2;
282 }
283 
wine_compare_string(int flags,const WCHAR * str1,int len1,const WCHAR * str2,int len2)284 int wine_compare_string(int flags, const WCHAR *str1, int len1,
285                         const WCHAR *str2, int len2)
286 {
287     int ret;
288 
289     ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT);
290     if (!ret)
291     {
292         if (!(flags & NORM_IGNORENONSPACE))
293             ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT);
294         if (!ret && !(flags & NORM_IGNORECASE))
295             ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT);
296     }
297     return ret;
298 }
299