1 /*
2  * Unicode sort key generation
3  *
4  * Copyright 2003 Dmitry Timoshkov
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 #include "wine/unicode.h"
21 
22 #ifdef __REACTOS__
23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
24 #endif
25 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen );
26 extern const unsigned int collation_table[];
27 
28 /*
29  * flags - normalization NORM_* flags
30  *
31  * FIXME: 'variable' flag not handled
32  */
33 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
34 {
35     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
36     int key_len[4];
37     char *key_ptr[4];
38     const WCHAR *src_save = src;
39     int srclen_save = srclen;
40 
41     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
42     for (; srclen; srclen--, src++)
43     {
44         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
45         dummy[0] = *src;
46         if (decomposed_len)
47         {
48             for (i = 0; i < decomposed_len; i++)
49             {
50                 WCHAR wch = dummy[i];
51                 unsigned int ce;
52 
53                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
54                  * and skips white space and punctuation characters for
55                  * NORM_IGNORESYMBOLS.
56                  */
57                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
58                     continue;
59 
60                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
61 
62                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
63                 if (ce != (unsigned int)-1)
64                 {
65                     if (ce >> 16) key_len[0] += 2;
66                     if ((ce >> 8) & 0xff) key_len[1]++;
67                     if ((ce >> 4) & 0x0f) key_len[2]++;
68                     if (ce & 1)
69                     {
70                         if (wch >> 8) key_len[3]++;
71                         key_len[3]++;
72                     }
73                 }
74                 else
75                 {
76                     key_len[0] += 2;
77                     if (wch >> 8) key_len[0]++;
78                     if (wch & 0xff) key_len[0]++;
79 		}
80             }
81         }
82     }
83 
84     if (!dstlen) /* compute length */
85         /* 4 * '\1' + 1 * '\0' + key length */
86         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
87 
88     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
89         return 0; /* overflow */
90 
91     src = src_save;
92     srclen = srclen_save;
93 
94     key_ptr[0] = dst;
95     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
96     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
97     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
98 
99     for (; srclen; srclen--, src++)
100     {
101         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
102         dummy[0] = *src;
103         if (decomposed_len)
104         {
105             for (i = 0; i < decomposed_len; i++)
106             {
107                 WCHAR wch = dummy[i];
108                 unsigned int ce;
109 
110                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
111                  * and skips white space and punctuation characters for
112                  * NORM_IGNORESYMBOLS.
113                  */
114                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
115                     continue;
116 
117                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
118 
119                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
120                 if (ce != (unsigned int)-1)
121                 {
122                     WCHAR key;
123                     if ((key = ce >> 16))
124                     {
125                         *key_ptr[0]++ = key >> 8;
126                         *key_ptr[0]++ = key & 0xff;
127                     }
128                     /* make key 1 start from 2 */
129                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
130                     /* make key 2 start from 2 */
131                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
132                     /* key 3 is always a character code */
133                     if (ce & 1)
134                     {
135                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
136                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
137                     }
138                 }
139                 else
140                 {
141                     *key_ptr[0]++ = 0xff;
142                     *key_ptr[0]++ = 0xfe;
143                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
144                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
145                 }
146             }
147         }
148     }
149 
150     *key_ptr[0] = '\1';
151     *key_ptr[1] = '\1';
152     *key_ptr[2] = '\1';
153     *key_ptr[3]++ = '\1';
154     *key_ptr[3] = 0;
155 
156     return key_ptr[3] - dst;
157 }
158 
159 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
160                                           const WCHAR *str2, int len2)
161 {
162     unsigned int ce1, ce2;
163     int ret;
164 
165     /* 32-bit collation element table format:
166      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
167      * case weight - high 4 bit of low 8 bit.
168      */
169     while (len1 > 0 && len2 > 0)
170     {
171         if (flags & NORM_IGNORESYMBOLS)
172         {
173             int skip = 0;
174             /* FIXME: not tested */
175             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
176             {
177                 str1++;
178                 len1--;
179                 skip = 1;
180             }
181             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
182             {
183                 str2++;
184                 len2--;
185                 skip = 1;
186             }
187             if (skip) continue;
188         }
189 
190        /* hyphen and apostrophe are treated differently depending on
191         * whether SORT_STRINGSORT specified or not
192         */
193         if (!(flags & SORT_STRINGSORT))
194         {
195             if (*str1 == '-' || *str1 == '\'')
196             {
197                 if (*str2 != '-' && *str2 != '\'')
198                 {
199                     str1++;
200                     len1--;
201                     continue;
202                 }
203             }
204             else if (*str2 == '-' || *str2 == '\'')
205             {
206                 str2++;
207                 len2--;
208                 continue;
209             }
210         }
211 
212         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
213         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
214 
215         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
216             ret = (ce1 >> 16) - (ce2 >> 16);
217         else
218             ret = *str1 - *str2;
219 
220         if (ret) return ret;
221 
222         str1++;
223         str2++;
224         len1--;
225         len2--;
226     }
227     while (len1 && !*str1)
228     {
229         str1++;
230         len1--;
231     }
232     while (len2 && !*str2)
233     {
234         str2++;
235         len2--;
236     }
237     return len1 - len2;
238 }
239 
240 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
241                                             const WCHAR *str2, int len2)
242 {
243     unsigned int ce1, ce2;
244     int ret;
245 
246     /* 32-bit collation element table format:
247      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
248      * case weight - high 4 bit of low 8 bit.
249      */
250     while (len1 > 0 && len2 > 0)
251     {
252         if (flags & NORM_IGNORESYMBOLS)
253         {
254             int skip = 0;
255             /* FIXME: not tested */
256             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
257             {
258                 str1++;
259                 len1--;
260                 skip = 1;
261             }
262             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
263             {
264                 str2++;
265                 len2--;
266                 skip = 1;
267             }
268             if (skip) continue;
269         }
270 
271         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
272         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
273 
274         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
275             ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
276         else
277             ret = *str1 - *str2;
278 
279         if (ret) return ret;
280 
281         str1++;
282         str2++;
283         len1--;
284         len2--;
285     }
286     while (len1 && !*str1)
287     {
288         str1++;
289         len1--;
290     }
291     while (len2 && !*str2)
292     {
293         str2++;
294         len2--;
295     }
296     return len1 - len2;
297 }
298 
299 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
300                                        const WCHAR *str2, int len2)
301 {
302     unsigned int ce1, ce2;
303     int ret;
304 
305     /* 32-bit collation element table format:
306      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
307      * case weight - high 4 bit of low 8 bit.
308      */
309     while (len1 > 0 && len2 > 0)
310     {
311         if (flags & NORM_IGNORESYMBOLS)
312         {
313             int skip = 0;
314             /* FIXME: not tested */
315             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
316             {
317                 str1++;
318                 len1--;
319                 skip = 1;
320             }
321             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
322             {
323                 str2++;
324                 len2--;
325                 skip = 1;
326             }
327             if (skip) continue;
328         }
329 
330         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
331         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
332 
333         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
334             ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
335         else
336             ret = *str1 - *str2;
337 
338         if (ret) return ret;
339 
340         str1++;
341         str2++;
342         len1--;
343         len2--;
344     }
345     while (len1 && !*str1)
346     {
347         str1++;
348         len1--;
349     }
350     while (len2 && !*str2)
351     {
352         str2++;
353         len2--;
354     }
355     return len1 - len2;
356 }
357 
358 int wine_compare_string(int flags, const WCHAR *str1, int len1,
359                         const WCHAR *str2, int len2)
360 {
361     int ret;
362 
363     ret = compare_unicode_weights(flags, str1, len1, str2, len2);
364     if (!ret)
365     {
366         if (!(flags & NORM_IGNORENONSPACE))
367             ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
368         if (!ret && !(flags & NORM_IGNORECASE))
369             ret = compare_case_weights(flags, str1, len1, str2, len2);
370     }
371     return ret;
372 }
373