1 /*
2  * Unicode sort key generation
3  *
4  * Copyright 2003 Dmitry Timoshkov
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
19  */
20 
21 #include <wine/unicode.h>
22 
23 #ifdef __REACTOS__
24 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF)
25 #endif
26 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen );
27 extern const unsigned int collation_table[];
28 
29 /*
30  * flags - normalization NORM_* flags
31  *
32  * FIXME: 'variable' flag not handled
33  */
34 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen)
35 {
36     WCHAR dummy[4]; /* no decomposition is larger than 4 chars */
37     int key_len[4];
38     char *key_ptr[4];
39     const WCHAR *src_save = src;
40     int srclen_save = srclen;
41 
42     key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0;
43     for (; srclen; srclen--, src++)
44     {
45         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
46         dummy[0] = *src;
47         if (decomposed_len)
48         {
49             for (i = 0; i < decomposed_len; i++)
50             {
51                 WCHAR wch = dummy[i];
52                 unsigned int ce;
53 
54                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
55                  * and skips white space and punctuation characters for
56                  * NORM_IGNORESYMBOLS.
57                  */
58                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
59                     continue;
60 
61                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
62 
63                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
64                 if (ce != (unsigned int)-1)
65                 {
66                     if (ce >> 16) key_len[0] += 2;
67                     if ((ce >> 8) & 0xff) key_len[1]++;
68                     if ((ce >> 4) & 0x0f) key_len[2]++;
69                     if (ce & 1)
70                     {
71                         if (wch >> 8) key_len[3]++;
72                         key_len[3]++;
73                     }
74                 }
75                 else
76                 {
77                     key_len[0] += 2;
78                     if (wch >> 8) key_len[0]++;
79                     if (wch & 0xff) key_len[0]++;
80 		}
81             }
82         }
83     }
84 
85     if (!dstlen) /* compute length */
86         /* 4 * '\1' + 1 * '\0' + key length */
87         return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1;
88 
89     if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1)
90         return 0; /* overflow */
91 
92     src = src_save;
93     srclen = srclen_save;
94 
95     key_ptr[0] = dst;
96     key_ptr[1] = key_ptr[0] + key_len[0] + 1;
97     key_ptr[2] = key_ptr[1] + key_len[1] + 1;
98     key_ptr[3] = key_ptr[2] + key_len[2] + 1;
99 
100     for (; srclen; srclen--, src++)
101     {
102         unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/
103         dummy[0] = *src;
104         if (decomposed_len)
105         {
106             for (i = 0; i < decomposed_len; i++)
107             {
108                 WCHAR wch = dummy[i];
109                 unsigned int ce;
110 
111                 /* tests show that win2k just ignores NORM_IGNORENONSPACE,
112                  * and skips white space and punctuation characters for
113                  * NORM_IGNORESYMBOLS.
114                  */
115                 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE)))
116                     continue;
117 
118                 if (flags & NORM_IGNORECASE) wch = tolowerW(wch);
119 
120                 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)];
121                 if (ce != (unsigned int)-1)
122                 {
123                     WCHAR key;
124                     if ((key = ce >> 16))
125                     {
126                         *key_ptr[0]++ = key >> 8;
127                         *key_ptr[0]++ = key & 0xff;
128                     }
129                     /* make key 1 start from 2 */
130                     if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1;
131                     /* make key 2 start from 2 */
132                     if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1;
133                     /* key 3 is always a character code */
134                     if (ce & 1)
135                     {
136                         if (wch >> 8) *key_ptr[3]++ = wch >> 8;
137                         if (wch & 0xff) *key_ptr[3]++ = wch & 0xff;
138                     }
139                 }
140                 else
141                 {
142                     *key_ptr[0]++ = 0xff;
143                     *key_ptr[0]++ = 0xfe;
144                     if (wch >> 8) *key_ptr[0]++ = wch >> 8;
145                     if (wch & 0xff) *key_ptr[0]++ = wch & 0xff;
146                 }
147             }
148         }
149     }
150 
151     *key_ptr[0] = '\1';
152     *key_ptr[1] = '\1';
153     *key_ptr[2] = '\1';
154     *key_ptr[3]++ = '\1';
155     *key_ptr[3] = 0;
156 
157     return key_ptr[3] - dst;
158 }
159 
160 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1,
161                                           const WCHAR *str2, int len2)
162 {
163     unsigned int ce1, ce2;
164     int ret;
165 
166     /* 32-bit collation element table format:
167      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
168      * case weight - high 4 bit of low 8 bit.
169      */
170     while (len1 > 0 && len2 > 0)
171     {
172         if (flags & NORM_IGNORESYMBOLS)
173         {
174             int skip = 0;
175             /* FIXME: not tested */
176             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
177             {
178                 str1++;
179                 len1--;
180                 skip = 1;
181             }
182             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
183             {
184                 str2++;
185                 len2--;
186                 skip = 1;
187             }
188             if (skip) continue;
189         }
190 
191        /* hyphen and apostrophe are treated differently depending on
192         * whether SORT_STRINGSORT specified or not
193         */
194         if (!(flags & SORT_STRINGSORT))
195         {
196             if (*str1 == '-' || *str1 == '\'')
197             {
198                 if (*str2 != '-' && *str2 != '\'')
199                 {
200                     str1++;
201                     len1--;
202                     continue;
203                 }
204             }
205             else if (*str2 == '-' || *str2 == '\'')
206             {
207                 str2++;
208                 len2--;
209                 continue;
210             }
211         }
212 
213         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
214         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
215 
216         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
217             ret = (ce1 >> 16) - (ce2 >> 16);
218         else
219             ret = *str1 - *str2;
220 
221         if (ret) return ret;
222 
223         str1++;
224         str2++;
225         len1--;
226         len2--;
227     }
228     while (len1 && !*str1)
229     {
230         str1++;
231         len1--;
232     }
233     while (len2 && !*str2)
234     {
235         str2++;
236         len2--;
237     }
238     return len1 - len2;
239 }
240 
241 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1,
242                                             const WCHAR *str2, int len2)
243 {
244     unsigned int ce1, ce2;
245     int ret;
246 
247     /* 32-bit collation element table format:
248      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
249      * case weight - high 4 bit of low 8 bit.
250      */
251     while (len1 > 0 && len2 > 0)
252     {
253         if (flags & NORM_IGNORESYMBOLS)
254         {
255             int skip = 0;
256             /* FIXME: not tested */
257             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
258             {
259                 str1++;
260                 len1--;
261                 skip = 1;
262             }
263             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
264             {
265                 str2++;
266                 len2--;
267                 skip = 1;
268             }
269             if (skip) continue;
270         }
271 
272         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
273         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
274 
275         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
276             ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff);
277         else
278             ret = *str1 - *str2;
279 
280         if (ret) return ret;
281 
282         str1++;
283         str2++;
284         len1--;
285         len2--;
286     }
287     while (len1 && !*str1)
288     {
289         str1++;
290         len1--;
291     }
292     while (len2 && !*str2)
293     {
294         str2++;
295         len2--;
296     }
297     return len1 - len2;
298 }
299 
300 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1,
301                                        const WCHAR *str2, int len2)
302 {
303     unsigned int ce1, ce2;
304     int ret;
305 
306     /* 32-bit collation element table format:
307      * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit,
308      * case weight - high 4 bit of low 8 bit.
309      */
310     while (len1 > 0 && len2 > 0)
311     {
312         if (flags & NORM_IGNORESYMBOLS)
313         {
314             int skip = 0;
315             /* FIXME: not tested */
316             if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE))
317             {
318                 str1++;
319                 len1--;
320                 skip = 1;
321             }
322             if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE))
323             {
324                 str2++;
325                 len2--;
326                 skip = 1;
327             }
328             if (skip) continue;
329         }
330 
331         ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)];
332         ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)];
333 
334         if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1)
335             ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f);
336         else
337             ret = *str1 - *str2;
338 
339         if (ret) return ret;
340 
341         str1++;
342         str2++;
343         len1--;
344         len2--;
345     }
346     while (len1 && !*str1)
347     {
348         str1++;
349         len1--;
350     }
351     while (len2 && !*str2)
352     {
353         str2++;
354         len2--;
355     }
356     return len1 - len2;
357 }
358 
359 int wine_compare_string(int flags, const WCHAR *str1, int len1,
360                         const WCHAR *str2, int len2)
361 {
362     int ret;
363 
364     ret = compare_unicode_weights(flags, str1, len1, str2, len2);
365     if (!ret)
366     {
367         if (!(flags & NORM_IGNORENONSPACE))
368             ret = compare_diacritic_weights(flags, str1, len1, str2, len2);
369         if (!ret && !(flags & NORM_IGNORECASE))
370             ret = compare_case_weights(flags, str1, len1, str2, len2);
371     }
372     return ret;
373 }
374