1 /* 2 * Unicode sort key generation 3 * 4 * Copyright 2003 Dmitry Timoshkov 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA 19 */ 20 #include "wine/unicode.h" 21 22 #ifdef __REACTOS__ 23 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF) 24 #endif 25 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ); 26 extern const unsigned int collation_table[]; 27 28 /* 29 * flags - normalization NORM_* flags 30 * 31 * FIXME: 'variable' flag not handled 32 */ 33 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen) 34 { 35 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ 36 int key_len[4]; 37 char *key_ptr[4]; 38 const WCHAR *src_save = src; 39 int srclen_save = srclen; 40 41 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; 42 for (; srclen; srclen--, src++) 43 { 44 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 45 dummy[0] = *src; 46 if (decomposed_len) 47 { 48 for (i = 0; i < decomposed_len; i++) 49 { 50 WCHAR wch = dummy[i]; 51 unsigned int ce; 52 53 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 54 * and skips white space and punctuation characters for 55 * NORM_IGNORESYMBOLS. 56 */ 57 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 58 continue; 59 60 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 61 62 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 63 if (ce != (unsigned int)-1) 64 { 65 if (ce >> 16) key_len[0] += 2; 66 if ((ce >> 8) & 0xff) key_len[1]++; 67 if ((ce >> 4) & 0x0f) key_len[2]++; 68 if (ce & 1) 69 { 70 if (wch >> 8) key_len[3]++; 71 key_len[3]++; 72 } 73 } 74 else 75 { 76 key_len[0] += 2; 77 if (wch >> 8) key_len[0]++; 78 if (wch & 0xff) key_len[0]++; 79 } 80 } 81 } 82 } 83 84 if (!dstlen) /* compute length */ 85 /* 4 * '\1' + 1 * '\0' + key length */ 86 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1; 87 88 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) 89 return 0; /* overflow */ 90 91 src = src_save; 92 srclen = srclen_save; 93 94 key_ptr[0] = dst; 95 key_ptr[1] = key_ptr[0] + key_len[0] + 1; 96 key_ptr[2] = key_ptr[1] + key_len[1] + 1; 97 key_ptr[3] = key_ptr[2] + key_len[2] + 1; 98 99 for (; srclen; srclen--, src++) 100 { 101 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 102 dummy[0] = *src; 103 if (decomposed_len) 104 { 105 for (i = 0; i < decomposed_len; i++) 106 { 107 WCHAR wch = dummy[i]; 108 unsigned int ce; 109 110 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 111 * and skips white space and punctuation characters for 112 * NORM_IGNORESYMBOLS. 113 */ 114 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 115 continue; 116 117 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 118 119 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 120 if (ce != (unsigned int)-1) 121 { 122 WCHAR key; 123 if ((key = ce >> 16)) 124 { 125 *key_ptr[0]++ = key >> 8; 126 *key_ptr[0]++ = key & 0xff; 127 } 128 /* make key 1 start from 2 */ 129 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; 130 /* make key 2 start from 2 */ 131 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; 132 /* key 3 is always a character code */ 133 if (ce & 1) 134 { 135 if (wch >> 8) *key_ptr[3]++ = wch >> 8; 136 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; 137 } 138 } 139 else 140 { 141 *key_ptr[0]++ = 0xff; 142 *key_ptr[0]++ = 0xfe; 143 if (wch >> 8) *key_ptr[0]++ = wch >> 8; 144 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; 145 } 146 } 147 } 148 } 149 150 *key_ptr[0] = '\1'; 151 *key_ptr[1] = '\1'; 152 *key_ptr[2] = '\1'; 153 *key_ptr[3]++ = '\1'; 154 *key_ptr[3] = 0; 155 156 return key_ptr[3] - dst; 157 } 158 159 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1, 160 const WCHAR *str2, int len2) 161 { 162 unsigned int ce1, ce2; 163 int ret; 164 165 /* 32-bit collation element table format: 166 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 167 * case weight - high 4 bit of low 8 bit. 168 */ 169 while (len1 > 0 && len2 > 0) 170 { 171 if (flags & NORM_IGNORESYMBOLS) 172 { 173 int skip = 0; 174 /* FIXME: not tested */ 175 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 176 { 177 str1++; 178 len1--; 179 skip = 1; 180 } 181 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 182 { 183 str2++; 184 len2--; 185 skip = 1; 186 } 187 if (skip) continue; 188 } 189 190 /* hyphen and apostrophe are treated differently depending on 191 * whether SORT_STRINGSORT specified or not 192 */ 193 if (!(flags & SORT_STRINGSORT)) 194 { 195 if (*str1 == '-' || *str1 == '\'') 196 { 197 if (*str2 != '-' && *str2 != '\'') 198 { 199 str1++; 200 len1--; 201 continue; 202 } 203 } 204 else if (*str2 == '-' || *str2 == '\'') 205 { 206 str2++; 207 len2--; 208 continue; 209 } 210 } 211 212 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 213 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 214 215 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 216 ret = (ce1 >> 16) - (ce2 >> 16); 217 else 218 ret = *str1 - *str2; 219 220 if (ret) return ret; 221 222 str1++; 223 str2++; 224 len1--; 225 len2--; 226 } 227 while (len1 && !*str1) 228 { 229 str1++; 230 len1--; 231 } 232 while (len2 && !*str2) 233 { 234 str2++; 235 len2--; 236 } 237 return len1 - len2; 238 } 239 240 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1, 241 const WCHAR *str2, int len2) 242 { 243 unsigned int ce1, ce2; 244 int ret; 245 246 /* 32-bit collation element table format: 247 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 248 * case weight - high 4 bit of low 8 bit. 249 */ 250 while (len1 > 0 && len2 > 0) 251 { 252 if (flags & NORM_IGNORESYMBOLS) 253 { 254 int skip = 0; 255 /* FIXME: not tested */ 256 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 257 { 258 str1++; 259 len1--; 260 skip = 1; 261 } 262 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 263 { 264 str2++; 265 len2--; 266 skip = 1; 267 } 268 if (skip) continue; 269 } 270 271 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 272 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 273 274 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 275 ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff); 276 else 277 ret = *str1 - *str2; 278 279 if (ret) return ret; 280 281 str1++; 282 str2++; 283 len1--; 284 len2--; 285 } 286 while (len1 && !*str1) 287 { 288 str1++; 289 len1--; 290 } 291 while (len2 && !*str2) 292 { 293 str2++; 294 len2--; 295 } 296 return len1 - len2; 297 } 298 299 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1, 300 const WCHAR *str2, int len2) 301 { 302 unsigned int ce1, ce2; 303 int ret; 304 305 /* 32-bit collation element table format: 306 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 307 * case weight - high 4 bit of low 8 bit. 308 */ 309 while (len1 > 0 && len2 > 0) 310 { 311 if (flags & NORM_IGNORESYMBOLS) 312 { 313 int skip = 0; 314 /* FIXME: not tested */ 315 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 316 { 317 str1++; 318 len1--; 319 skip = 1; 320 } 321 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 322 { 323 str2++; 324 len2--; 325 skip = 1; 326 } 327 if (skip) continue; 328 } 329 330 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 331 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 332 333 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 334 ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f); 335 else 336 ret = *str1 - *str2; 337 338 if (ret) return ret; 339 340 str1++; 341 str2++; 342 len1--; 343 len2--; 344 } 345 while (len1 && !*str1) 346 { 347 str1++; 348 len1--; 349 } 350 while (len2 && !*str2) 351 { 352 str2++; 353 len2--; 354 } 355 return len1 - len2; 356 } 357 358 int wine_compare_string(int flags, const WCHAR *str1, int len1, 359 const WCHAR *str2, int len2) 360 { 361 int ret; 362 363 ret = compare_unicode_weights(flags, str1, len1, str2, len2); 364 if (!ret) 365 { 366 if (!(flags & NORM_IGNORENONSPACE)) 367 ret = compare_diacritic_weights(flags, str1, len1, str2, len2); 368 if (!ret && !(flags & NORM_IGNORECASE)) 369 ret = compare_case_weights(flags, str1, len1, str2, len2); 370 } 371 return ret; 372 } 373