1 /* 2 * Unicode sort key generation 3 * 4 * Copyright 2003 Dmitry Timoshkov 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA 19 */ 20 21 #include <wine/unicode.h> 22 23 #ifdef __REACTOS__ 24 #define get_char_typeW(x) iswctype((x) >> 8, (x) & 0xFF) 25 #endif 26 extern unsigned int wine_decompose( WCHAR ch, WCHAR *dst, unsigned int dstlen ); 27 extern const unsigned int collation_table[]; 28 29 /* 30 * flags - normalization NORM_* flags 31 * 32 * FIXME: 'variable' flag not handled 33 */ 34 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen) 35 { 36 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ 37 int key_len[4]; 38 char *key_ptr[4]; 39 const WCHAR *src_save = src; 40 int srclen_save = srclen; 41 42 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; 43 for (; srclen; srclen--, src++) 44 { 45 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 46 dummy[0] = *src; 47 if (decomposed_len) 48 { 49 for (i = 0; i < decomposed_len; i++) 50 { 51 WCHAR wch = dummy[i]; 52 unsigned int ce; 53 54 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 55 * and skips white space and punctuation characters for 56 * NORM_IGNORESYMBOLS. 57 */ 58 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 59 continue; 60 61 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 62 63 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 64 if (ce != (unsigned int)-1) 65 { 66 if (ce >> 16) key_len[0] += 2; 67 if ((ce >> 8) & 0xff) key_len[1]++; 68 if ((ce >> 4) & 0x0f) key_len[2]++; 69 if (ce & 1) 70 { 71 if (wch >> 8) key_len[3]++; 72 key_len[3]++; 73 } 74 } 75 else 76 { 77 key_len[0] += 2; 78 if (wch >> 8) key_len[0]++; 79 if (wch & 0xff) key_len[0]++; 80 } 81 } 82 } 83 } 84 85 if (!dstlen) /* compute length */ 86 /* 4 * '\1' + 1 * '\0' + key length */ 87 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1; 88 89 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) 90 return 0; /* overflow */ 91 92 src = src_save; 93 srclen = srclen_save; 94 95 key_ptr[0] = dst; 96 key_ptr[1] = key_ptr[0] + key_len[0] + 1; 97 key_ptr[2] = key_ptr[1] + key_len[1] + 1; 98 key_ptr[3] = key_ptr[2] + key_len[2] + 1; 99 100 for (; srclen; srclen--, src++) 101 { 102 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 103 dummy[0] = *src; 104 if (decomposed_len) 105 { 106 for (i = 0; i < decomposed_len; i++) 107 { 108 WCHAR wch = dummy[i]; 109 unsigned int ce; 110 111 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 112 * and skips white space and punctuation characters for 113 * NORM_IGNORESYMBOLS. 114 */ 115 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 116 continue; 117 118 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 119 120 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 121 if (ce != (unsigned int)-1) 122 { 123 WCHAR key; 124 if ((key = ce >> 16)) 125 { 126 *key_ptr[0]++ = key >> 8; 127 *key_ptr[0]++ = key & 0xff; 128 } 129 /* make key 1 start from 2 */ 130 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; 131 /* make key 2 start from 2 */ 132 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; 133 /* key 3 is always a character code */ 134 if (ce & 1) 135 { 136 if (wch >> 8) *key_ptr[3]++ = wch >> 8; 137 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; 138 } 139 } 140 else 141 { 142 *key_ptr[0]++ = 0xff; 143 *key_ptr[0]++ = 0xfe; 144 if (wch >> 8) *key_ptr[0]++ = wch >> 8; 145 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; 146 } 147 } 148 } 149 } 150 151 *key_ptr[0] = '\1'; 152 *key_ptr[1] = '\1'; 153 *key_ptr[2] = '\1'; 154 *key_ptr[3]++ = '\1'; 155 *key_ptr[3] = 0; 156 157 return key_ptr[3] - dst; 158 } 159 160 static inline int compare_unicode_weights(int flags, const WCHAR *str1, int len1, 161 const WCHAR *str2, int len2) 162 { 163 unsigned int ce1, ce2; 164 int ret; 165 166 /* 32-bit collation element table format: 167 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 168 * case weight - high 4 bit of low 8 bit. 169 */ 170 while (len1 > 0 && len2 > 0) 171 { 172 if (flags & NORM_IGNORESYMBOLS) 173 { 174 int skip = 0; 175 /* FIXME: not tested */ 176 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 177 { 178 str1++; 179 len1--; 180 skip = 1; 181 } 182 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 183 { 184 str2++; 185 len2--; 186 skip = 1; 187 } 188 if (skip) continue; 189 } 190 191 /* hyphen and apostrophe are treated differently depending on 192 * whether SORT_STRINGSORT specified or not 193 */ 194 if (!(flags & SORT_STRINGSORT)) 195 { 196 if (*str1 == '-' || *str1 == '\'') 197 { 198 if (*str2 != '-' && *str2 != '\'') 199 { 200 str1++; 201 len1--; 202 continue; 203 } 204 } 205 else if (*str2 == '-' || *str2 == '\'') 206 { 207 str2++; 208 len2--; 209 continue; 210 } 211 } 212 213 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 214 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 215 216 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 217 ret = (ce1 >> 16) - (ce2 >> 16); 218 else 219 ret = *str1 - *str2; 220 221 if (ret) return ret; 222 223 str1++; 224 str2++; 225 len1--; 226 len2--; 227 } 228 while (len1 && !*str1) 229 { 230 str1++; 231 len1--; 232 } 233 while (len2 && !*str2) 234 { 235 str2++; 236 len2--; 237 } 238 return len1 - len2; 239 } 240 241 static inline int compare_diacritic_weights(int flags, const WCHAR *str1, int len1, 242 const WCHAR *str2, int len2) 243 { 244 unsigned int ce1, ce2; 245 int ret; 246 247 /* 32-bit collation element table format: 248 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 249 * case weight - high 4 bit of low 8 bit. 250 */ 251 while (len1 > 0 && len2 > 0) 252 { 253 if (flags & NORM_IGNORESYMBOLS) 254 { 255 int skip = 0; 256 /* FIXME: not tested */ 257 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 258 { 259 str1++; 260 len1--; 261 skip = 1; 262 } 263 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 264 { 265 str2++; 266 len2--; 267 skip = 1; 268 } 269 if (skip) continue; 270 } 271 272 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 273 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 274 275 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 276 ret = ((ce1 >> 8) & 0xff) - ((ce2 >> 8) & 0xff); 277 else 278 ret = *str1 - *str2; 279 280 if (ret) return ret; 281 282 str1++; 283 str2++; 284 len1--; 285 len2--; 286 } 287 while (len1 && !*str1) 288 { 289 str1++; 290 len1--; 291 } 292 while (len2 && !*str2) 293 { 294 str2++; 295 len2--; 296 } 297 return len1 - len2; 298 } 299 300 static inline int compare_case_weights(int flags, const WCHAR *str1, int len1, 301 const WCHAR *str2, int len2) 302 { 303 unsigned int ce1, ce2; 304 int ret; 305 306 /* 32-bit collation element table format: 307 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 308 * case weight - high 4 bit of low 8 bit. 309 */ 310 while (len1 > 0 && len2 > 0) 311 { 312 if (flags & NORM_IGNORESYMBOLS) 313 { 314 int skip = 0; 315 /* FIXME: not tested */ 316 if (get_char_typeW(*str1) & (C1_PUNCT | C1_SPACE)) 317 { 318 str1++; 319 len1--; 320 skip = 1; 321 } 322 if (get_char_typeW(*str2) & (C1_PUNCT | C1_SPACE)) 323 { 324 str2++; 325 len2--; 326 skip = 1; 327 } 328 if (skip) continue; 329 } 330 331 ce1 = collation_table[collation_table[*str1 >> 8] + (*str1 & 0xff)]; 332 ce2 = collation_table[collation_table[*str2 >> 8] + (*str2 & 0xff)]; 333 334 if (ce1 != (unsigned int)-1 && ce2 != (unsigned int)-1) 335 ret = ((ce1 >> 4) & 0x0f) - ((ce2 >> 4) & 0x0f); 336 else 337 ret = *str1 - *str2; 338 339 if (ret) return ret; 340 341 str1++; 342 str2++; 343 len1--; 344 len2--; 345 } 346 while (len1 && !*str1) 347 { 348 str1++; 349 len1--; 350 } 351 while (len2 && !*str2) 352 { 353 str2++; 354 len2--; 355 } 356 return len1 - len2; 357 } 358 359 int wine_compare_string(int flags, const WCHAR *str1, int len1, 360 const WCHAR *str2, int len2) 361 { 362 int ret; 363 364 ret = compare_unicode_weights(flags, str1, len1, str2, len2); 365 if (!ret) 366 { 367 if (!(flags & NORM_IGNORENONSPACE)) 368 ret = compare_diacritic_weights(flags, str1, len1, str2, len2); 369 if (!ret && !(flags & NORM_IGNORECASE)) 370 ret = compare_case_weights(flags, str1, len1, str2, len2); 371 } 372 return ret; 373 } 374