1 /* 2 * Unicode sort key generation 3 * 4 * Copyright 2003 Dmitry Timoshkov 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2.1 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, write to the Free Software 18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA 19 */ 20 #include "wine/unicode.h" 21 22 extern unsigned int wine_decompose( int flags, WCHAR ch, WCHAR *dst, unsigned int dstlen ); 23 extern const unsigned int collation_table[]; 24 25 /* 26 * flags - normalization NORM_* flags 27 * 28 * FIXME: 'variable' flag not handled 29 */ 30 int wine_get_sortkey(int flags, const WCHAR *src, int srclen, char *dst, int dstlen) 31 { 32 WCHAR dummy[4]; /* no decomposition is larger than 4 chars */ 33 int key_len[4]; 34 char *key_ptr[4]; 35 const WCHAR *src_save = src; 36 int srclen_save = srclen; 37 38 key_len[0] = key_len[1] = key_len[2] = key_len[3] = 0; 39 for (; srclen; srclen--, src++) 40 { 41 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 42 dummy[0] = *src; 43 if (decomposed_len) 44 { 45 for (i = 0; i < decomposed_len; i++) 46 { 47 WCHAR wch = dummy[i]; 48 unsigned int ce; 49 50 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 51 * and skips white space and punctuation characters for 52 * NORM_IGNORESYMBOLS. 53 */ 54 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 55 continue; 56 57 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 58 59 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 60 if (ce != (unsigned int)-1) 61 { 62 if (ce >> 16) key_len[0] += 2; 63 if ((ce >> 8) & 0xff) key_len[1]++; 64 if ((ce >> 4) & 0x0f) key_len[2]++; 65 if (ce & 1) 66 { 67 if (wch >> 8) key_len[3]++; 68 key_len[3]++; 69 } 70 } 71 else 72 { 73 key_len[0] += 2; 74 if (wch >> 8) key_len[0]++; 75 if (wch & 0xff) key_len[0]++; 76 } 77 } 78 } 79 } 80 81 if (!dstlen) /* compute length */ 82 /* 4 * '\1' + key length */ 83 return key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4; 84 85 if (dstlen < key_len[0] + key_len[1] + key_len[2] + key_len[3] + 4 + 1) 86 return 0; /* overflow */ 87 88 src = src_save; 89 srclen = srclen_save; 90 91 key_ptr[0] = dst; 92 key_ptr[1] = key_ptr[0] + key_len[0] + 1; 93 key_ptr[2] = key_ptr[1] + key_len[1] + 1; 94 key_ptr[3] = key_ptr[2] + key_len[2] + 1; 95 96 for (; srclen; srclen--, src++) 97 { 98 unsigned int i, decomposed_len = 1;/*wine_decompose(*src, dummy, 4);*/ 99 dummy[0] = *src; 100 if (decomposed_len) 101 { 102 for (i = 0; i < decomposed_len; i++) 103 { 104 WCHAR wch = dummy[i]; 105 unsigned int ce; 106 107 /* tests show that win2k just ignores NORM_IGNORENONSPACE, 108 * and skips white space and punctuation characters for 109 * NORM_IGNORESYMBOLS. 110 */ 111 if ((flags & NORM_IGNORESYMBOLS) && (get_char_typeW(wch) & (C1_PUNCT | C1_SPACE))) 112 continue; 113 114 if (flags & NORM_IGNORECASE) wch = tolowerW(wch); 115 116 ce = collation_table[collation_table[wch >> 8] + (wch & 0xff)]; 117 if (ce != (unsigned int)-1) 118 { 119 WCHAR key; 120 if ((key = ce >> 16)) 121 { 122 *key_ptr[0]++ = key >> 8; 123 *key_ptr[0]++ = key & 0xff; 124 } 125 /* make key 1 start from 2 */ 126 if ((key = (ce >> 8) & 0xff)) *key_ptr[1]++ = key + 1; 127 /* make key 2 start from 2 */ 128 if ((key = (ce >> 4) & 0x0f)) *key_ptr[2]++ = key + 1; 129 /* key 3 is always a character code */ 130 if (ce & 1) 131 { 132 if (wch >> 8) *key_ptr[3]++ = wch >> 8; 133 if (wch & 0xff) *key_ptr[3]++ = wch & 0xff; 134 } 135 } 136 else 137 { 138 *key_ptr[0]++ = 0xff; 139 *key_ptr[0]++ = 0xfe; 140 if (wch >> 8) *key_ptr[0]++ = wch >> 8; 141 if (wch & 0xff) *key_ptr[0]++ = wch & 0xff; 142 } 143 } 144 } 145 } 146 147 *key_ptr[0] = '\1'; 148 *key_ptr[1] = '\1'; 149 *key_ptr[2] = '\1'; 150 *key_ptr[3]++ = '\1'; 151 *key_ptr[3] = 0; 152 153 return key_ptr[3] - dst; 154 } 155 156 enum weight 157 { 158 UNICODE_WEIGHT, 159 DIACRITIC_WEIGHT, 160 CASE_WEIGHT 161 }; 162 163 static unsigned int get_weight(WCHAR ch, enum weight type) 164 { 165 unsigned int ret; 166 167 ret = collation_table[collation_table[ch >> 8] + (ch & 0xff)]; 168 if (ret == (unsigned int)-1) 169 return ch; 170 171 switch(type) 172 { 173 case UNICODE_WEIGHT: 174 return ret >> 16; 175 case DIACRITIC_WEIGHT: 176 return (ret >> 8) & 0xff; 177 case CASE_WEIGHT: 178 default: 179 return (ret >> 4) & 0x0f; 180 } 181 } 182 183 static void inc_str_pos(const WCHAR **str, int *len, int *dpos, int *dlen) 184 { 185 (*dpos)++; 186 if (*dpos == *dlen) 187 { 188 *dpos = *dlen = 0; 189 (*str)++; 190 (*len)--; 191 } 192 } 193 194 static inline int compare_weights(int flags, const WCHAR *str1, int len1, 195 const WCHAR *str2, int len2, enum weight type) 196 { 197 int dpos1 = 0, dpos2 = 0, dlen1 = 0, dlen2 = 0; 198 WCHAR dstr1[4], dstr2[4]; 199 unsigned int ce1, ce2; 200 201 /* 32-bit collation element table format: 202 * unicode weight - high 16 bit, diacritic weight - high 8 bit of low 16 bit, 203 * case weight - high 4 bit of low 8 bit. 204 */ 205 while (len1 > 0 && len2 > 0) 206 { 207 if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4); 208 if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4); 209 210 if (flags & NORM_IGNORESYMBOLS) 211 { 212 int skip = 0; 213 /* FIXME: not tested */ 214 if (get_char_typeW(dstr1[dpos1]) & (C1_PUNCT | C1_SPACE)) 215 { 216 inc_str_pos(&str1, &len1, &dpos1, &dlen1); 217 skip = 1; 218 } 219 if (get_char_typeW(dstr2[dpos2]) & (C1_PUNCT | C1_SPACE)) 220 { 221 inc_str_pos(&str2, &len2, &dpos2, &dlen2); 222 skip = 1; 223 } 224 if (skip) continue; 225 } 226 227 /* hyphen and apostrophe are treated differently depending on 228 * whether SORT_STRINGSORT specified or not 229 */ 230 if (type == UNICODE_WEIGHT && !(flags & SORT_STRINGSORT)) 231 { 232 if (dstr1[dpos1] == '-' || dstr1[dpos1] == '\'') 233 { 234 if (dstr2[dpos2] != '-' && dstr2[dpos2] != '\'') 235 { 236 inc_str_pos(&str1, &len1, &dpos1, &dlen1); 237 continue; 238 } 239 } 240 else if (dstr2[dpos2] == '-' || dstr2[dpos2] == '\'') 241 { 242 inc_str_pos(&str2, &len2, &dpos2, &dlen2); 243 continue; 244 } 245 } 246 247 ce1 = get_weight(dstr1[dpos1], type); 248 if (!ce1) 249 { 250 inc_str_pos(&str1, &len1, &dpos1, &dlen1); 251 continue; 252 } 253 ce2 = get_weight(dstr2[dpos2], type); 254 if (!ce2) 255 { 256 inc_str_pos(&str2, &len2, &dpos2, &dlen2); 257 continue; 258 } 259 260 if (ce1 - ce2) return ce1 - ce2; 261 262 inc_str_pos(&str1, &len1, &dpos1, &dlen1); 263 inc_str_pos(&str2, &len2, &dpos2, &dlen2); 264 } 265 while (len1) 266 { 267 if (!dlen1) dlen1 = wine_decompose(0, *str1, dstr1, 4); 268 269 ce1 = get_weight(dstr1[dpos1], type); 270 if (ce1) break; 271 inc_str_pos(&str1, &len1, &dpos1, &dlen1); 272 } 273 while (len2) 274 { 275 if (!dlen2) dlen2 = wine_decompose(0, *str2, dstr2, 4); 276 277 ce2 = get_weight(dstr2[dpos2], type); 278 if (ce2) break; 279 inc_str_pos(&str2, &len2, &dpos2, &dlen2); 280 } 281 return len1 - len2; 282 } 283 284 int wine_compare_string(int flags, const WCHAR *str1, int len1, 285 const WCHAR *str2, int len2) 286 { 287 int ret; 288 289 ret = compare_weights(flags, str1, len1, str2, len2, UNICODE_WEIGHT); 290 if (!ret) 291 { 292 if (!(flags & NORM_IGNORENONSPACE)) 293 ret = compare_weights(flags, str1, len1, str2, len2, DIACRITIC_WEIGHT); 294 if (!ret && !(flags & NORM_IGNORECASE)) 295 ret = compare_weights(flags, str1, len1, str2, len2, CASE_WEIGHT); 296 } 297 return ret; 298 } 299