1 2 #define PERL_NO_GET_CONTEXT /* we want efficiency */ 3 4 /* I guese no private function needs pTHX_ and aTHX_ */ 5 6 #include "EXTERN.h" 7 #include "perl.h" 8 #include "XSUB.h" 9 10 /* This file is prepared by mkheader */ 11 #include "ucatbl.h" 12 13 /* At present, char > 0x10ffff are unaffected without complaint, right? */ 14 #define VALID_UTF_MAX (0x10ffff) 15 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) 16 17 #define MAX_DIV_16 (UV_MAX / 16) 18 19 /* Supported Levels */ 20 #define MinLevel (1) 21 #define MaxLevel (4) 22 23 /* Shifted weight at 4th level */ 24 #define Shift4Wt (0xFFFF) 25 26 #define VCE_Length (9) 27 28 #define Hangul_SBase (0xAC00) 29 #define Hangul_SIni (0xAC00) 30 #define Hangul_SFin (0xD7A3) 31 #define Hangul_NCount (588) 32 #define Hangul_TCount (28) 33 #define Hangul_LBase (0x1100) 34 #define Hangul_LIni (0x1100) 35 #define Hangul_LFin (0x1159) 36 #define Hangul_LFill (0x115F) 37 #define Hangul_LEnd (0x115F) /* Unicode 5.2 */ 38 #define Hangul_VBase (0x1161) 39 #define Hangul_VIni (0x1160) /* from Vowel Filler */ 40 #define Hangul_VFin (0x11A2) 41 #define Hangul_VEnd (0x11A7) /* Unicode 5.2 */ 42 #define Hangul_TBase (0x11A7) /* from "no-final" codepoint */ 43 #define Hangul_TIni (0x11A8) 44 #define Hangul_TFin (0x11F9) 45 #define Hangul_TEnd (0x11FF) /* Unicode 5.2 */ 46 #define HangulL2Ini (0xA960) /* Unicode 5.2 */ 47 #define HangulL2Fin (0xA97C) /* Unicode 5.2 */ 48 #define HangulV2Ini (0xD7B0) /* Unicode 5.2 */ 49 #define HangulV2Fin (0xD7C6) /* Unicode 5.2 */ 50 #define HangulT2Ini (0xD7CB) /* Unicode 5.2 */ 51 #define HangulT2Fin (0xD7FB) /* Unicode 5.2 */ 52 53 #define CJK_UidIni (0x4E00) 54 #define CJK_UidFin (0x9FA5) 55 #define CJK_UidF41 (0x9FBB) /* Unicode 4.1 */ 56 #define CJK_UidF51 (0x9FC3) /* Unicode 5.1 */ 57 #define CJK_UidF52 (0x9FCB) /* Unicode 5.2 */ 58 #define CJK_UidF61 (0x9FCC) /* Unicode 6.1 */ 59 #define CJK_UidF80 (0x9FD5) /* Unicode 8.0 */ 60 #define CJK_UidF100 (0x9FEA) /* Unicode 10.0 */ 61 #define CJK_UidF110 (0x9FEF) /* Unicode 11.0 */ 62 #define CJK_UidF130 (0x9FFC) /* Unicode 13.0 */ 63 64 #define CJK_ExtAIni (0x3400) /* Unicode 3.0 */ 65 #define CJK_ExtAFin (0x4DB5) /* Unicode 3.0 */ 66 #define CJK_ExtA130 (0x4DBF) /* Unicode 13.0 */ 67 #define CJK_ExtBIni (0x20000) /* Unicode 3.1 */ 68 #define CJK_ExtBFin (0x2A6D6) /* Unicode 3.1 */ 69 #define CJK_ExtB130 (0x2A6DD) /* Unicode 13.0 */ 70 #define CJK_ExtCIni (0x2A700) /* Unicode 5.2 */ 71 #define CJK_ExtCFin (0x2B734) /* Unicode 5.2 */ 72 #define CJK_ExtDIni (0x2B740) /* Unicode 6.0 */ 73 #define CJK_ExtDFin (0x2B81D) /* Unicode 6.0 */ 74 #define CJK_ExtEIni (0x2B820) /* Unicode 8.0 */ 75 #define CJK_ExtEFin (0x2CEA1) /* Unicode 8.0 */ 76 #define CJK_ExtFIni (0x2CEB0) /* Unicode 10.0 */ 77 #define CJK_ExtFFin (0x2EBE0) /* Unicode 10.0 */ 78 #define CJK_ExtGIni (0x30000) /* Unicode 13.0 */ 79 #define CJK_ExtGFin (0x3134A) /* Unicode 13.0 */ 80 81 #define CJK_CompIni (0xFA0E) 82 #define CJK_CompFin (0xFA29) 83 static const STDCHAR UnifiedCompat[] = { 84 1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1 85 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */ 86 87 #define TangIdeoIni (0x17000) /* Unicode 9.0 */ 88 #define TangIdeoFin (0x187EC) /* Unicode 9.0 */ 89 #define TangIdeo110 (0x187F1) /* Unicode 11.0 */ 90 #define TangIdeo120 (0x187F7) /* Unicode 12.0 */ 91 #define TangCompIni (0x18800) /* Unicode 9.0 */ 92 #define TangCompFin (0x18AF2) /* Unicode 9.0 */ 93 #define TangComp130 (0x18AFF) /* Unicode 13.0 */ 94 #define TangSuppIni (0x18D00) /* Unicode 13.0 */ 95 #define TangSuppFin (0x18D08) /* Unicode 13.0 */ 96 #define NushuIni (0x1B170) /* Unicode 10.0 */ 97 #define NushuFin (0x1B2FB) /* Unicode 10.0 */ 98 #define KhitanIni (0x18B00) /* Unicode 13.0 */ 99 #define KhitanFin (0x18CD5) /* Unicode 13.0 */ 100 101 #define codeRange(bcode, ecode) ((bcode) <= code && code <= (ecode)) 102 103 MODULE = Unicode::Collate PACKAGE = Unicode::Collate 104 105 PROTOTYPES: DISABLE 106 107 void 108 _fetch_rest () 109 PREINIT: 110 char ** rest; 111 PPCODE: 112 for (rest = (char **)UCA_rest; *rest; ++rest) { 113 XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0))); 114 } 115 116 117 void 118 _fetch_simple (uv) 119 UV uv 120 PREINIT: 121 U8 ***plane, **row; 122 U8* result = NULL; 123 PPCODE: 124 if (!OVER_UTF_MAX(uv)){ 125 plane = (U8***)UCA_simple[uv >> 16]; 126 if (plane) { 127 row = plane[(uv >> 8) & 0xff]; 128 result = row ? row[uv & 0xff] : NULL; 129 } 130 } 131 if (result) { 132 int i; 133 int num = (int)*result; 134 ++result; 135 EXTEND(SP, num); 136 for (i = 0; i < num; ++i) { 137 PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length))); 138 result += VCE_Length; 139 } 140 } else { 141 PUSHs(sv_2mortal(newSViv(0))); 142 } 143 144 SV* 145 _ignorable_simple (uv) 146 UV uv 147 ALIAS: 148 _exists_simple = 1 149 PREINIT: 150 U8 ***plane, **row; 151 int num = -1; 152 U8* result = NULL; 153 CODE: 154 if (!OVER_UTF_MAX(uv)){ 155 plane = (U8***)UCA_simple[uv >> 16]; 156 if (plane) { 157 row = plane[(uv >> 8) & 0xff]; 158 result = row ? row[uv & 0xff] : NULL; 159 } 160 if (result) 161 num = (int)*result; /* assuming 0 <= num < 128 */ 162 } 163 164 if (ix) 165 RETVAL = boolSV(num >0); 166 else 167 RETVAL = boolSV(num==0); 168 OUTPUT: 169 RETVAL 170 171 172 void 173 _getHexArray (src) 174 SV* src 175 PREINIT: 176 char *s, *e; 177 STRLEN byte; 178 UV value; 179 bool overflowed = FALSE; 180 const char *hexdigit; 181 PPCODE: 182 s = SvPV(src,byte); 183 for (e = s + byte; s < e;) { 184 hexdigit = strchr((char *) PL_hexdigit, *s++); 185 if (! hexdigit) 186 continue; 187 value = (hexdigit - PL_hexdigit) & 0xF; 188 while (*s) { 189 hexdigit = strchr((char *) PL_hexdigit, *s++); 190 if (! hexdigit) 191 break; 192 if (overflowed) 193 continue; 194 if (value > MAX_DIV_16) { 195 overflowed = TRUE; 196 continue; 197 } 198 value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF); 199 } 200 XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value))); 201 } 202 203 204 SV* 205 _isIllegal (sv) 206 SV* sv 207 PREINIT: 208 UV uv; 209 CODE: 210 if (!sv || !SvIOK(sv)) 211 XSRETURN_YES; 212 uv = SvUVX(sv); 213 RETVAL = boolSV( 214 0x10FFFF < uv /* out of range */ 215 || ((uv & 0xFFFE) == 0xFFFE) /* ??FFF[EF] */ 216 || (0xD800 <= uv && uv <= 0xDFFF) /* unpaired surrogates */ 217 || (0xFDD0 <= uv && uv <= 0xFDEF) /* other non-characters */ 218 ); 219 OUTPUT: 220 RETVAL 221 222 223 void 224 _decompHangul (code) 225 UV code 226 PREINIT: 227 UV sindex, lindex, vindex, tindex; 228 PPCODE: 229 /* code *must* be in Hangul syllable. 230 * Check it before you enter here. */ 231 sindex = code - Hangul_SBase; 232 lindex = sindex / Hangul_NCount; 233 vindex = (sindex % Hangul_NCount) / Hangul_TCount; 234 tindex = sindex % Hangul_TCount; 235 236 EXTEND(SP, tindex ? 3 : 2); 237 PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase))); 238 PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase))); 239 if (tindex) 240 PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase))); 241 242 243 SV* 244 getHST (code, uca_vers = 0) 245 UV code; 246 IV uca_vers; 247 PREINIT: 248 const char * hangtype; 249 STRLEN typelen; 250 CODE: 251 if (codeRange(Hangul_SIni, Hangul_SFin)) { 252 if ((code - Hangul_SBase) % Hangul_TCount) { 253 hangtype = "LVT"; typelen = 3; 254 } else { 255 hangtype = "LV"; typelen = 2; 256 } 257 } else if (uca_vers < 20) { 258 if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) { 259 hangtype = "L"; typelen = 1; 260 } else if (codeRange(Hangul_VIni, Hangul_VFin)) { 261 hangtype = "V"; typelen = 1; 262 } else if (codeRange(Hangul_TIni, Hangul_TFin)) { 263 hangtype = "T"; typelen = 1; 264 } else { 265 hangtype = ""; typelen = 0; 266 } 267 } else { 268 if (codeRange(Hangul_LIni, Hangul_LEnd) || 269 codeRange(HangulL2Ini, HangulL2Fin)) { 270 hangtype = "L"; typelen = 1; 271 } else if (codeRange(Hangul_VIni, Hangul_VEnd) || 272 codeRange(HangulV2Ini, HangulV2Fin)) { 273 hangtype = "V"; typelen = 1; 274 } else if (codeRange(Hangul_TIni, Hangul_TEnd) || 275 codeRange(HangulT2Ini, HangulT2Fin)) { 276 hangtype = "T"; typelen = 1; 277 } else { 278 hangtype = ""; typelen = 0; 279 } 280 } 281 282 RETVAL = newSVpvn(hangtype, typelen); 283 OUTPUT: 284 RETVAL 285 286 287 void 288 _derivCE_9 (code) 289 UV code 290 ALIAS: 291 _derivCE_14 = 1 292 _derivCE_18 = 2 293 _derivCE_20 = 3 294 _derivCE_22 = 4 295 _derivCE_24 = 5 296 _derivCE_32 = 6 297 _derivCE_34 = 7 298 _derivCE_36 = 8 299 _derivCE_38 = 9 300 _derivCE_40 = 10 301 _derivCE_43 = 11 302 PREINIT: 303 UV base, aaaa, bbbb; 304 U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 305 U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 306 bool basic_unified = 0, tangut = 0, nushu = 0, khitan = 0; 307 PPCODE: 308 if (codeRange(CJK_UidIni, CJK_CompFin)) { 309 if (codeRange(CJK_CompIni, CJK_CompFin)) 310 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni]; 311 else 312 basic_unified = (ix >= 11 ? (code <= CJK_UidF130) : 313 ix >= 9 ? (code <= CJK_UidF110) : 314 ix == 8 ? (code <= CJK_UidF100) : 315 ix >= 6 ? (code <= CJK_UidF80) : 316 ix == 5 ? (code <= CJK_UidF61) : 317 ix >= 3 ? (code <= CJK_UidF52) : 318 ix == 2 ? (code <= CJK_UidF51) : 319 ix == 1 ? (code <= CJK_UidF41) : 320 (code <= CJK_UidFin)); 321 } else { 322 if (ix >= 7) { 323 tangut = (ix >= 11) ? (codeRange(TangIdeoIni, TangIdeo120) || 324 codeRange(TangCompIni, TangComp130) || 325 codeRange(TangSuppIni, TangSuppFin)) : 326 (ix == 10) ? (codeRange(TangIdeoIni, TangIdeo120) || 327 codeRange(TangCompIni, TangCompFin)) : 328 (ix == 9) ? (codeRange(TangIdeoIni, TangIdeo110) || 329 codeRange(TangCompIni, TangCompFin)) : 330 (codeRange(TangIdeoIni, TangIdeoFin) || 331 codeRange(TangCompIni, TangCompFin)); 332 } 333 if (ix >= 8) 334 nushu = (codeRange(NushuIni, NushuFin)); 335 if (ix >= 11) 336 khitan = (codeRange(KhitanIni, KhitanFin)); 337 } 338 base = tangut 339 ? 0xFB00 : 340 nushu 341 ? 0xFB01 : 342 khitan 343 ? 0xFB02 : 344 basic_unified 345 ? 0xFB40 : /* CJK */ 346 ((ix >= 11 ? codeRange(CJK_ExtAIni, CJK_ExtA130) 347 : codeRange(CJK_ExtAIni, CJK_ExtAFin)) 348 || 349 (ix >= 11 ? codeRange(CJK_ExtBIni, CJK_ExtB130) 350 : codeRange(CJK_ExtBIni, CJK_ExtBFin)) 351 || 352 (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) 353 || 354 (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) 355 || 356 (ix >= 6 && codeRange(CJK_ExtEIni, CJK_ExtEFin)) 357 || 358 (ix >= 8 && codeRange(CJK_ExtFIni, CJK_ExtFFin)) 359 || 360 (ix >= 11 && codeRange(CJK_ExtGIni, CJK_ExtGFin))) 361 ? 0xFB80 /* CJK ext. */ 362 : 0xFBC0; /* others */ 363 aaaa = tangut || nushu || khitan ? base : base + (code >> 15); 364 bbbb = (tangut ? (code - TangIdeoIni) : 365 nushu ? (code - NushuIni) : 366 khitan ? (code - KhitanIni) : (code & 0x7FFF)) | 0x8000; 367 a[1] = (U8)(aaaa >> 8); 368 a[2] = (U8)(aaaa & 0xFF); 369 b[1] = (U8)(bbbb >> 8); 370 b[2] = (U8)(bbbb & 0xFF); 371 a[4] = (U8)(0x20); /* second octet of level 2 */ 372 a[6] = (U8)(0x02); /* second octet of level 3 */ 373 a[7] = b[7] = (U8)(code >> 8); 374 a[8] = b[8] = (U8)(code & 0xFF); 375 EXTEND(SP, 2); 376 PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); 377 PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); 378 379 380 void 381 _derivCE_8 (code) 382 UV code 383 PREINIT: 384 UV aaaa, bbbb; 385 U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 386 U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 387 PPCODE: 388 aaaa = 0xFF80 + (code >> 15); 389 bbbb = (code & 0x7FFF) | 0x8000; 390 a[1] = (U8)(aaaa >> 8); 391 a[2] = (U8)(aaaa & 0xFF); 392 b[1] = (U8)(bbbb >> 8); 393 b[2] = (U8)(bbbb & 0xFF); 394 a[4] = (U8)(0x02); /* second octet of level 2 */ 395 a[6] = (U8)(0x01); /* second octet of level 3 */ 396 a[7] = b[7] = (U8)(code >> 8); 397 a[8] = b[8] = (U8)(code & 0xFF); 398 EXTEND(SP, 2); 399 PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length))); 400 PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length))); 401 402 403 void 404 _uideoCE_8 (code) 405 UV code 406 PREINIT: 407 U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00"; 408 PPCODE: 409 uice[1] = uice[7] = (U8)(code >> 8); 410 uice[2] = uice[8] = (U8)(code & 0xFF); 411 uice[4] = (U8)(0x20); /* second octet of level 2 */ 412 uice[6] = (U8)(0x02); /* second octet of level 3 */ 413 PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length))); 414 415 416 SV* 417 _isUIdeo (code, uca_vers) 418 UV code; 419 IV uca_vers; 420 bool basic_unified = 0; 421 CODE: 422 /* uca_vers = 0 for _uideoCE_8() */ 423 if (CJK_UidIni <= code) { 424 if (codeRange(CJK_CompIni, CJK_CompFin)) 425 basic_unified = (bool)UnifiedCompat[code - CJK_CompIni]; 426 else 427 basic_unified = (uca_vers >= 43 ? (code <= CJK_UidF130) : 428 uca_vers >= 38 ? (code <= CJK_UidF110) : 429 uca_vers >= 36 ? (code <= CJK_UidF100) : 430 uca_vers >= 32 ? (code <= CJK_UidF80) : 431 uca_vers >= 24 ? (code <= CJK_UidF61) : 432 uca_vers >= 20 ? (code <= CJK_UidF52) : 433 uca_vers >= 18 ? (code <= CJK_UidF51) : 434 uca_vers >= 14 ? (code <= CJK_UidF41) : 435 (code <= CJK_UidFin)); 436 } 437 RETVAL = boolSV( 438 (basic_unified) 439 || 440 (codeRange(CJK_ExtAIni, CJK_ExtAFin)) 441 || 442 (uca_vers >= 43 && codeRange(CJK_ExtAIni, CJK_ExtA130)) 443 || 444 (uca_vers >= 8 && codeRange(CJK_ExtBIni, CJK_ExtBFin)) 445 || 446 (uca_vers >= 43 && codeRange(CJK_ExtBIni, CJK_ExtB130)) 447 || 448 (uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin)) 449 || 450 (uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin)) 451 || 452 (uca_vers >= 32 && codeRange(CJK_ExtEIni, CJK_ExtEFin)) 453 || 454 (uca_vers >= 36 && codeRange(CJK_ExtFIni, CJK_ExtFFin)) 455 || 456 (uca_vers >= 43 && codeRange(CJK_ExtGIni, CJK_ExtGFin)) 457 ); 458 OUTPUT: 459 RETVAL 460 461 462 SV* 463 mk_SortKey (self, buf) 464 SV* self; 465 SV* buf; 466 PREINIT: 467 SV *dst, **svp; 468 STRLEN dlen, vlen; 469 U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel]; 470 AV *bufAV; 471 HV *selfHV; 472 UV back_flag; 473 I32 i, buf_len; 474 IV lv, level, uca_vers; 475 bool upper_lower, kata_hira, v2i, last_is_var; 476 CODE: 477 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) 478 selfHV = (HV*)SvRV(self); 479 else 480 croak("$self is not a HASHREF."); 481 482 if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV) 483 bufAV = (AV*)SvRV(buf); 484 else 485 croak("XSUB, not an ARRAYREF."); 486 487 buf_len = av_len(bufAV); 488 489 if (buf_len < 0) { /* empty: -1 */ 490 dlen = 2 * (MaxLevel - 1); 491 dst = newSV(dlen); 492 (void)SvPOK_only(dst); 493 d = (U8*)SvPVX(dst); 494 while (dlen--) 495 *d++ = '\0'; 496 } else { 497 svp = hv_fetch(selfHV, "level", 5, FALSE); 498 level = svp ? SvIV(*svp) : MaxLevel; 499 500 for (lv = 0; lv < level; lv++) { 501 New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8); 502 s[lv] = eachlevel[lv]; 503 } 504 505 svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE); 506 upper_lower = svp ? SvTRUE(*svp) : FALSE; 507 svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE); 508 kata_hira = svp ? SvTRUE(*svp) : FALSE; 509 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); 510 uca_vers = SvIV(*svp); 511 svp = hv_fetch(selfHV, "variable", 8, FALSE); 512 v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */ 513 ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13)) 514 : FALSE; 515 516 last_is_var = FALSE; 517 for (i = 0; i <= buf_len; i++) { 518 svp = av_fetch(bufAV, i, FALSE); 519 520 if (svp && SvPOK(*svp)) 521 v = (U8*)SvPV(*svp, vlen); 522 else 523 croak("not a vwt."); 524 525 if (vlen < VCE_Length) /* ignore short VCE (unexpected) */ 526 continue; 527 528 /* "Ignorable (L1, L2) after Variable" since track. v. 9 */ 529 if (v2i) { 530 if (*v) 531 last_is_var = TRUE; 532 else if (v[1] || v[2]) /* non zero primary weight */ 533 last_is_var = FALSE; 534 else if (last_is_var) /* zero primary weight; skipped */ 535 continue; 536 } 537 538 if (v[5] == 0) { /* tert wt < 256 */ 539 if (upper_lower) { 540 if (0x8 <= v[6] && v[6] <= 0xC) /* lower */ 541 v[6] -= 6; 542 else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */ 543 v[6] += 6; 544 else if (v[6] == 0x1C) /* square upper */ 545 v[6]++; 546 else if (v[6] == 0x1D) /* square lower */ 547 v[6]--; 548 } 549 if (kata_hira) { 550 if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */ 551 v[6] -= 2; 552 else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */ 553 v[6] += 5; 554 } 555 } 556 557 for (lv = 0; lv < level; lv++) { 558 if (v[2 * lv + 1] || v[2 * lv + 2]) { 559 *s[lv]++ = v[2 * lv + 1]; 560 *s[lv]++ = v[2 * lv + 2]; 561 } 562 } 563 } 564 565 dlen = 2 * (MaxLevel - 1); 566 for (lv = 0; lv < level; lv++) 567 dlen += s[lv] - eachlevel[lv]; 568 569 dst = newSV(dlen); 570 (void)SvPOK_only(dst); 571 d = (U8*)SvPVX(dst); 572 573 svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE); 574 back_flag = svp ? SvUV(*svp) : (UV)0; 575 576 for (lv = 0; lv < level; lv++) { 577 if (back_flag & (1 << (lv + 1))) { 578 p = s[lv]; 579 e = eachlevel[lv]; 580 for ( ; e < p; p -= 2) { 581 *d++ = p[-2]; 582 *d++ = p[-1]; 583 } 584 } 585 else { 586 p = eachlevel[lv]; 587 e = s[lv]; 588 while (p < e) 589 *d++ = *p++; 590 } 591 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ 592 *d++ = '\0'; 593 *d++ = '\0'; 594 } 595 } 596 597 for (lv = level; lv < MaxLevel; lv++) { 598 if (lv + 1 < MaxLevel) { /* lv + 1 == real level */ 599 *d++ = '\0'; 600 *d++ = '\0'; 601 } 602 } 603 604 for (lv = 0; lv < level; lv++) { 605 Safefree(eachlevel[lv]); 606 } 607 } 608 *d = '\0'; 609 SvCUR_set(dst, d - (U8*)SvPVX(dst)); 610 RETVAL = dst; 611 OUTPUT: 612 RETVAL 613 614 615 SV* 616 varCE (self, vce) 617 SV* self; 618 SV* vce; 619 PREINIT: 620 SV *dst, *vbl, **svp; 621 HV *selfHV; 622 U8 *a, *v, *d; 623 STRLEN alen, vlen; 624 bool ig_l2; 625 IV uca_vers; 626 UV totwt; 627 CODE: 628 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) 629 selfHV = (HV*)SvRV(self); 630 else 631 croak("$self is not a HASHREF."); 632 633 svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE); 634 ig_l2 = svp ? SvTRUE(*svp) : FALSE; 635 636 svp = hv_fetch(selfHV, "variable", 8, FALSE); 637 vbl = svp ? *svp : &PL_sv_no; 638 a = (U8*)SvPV(vbl, alen); 639 v = (U8*)SvPV(vce, vlen); 640 641 dst = newSV(vlen); 642 d = (U8*)SvPVX(dst); 643 (void)SvPOK_only(dst); 644 Copy(v, d, vlen, U8); 645 SvCUR_set(dst, vlen); 646 d[vlen] = '\0'; 647 648 /* primary weight == 0 && secondary weight != 0 */ 649 if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) { 650 d[3] = d[4] = d[5] = d[6] = '\0'; 651 } 652 653 /* variable: checked only the first char and the length, 654 trusting checkCollator() and %VariableOK in Perl ... */ 655 656 if (vlen >= VCE_Length && *a != 'n') { 657 if (*v) { 658 if (*a == 's') { /* shifted or shift-trimmed */ 659 d[7] = d[1]; /* wt level 1 to 4 */ 660 d[8] = d[2]; 661 } /* else blanked */ 662 d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0'; 663 } else if (*a == 's') { /* shifted or shift-trimmed */ 664 totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6]; 665 if (alen == 7 && totwt != 0) { /* shifted */ 666 if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */ 667 d[7] = d[1]; /* wt level 1 to 4 */ 668 d[8] = d[2]; 669 } else { 670 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); 671 if (!svp) 672 croak("Panic: no $self->{UCA_Version} in varCE"); 673 uca_vers = SvIV(*svp); 674 675 /* completely ignorable or the second derived CE */ 676 if (uca_vers >= 36 && d[3] + d[4] + d[5] + d[6] == 0) { 677 d[7] = d[8] = '\0'; 678 } else { 679 d[7] = (U8)(Shift4Wt >> 8); 680 d[8] = (U8)(Shift4Wt & 0xFF); 681 } 682 } 683 } else { /* shift-trimmed or completely ignorable */ 684 d[7] = d[8] = '\0'; 685 } 686 } /* else blanked */ 687 } /* else non-ignorable */ 688 RETVAL = dst; 689 OUTPUT: 690 RETVAL 691 692 693 694 SV* 695 visualizeSortKey (self, key) 696 SV * self 697 SV * key 698 PREINIT: 699 HV *selfHV; 700 SV **svp, *dst; 701 U8 *s, *e, *d; 702 STRLEN klen, dlen; 703 UV uv; 704 IV uca_vers, sep = 0; 705 const char *upperhex = "0123456789ABCDEF"; 706 CODE: 707 if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV) 708 selfHV = (HV*)SvRV(self); 709 else 710 croak("$self is not a HASHREF."); 711 712 svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE); 713 if (!svp) 714 croak("Panic: no $self->{UCA_Version} in visualizeSortKey"); 715 uca_vers = SvIV(*svp); 716 717 s = (U8*)SvPV(key, klen); 718 719 /* slightly *longer* than the need, but I'm afraid of miscounting; 720 = (klen / 2) * 5 - 1 721 # FFFF and ' ' for each 16bit units but ' ' is less by 1; 722 # ' ' and '|' for level boundaries including the identical level 723 + 2 # '[' and ']' 724 + 1 # '\0' 725 (a) if klen is odd (not expected), maybe more 5 bytes. 726 (b) there is not always the identical level. 727 */ 728 dlen = (klen / 2) * 5 + MaxLevel * 2 + 2; 729 dst = newSV(dlen); 730 (void)SvPOK_only(dst); 731 d = (U8*)SvPVX(dst); 732 733 *d++ = '['; 734 for (e = s + klen; s < e; s += 2) { 735 uv = (U16)(*s << 8 | s[1]); 736 if (uv || sep >= MaxLevel) { 737 if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|'))) 738 *d++ = ' '; 739 *d++ = upperhex[ (s[0] >> 4) & 0xF ]; 740 *d++ = upperhex[ s[0] & 0xF ]; 741 *d++ = upperhex[ (s[1] >> 4) & 0xF ]; 742 *d++ = upperhex[ s[1] & 0xF ]; 743 } else { 744 if ((9 <= uca_vers) && (d[-1] != '[')) 745 *d++ = ' '; 746 *d++ = '|'; 747 ++sep; 748 } 749 } 750 *d++ = ']'; 751 *d = '\0'; 752 SvCUR_set(dst, d - (U8*)SvPVX(dst)); 753 RETVAL = dst; 754 OUTPUT: 755 RETVAL 756