1 2 #define PERL_NO_GET_CONTEXT /* we want efficiency */ 3 4 /* private functions which need pTHX_ and aTHX_ 5 pv_cat_decompHangul 6 sv_2pvunicode 7 pv_utf8_decompose 8 pv_utf8_reorder 9 pv_utf8_compose 10 */ 11 12 #include "EXTERN.h" 13 #include "perl.h" 14 #include "XSUB.h" 15 16 /* These 5 files are prepared by mkheader */ 17 #include "unfcmb.h" 18 #include "unfcan.h" 19 #include "unfcpt.h" 20 #include "unfcmp.h" 21 #include "unfexc.h" 22 23 /* The generated normalization tables since v5.20 are in native character set 24 * terms. Prior to that, they were in Unicode terms. So we use 'uvchr' for 25 * later perls, and redefine that to be 'uvuni' for earlier ones */ 26 #if PERL_VERSION < 20 27 # undef uvchr_to_utf8 28 # ifdef uvuni_to_utf8 29 # define uvchr_to_utf8 uvuni_to_utf8 30 # else /* Perl 5.6.1 */ 31 # define uvchr_to_utf8 uv_to_utf8 32 # endif 33 34 # undef utf8n_to_uvchr 35 # ifdef utf8n_to_uvuni 36 # define utf8n_to_uvchr utf8n_to_uvuni 37 # else /* Perl 5.6.1 */ 38 # define utf8n_to_uvchr utf8_to_uv 39 # endif 40 #endif 41 42 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */ 43 #ifndef UTF8_ALLOW_BOM 44 #define UTF8_ALLOW_BOM (0) 45 #endif /* UTF8_ALLOW_BOM */ 46 47 #ifndef UTF8_ALLOW_SURROGATE 48 #define UTF8_ALLOW_SURROGATE (0) 49 #endif /* UTF8_ALLOW_SURROGATE */ 50 51 #ifndef UTF8_ALLOW_FE_FF 52 #define UTF8_ALLOW_FE_FF (0) 53 #endif /* UTF8_ALLOW_FE_FF */ 54 55 #ifndef UTF8_ALLOW_FFFF 56 #define UTF8_ALLOW_FFFF (0) 57 #endif /* UTF8_ALLOW_FFFF */ 58 59 #ifndef PERL_UNUSED_VAR 60 # define PERL_UNUSED_VAR(x) ((void)sizeof(x)) 61 #endif 62 63 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF) 64 65 /* check if the string buffer is enough before uvchr_to_utf8(). */ 66 /* dstart, d, and dlen should be defined outside before. */ 67 #define Renew_d_if_not_enough_to(need) STRLEN curlen = d - dstart; \ 68 if (dlen < curlen + (need)) { \ 69 dlen += (need); \ 70 Renew(dstart, dlen+1, U8); \ 71 d = dstart + curlen; \ 72 } 73 74 /* if utf8n_to_uvchr() sets retlen to 0 (if broken?) */ 75 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character" 76 77 /* utf8_hop() hops back before start. Maybe broken UTF-8 */ 78 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" 79 80 /* At present, char > 0x10ffff are unaffected without complaint, right? */ 81 #define VALID_UTF_MAX (0x10ffff) 82 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv)) 83 84 /* size of array for combining characters */ 85 /* enough as an initial value? */ 86 #define CC_SEQ_SIZE (10) 87 #define CC_SEQ_STEP (5) 88 89 /* HANGUL begin */ 90 #define Hangul_SBase 0xAC00 91 #define Hangul_SFinal 0xD7A3 92 #define Hangul_SCount 11172 93 94 #define Hangul_NCount 588 95 96 #define Hangul_LBase 0x1100 97 #define Hangul_LFinal 0x1112 98 #define Hangul_LCount 19 99 100 #define Hangul_VBase 0x1161 101 #define Hangul_VFinal 0x1175 102 #define Hangul_VCount 21 103 104 #define Hangul_TBase 0x11A7 105 #define Hangul_TFinal 0x11C2 106 #define Hangul_TCount 28 107 108 #define Hangul_IsS(u) ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal)) 109 #define Hangul_IsN(u) (((u) - Hangul_SBase) % Hangul_TCount == 0) 110 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u)) 111 #define Hangul_IsL(u) ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal)) 112 #define Hangul_IsV(u) ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal)) 113 #define Hangul_IsT(u) ((Hangul_TBase < (u)) && ((u) <= Hangul_TFinal)) 114 /* HANGUL end */ 115 116 /* this is used for canonical ordering of combining characters (c.c.). */ 117 typedef struct { 118 U8 cc; /* combining class */ 119 UV uv; /* codepoint */ 120 STRLEN pos; /* position */ 121 } UNF_cc; 122 123 static int compare_cc(const void *a, const void *b) 124 { 125 int ret_cc; 126 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc; 127 if (ret_cc) 128 return ret_cc; 129 130 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos ) 131 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos ); 132 } 133 134 static U8* dec_canonical(UV uv) 135 { 136 U8 ***plane, **row; 137 if (OVER_UTF_MAX(uv)) 138 return NULL; 139 plane = (U8***)UNF_canon[uv >> 16]; 140 if (! plane) 141 return NULL; 142 row = plane[(uv >> 8) & 0xff]; 143 return row ? row[uv & 0xff] : NULL; 144 } 145 146 static U8* dec_compat(UV uv) 147 { 148 U8 ***plane, **row; 149 if (OVER_UTF_MAX(uv)) 150 return NULL; 151 plane = (U8***)UNF_compat[uv >> 16]; 152 if (! plane) 153 return NULL; 154 row = plane[(uv >> 8) & 0xff]; 155 return row ? row[uv & 0xff] : NULL; 156 } 157 158 static UV composite_uv(UV uv, UV uv2) 159 { 160 UNF_complist ***plane, **row, *cell, *i; 161 162 if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2)) 163 return 0; 164 165 if (Hangul_IsL(uv) && Hangul_IsV(uv2)) { 166 UV lindex = uv - Hangul_LBase; 167 UV vindex = uv2 - Hangul_VBase; 168 return(Hangul_SBase + (lindex * Hangul_VCount + vindex) * 169 Hangul_TCount); 170 } 171 if (Hangul_IsLV(uv) && Hangul_IsT(uv2)) { 172 UV tindex = uv2 - Hangul_TBase; 173 return(uv + tindex); 174 } 175 plane = UNF_compos[uv >> 16]; 176 if (! plane) 177 return 0; 178 row = plane[(uv >> 8) & 0xff]; 179 if (! row) 180 return 0; 181 cell = row[uv & 0xff]; 182 if (! cell) 183 return 0; 184 for (i = cell; i->nextchar; i++) { 185 if (uv2 == i->nextchar) 186 return i->composite; 187 } 188 return 0; 189 } 190 191 static U8 getCombinClass(UV uv) 192 { 193 U8 **plane, *row; 194 if (OVER_UTF_MAX(uv)) 195 return 0; 196 plane = (U8**)UNF_combin[uv >> 16]; 197 if (! plane) 198 return 0; 199 row = plane[(uv >> 8) & 0xff]; 200 return row ? row[uv & 0xff] : 0; 201 } 202 203 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv) 204 { 205 UV sindex = uv - Hangul_SBase; 206 UV lindex = sindex / Hangul_NCount; 207 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount; 208 UV tindex = sindex % Hangul_TCount; 209 210 if (! Hangul_IsS(uv)) 211 return d; 212 213 d = uvchr_to_utf8(d, (lindex + Hangul_LBase)); 214 d = uvchr_to_utf8(d, (vindex + Hangul_VBase)); 215 if (tindex) 216 d = uvchr_to_utf8(d, (tindex + Hangul_TBase)); 217 return d; 218 } 219 220 static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp) 221 { 222 char *s; 223 STRLEN len; 224 s = SvPV(sv,len); 225 if (!SvUTF8(sv)) { 226 SV* tmpsv = sv_2mortal(newSVpvn(s, len)); 227 if (!SvPOK(tmpsv)) 228 s = SvPV_force(tmpsv,len); 229 sv_utf8_upgrade(tmpsv); 230 s = SvPV(tmpsv,len); 231 } 232 if (lp) 233 *lp = len; 234 return s; 235 } 236 237 static 238 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat) 239 { 240 U8* p = s; 241 U8* e = s + slen; 242 U8* dstart = *dp; 243 U8* d = dstart; 244 245 while (p < e) { 246 STRLEN retlen; 247 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 248 if (!retlen) 249 croak(ErrRetlenIsZero, "decompose"); 250 p += retlen; 251 252 if (Hangul_IsS(uv)) { 253 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3) 254 d = pv_cat_decompHangul(aTHX_ d, uv); 255 } 256 else { 257 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv); 258 259 if (r) { 260 STRLEN len = (STRLEN)strlen((char *)r); 261 Renew_d_if_not_enough_to(len) 262 while (len--) 263 *d++ = *r++; 264 } 265 else { 266 Renew_d_if_not_enough_to(UTF8_MAXLEN) 267 d = uvchr_to_utf8(d, uv); 268 } 269 } 270 } 271 *dp = dstart; 272 return d; 273 } 274 275 static 276 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen) 277 { 278 U8* p = s; 279 U8* e = s + slen; 280 U8* dstart = *dp; 281 U8* d = dstart; 282 283 UNF_cc seq_ary[CC_SEQ_SIZE]; 284 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */ 285 UNF_cc* seq_ext = NULL; /* extend if need */ 286 STRLEN seq_max = CC_SEQ_SIZE; 287 STRLEN cc_pos = 0; 288 289 while (p < e) { 290 U8 curCC; 291 STRLEN retlen; 292 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 293 if (!retlen) 294 croak(ErrRetlenIsZero, "reorder"); 295 p += retlen; 296 297 curCC = getCombinClass(uv); 298 299 if (curCC != 0) { 300 if (seq_max < cc_pos + 1) { /* extend if need */ 301 seq_max = cc_pos + CC_SEQ_STEP; /* new size */ 302 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ 303 STRLEN i; 304 New(0, seq_ext, seq_max, UNF_cc); 305 for (i = 0; i < cc_pos; i++) 306 seq_ext[i] = seq_ary[i]; 307 } 308 else { 309 Renew(seq_ext, seq_max, UNF_cc); 310 } 311 seq_ptr = seq_ext; /* use seq_ext from now */ 312 } 313 314 seq_ptr[cc_pos].cc = curCC; 315 seq_ptr[cc_pos].uv = uv; 316 seq_ptr[cc_pos].pos = cc_pos; 317 ++cc_pos; 318 319 if (p < e) 320 continue; 321 } 322 323 /* output */ 324 if (cc_pos) { 325 STRLEN i; 326 327 if (cc_pos > 1) /* reordered if there are two c.c.'s */ 328 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc); 329 330 for (i = 0; i < cc_pos; i++) { 331 Renew_d_if_not_enough_to(UTF8_MAXLEN) 332 d = uvchr_to_utf8(d, seq_ptr[i].uv); 333 } 334 cc_pos = 0; 335 } 336 337 if (curCC == 0) { 338 Renew_d_if_not_enough_to(UTF8_MAXLEN) 339 d = uvchr_to_utf8(d, uv); 340 } 341 } 342 if (seq_ext) 343 Safefree(seq_ext); 344 *dp = dstart; 345 return d; 346 } 347 348 static 349 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig) 350 { 351 U8* p = s; 352 U8* e = s + slen; 353 U8* dstart = *dp; 354 U8* d = dstart; 355 356 UV uvS = 0; /* code point of the starter */ 357 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */ 358 U8 preCC = 0; 359 360 UV seq_ary[CC_SEQ_SIZE]; 361 UV* seq_ptr = seq_ary; /* use array at the beginning */ 362 UV* seq_ext = NULL; /* extend if need */ 363 STRLEN seq_max = CC_SEQ_SIZE; 364 STRLEN cc_pos = 0; 365 366 while (p < e) { 367 U8 curCC; 368 STRLEN retlen; 369 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 370 if (!retlen) 371 croak(ErrRetlenIsZero, "compose"); 372 p += retlen; 373 374 curCC = getCombinClass(uv); 375 376 if (!valid_uvS) { 377 if (curCC == 0) { 378 uvS = uv; /* the first Starter is found */ 379 valid_uvS = TRUE; 380 if (p < e) 381 continue; 382 } 383 else { 384 Renew_d_if_not_enough_to(UTF8_MAXLEN) 385 d = uvchr_to_utf8(d, uv); 386 continue; 387 } 388 } 389 else { 390 bool composed; 391 392 /* blocked */ 393 if ((iscontig && cc_pos) || /* discontiguous combination */ 394 (curCC != 0 && preCC == curCC) || /* blocked by same CC */ 395 (preCC > curCC)) /* blocked by higher CC: revised D2 */ 396 composed = FALSE; 397 398 /* not blocked: 399 iscontig && cc_pos == 0 -- contiguous combination 400 curCC == 0 && preCC == 0 -- starter + starter 401 curCC != 0 && preCC < curCC -- lower CC */ 402 else { 403 /* try composition */ 404 UV uvComp = composite_uv(uvS, uv); 405 406 if (uvComp && !isExclusion(uvComp)) { 407 uvS = uvComp; 408 composed = TRUE; 409 410 /* preCC should not be changed to curCC */ 411 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */ 412 if (p < e) 413 continue; 414 } 415 else 416 composed = FALSE; 417 } 418 419 if (!composed) { 420 preCC = curCC; 421 if (curCC != 0 || !(p < e)) { 422 if (seq_max < cc_pos + 1) { /* extend if need */ 423 seq_max = cc_pos + CC_SEQ_STEP; /* new size */ 424 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */ 425 New(0, seq_ext, seq_max, UV); 426 Copy(seq_ary, seq_ext, cc_pos, UV); 427 } 428 else { 429 Renew(seq_ext, seq_max, UV); 430 } 431 seq_ptr = seq_ext; /* use seq_ext from now */ 432 } 433 seq_ptr[cc_pos] = uv; 434 ++cc_pos; 435 } 436 if (curCC != 0 && p < e) 437 continue; 438 } 439 } 440 441 /* output */ 442 { 443 Renew_d_if_not_enough_to(UTF8_MAXLEN) 444 d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */ 445 } 446 447 if (cc_pos) { 448 STRLEN i; 449 450 for (i = 0; i < cc_pos; i++) { 451 Renew_d_if_not_enough_to(UTF8_MAXLEN) 452 d = uvchr_to_utf8(d, seq_ptr[i]); 453 } 454 cc_pos = 0; 455 } 456 457 uvS = uv; 458 } 459 if (seq_ext) 460 Safefree(seq_ext); 461 *dp = dstart; 462 return d; 463 } 464 465 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize 466 467 SV* 468 decompose(src, compat = &PL_sv_no) 469 SV * src 470 SV * compat 471 PROTOTYPE: $;$ 472 PREINIT: 473 SV* dst; 474 U8 *s, *d, *dend; 475 STRLEN slen, dlen; 476 CODE: 477 s = (U8*)sv_2pvunicode(aTHX_ src,&slen); 478 dst = newSVpvn("", 0); 479 dlen = slen; 480 New(0, d, dlen+1, U8); 481 dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat)); 482 sv_setpvn(dst, (char *)d, dend - d); 483 SvUTF8_on(dst); 484 Safefree(d); 485 RETVAL = dst; 486 OUTPUT: 487 RETVAL 488 489 490 SV* 491 reorder(src) 492 SV * src 493 PROTOTYPE: $ 494 PREINIT: 495 SV* dst; 496 U8 *s, *d, *dend; 497 STRLEN slen, dlen; 498 CODE: 499 s = (U8*)sv_2pvunicode(aTHX_ src,&slen); 500 dst = newSVpvn("", 0); 501 dlen = slen; 502 New(0, d, dlen+1, U8); 503 dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen); 504 sv_setpvn(dst, (char *)d, dend - d); 505 SvUTF8_on(dst); 506 Safefree(d); 507 RETVAL = dst; 508 OUTPUT: 509 RETVAL 510 511 512 SV* 513 compose(src) 514 SV * src 515 PROTOTYPE: $ 516 ALIAS: 517 composeContiguous = 1 518 PREINIT: 519 SV* dst; 520 U8 *s, *d, *dend; 521 STRLEN slen, dlen; 522 CODE: 523 s = (U8*)sv_2pvunicode(aTHX_ src,&slen); 524 dst = newSVpvn("", 0); 525 dlen = slen; 526 New(0, d, dlen+1, U8); 527 dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix); 528 sv_setpvn(dst, (char *)d, dend - d); 529 SvUTF8_on(dst); 530 Safefree(d); 531 RETVAL = dst; 532 OUTPUT: 533 RETVAL 534 535 536 SV* 537 NFD(src) 538 SV * src 539 PROTOTYPE: $ 540 ALIAS: 541 NFKD = 1 542 PREINIT: 543 SV *dst; 544 U8 *s, *t, *tend, *d, *dend; 545 STRLEN slen, tlen, dlen; 546 CODE: 547 s = (U8*)sv_2pvunicode(aTHX_ src,&slen); 548 549 /* decompose */ 550 tlen = slen; 551 New(0, t, tlen+1, U8); 552 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); 553 *tend = '\0'; 554 tlen = tend - t; /* no longer know real size of t */ 555 556 /* reorder */ 557 dlen = tlen; 558 New(0, d, dlen+1, U8); 559 dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen); 560 *dend = '\0'; 561 dlen = dend - d; /* no longer know real size of d */ 562 563 /* return */ 564 dst = newSVpvn("", 0); 565 sv_setpvn(dst, (char *)d, dlen); 566 SvUTF8_on(dst); 567 568 Safefree(t); 569 Safefree(d); 570 RETVAL = dst; 571 OUTPUT: 572 RETVAL 573 574 575 SV* 576 NFC(src) 577 SV * src 578 PROTOTYPE: $ 579 ALIAS: 580 NFKC = 1 581 FCC = 2 582 PREINIT: 583 SV *dst; 584 U8 *s, *t, *tend, *u, *uend, *d, *dend; 585 STRLEN slen, tlen, ulen, dlen; 586 CODE: 587 s = (U8*)sv_2pvunicode(aTHX_ src,&slen); 588 589 /* decompose */ 590 tlen = slen; 591 New(0, t, tlen+1, U8); 592 tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1)); 593 *tend = '\0'; 594 tlen = tend - t; /* no longer know real size of t */ 595 596 /* reorder */ 597 ulen = tlen; 598 New(0, u, ulen+1, U8); 599 uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen); 600 *uend = '\0'; 601 ulen = uend - u; /* no longer know real size of u */ 602 603 /* compose */ 604 dlen = ulen; 605 New(0, d, dlen+1, U8); 606 dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2)); 607 *dend = '\0'; 608 dlen = dend - d; /* no longer know real size of d */ 609 610 /* return */ 611 dst = newSVpvn("", 0); 612 sv_setpvn(dst, (char *)d, dlen); 613 SvUTF8_on(dst); 614 615 Safefree(t); 616 Safefree(u); 617 Safefree(d); 618 RETVAL = dst; 619 OUTPUT: 620 RETVAL 621 622 623 SV* 624 checkNFD(src) 625 SV * src 626 PROTOTYPE: $ 627 ALIAS: 628 checkNFKD = 1 629 PREINIT: 630 STRLEN srclen, retlen; 631 U8 *s, *e, *p, curCC, preCC; 632 bool result = TRUE; 633 CODE: 634 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); 635 e = s + srclen; 636 637 preCC = 0; 638 for (p = s; p < e; p += retlen) { 639 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 640 if (!retlen) 641 croak(ErrRetlenIsZero, "checkNFD or -NFKD"); 642 643 curCC = getCombinClass(uv); 644 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ 645 result = FALSE; 646 break; 647 } 648 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) { 649 result = FALSE; 650 break; 651 } 652 preCC = curCC; 653 } 654 RETVAL = boolSV(result); 655 OUTPUT: 656 RETVAL 657 658 659 SV* 660 checkNFC(src) 661 SV * src 662 PROTOTYPE: $ 663 ALIAS: 664 checkNFKC = 1 665 PREINIT: 666 STRLEN srclen, retlen; 667 U8 *s, *e, *p, curCC, preCC; 668 bool result = TRUE; 669 bool isMAYBE = FALSE; 670 CODE: 671 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); 672 e = s + srclen; 673 674 preCC = 0; 675 for (p = s; p < e; p += retlen) { 676 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 677 if (!retlen) 678 croak(ErrRetlenIsZero, "checkNFC or -NFKC"); 679 680 curCC = getCombinClass(uv); 681 if (preCC > curCC && curCC != 0) { /* canonical ordering violated */ 682 result = FALSE; 683 break; 684 } 685 686 /* get NFC/NFKC property */ 687 if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */ 688 ; /* YES */ 689 else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { 690 result = FALSE; 691 break; 692 } 693 else if (isComp2nd(uv)) 694 isMAYBE = TRUE; 695 else if (ix) { 696 char *canon, *compat; 697 /* NFKC_NO when having compatibility mapping. */ 698 canon = (char *) dec_canonical(uv); 699 compat = (char *) dec_compat(uv); 700 if (compat && !(canon && strEQ(canon, compat))) { 701 result = FALSE; 702 break; 703 } 704 } /* end of get NFC/NFKC property */ 705 706 preCC = curCC; 707 } 708 if (isMAYBE && result) /* NO precedes MAYBE */ 709 XSRETURN_UNDEF; 710 RETVAL = boolSV(result); 711 OUTPUT: 712 RETVAL 713 714 715 SV* 716 checkFCD(src) 717 SV * src 718 PROTOTYPE: $ 719 ALIAS: 720 checkFCC = 1 721 PREINIT: 722 STRLEN srclen, retlen; 723 U8 *s, *e, *p, curCC, preCC; 724 bool result = TRUE; 725 bool isMAYBE = FALSE; 726 CODE: 727 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); 728 e = s + srclen; 729 preCC = 0; 730 for (p = s; p < e; p += retlen) { 731 U8 *sCan; 732 UV uvLead; 733 STRLEN canlen = 0; 734 UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF); 735 if (!retlen) 736 croak(ErrRetlenIsZero, "checkFCD or -FCC"); 737 738 sCan = (U8*) dec_canonical(uv); 739 740 if (sCan) { 741 STRLEN canret; 742 canlen = (STRLEN)strlen((char *) sCan); 743 uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF); 744 if (!canret) 745 croak(ErrRetlenIsZero, "checkFCD or -FCC"); 746 } 747 else { 748 uvLead = uv; 749 } 750 751 curCC = getCombinClass(uvLead); 752 753 if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */ 754 result = FALSE; 755 break; 756 } 757 758 if (ix) { 759 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) { 760 result = FALSE; 761 break; 762 } 763 else if (isComp2nd(uv)) 764 isMAYBE = TRUE; 765 } 766 767 if (sCan) { 768 STRLEN canret; 769 UV uvTrail; 770 U8* eCan = sCan + canlen; 771 U8* pCan = utf8_hop(eCan, -1); 772 if (pCan < sCan) 773 croak(ErrHopBeforeStart); 774 uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF); 775 if (!canret) 776 croak(ErrRetlenIsZero, "checkFCD or -FCC"); 777 preCC = getCombinClass(uvTrail); 778 } 779 else { 780 preCC = curCC; 781 } 782 } 783 if (isMAYBE && result) /* NO precedes MAYBE */ 784 XSRETURN_UNDEF; 785 RETVAL = boolSV(result); 786 OUTPUT: 787 RETVAL 788 789 790 U8 791 getCombinClass(uv) 792 UV uv 793 PROTOTYPE: $ 794 795 bool 796 isExclusion(uv) 797 UV uv 798 PROTOTYPE: $ 799 800 bool 801 isSingleton(uv) 802 UV uv 803 PROTOTYPE: $ 804 805 bool 806 isNonStDecomp(uv) 807 UV uv 808 PROTOTYPE: $ 809 810 bool 811 isComp2nd(uv) 812 UV uv 813 PROTOTYPE: $ 814 ALIAS: 815 isNFC_MAYBE = 1 816 isNFKC_MAYBE = 2 817 INIT: 818 PERL_UNUSED_VAR(ix); 819 820 SV* 821 isNFD_NO(uv) 822 UV uv 823 PROTOTYPE: $ 824 ALIAS: 825 isNFKD_NO = 1 826 PREINIT: 827 bool result = FALSE; 828 CODE: 829 if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) 830 result = TRUE; /* NFD_NO or NFKD_NO */ 831 RETVAL = boolSV(result); 832 OUTPUT: 833 RETVAL 834 835 836 SV* 837 isComp_Ex(uv) 838 UV uv 839 PROTOTYPE: $ 840 ALIAS: 841 isNFC_NO = 0 842 isNFKC_NO = 1 843 PREINIT: 844 bool result = FALSE; 845 CODE: 846 if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) 847 result = TRUE; /* NFC_NO or NFKC_NO */ 848 else if (ix) { 849 char *canon, *compat; 850 canon = (char *) dec_canonical(uv); 851 compat = (char *) dec_compat(uv); 852 if (compat && (!canon || strNE(canon, compat))) 853 result = TRUE; /* NFC_NO or NFKC_NO */ 854 } 855 RETVAL = boolSV(result); 856 OUTPUT: 857 RETVAL 858 859 SV* 860 getComposite(uv, uv2) 861 UV uv 862 UV uv2 863 PROTOTYPE: $$ 864 PREINIT: 865 UV composite; 866 CODE: 867 composite = composite_uv(uv, uv2); 868 RETVAL = composite ? newSVuv(composite) : &PL_sv_undef; 869 OUTPUT: 870 RETVAL 871 872 873 874 SV* 875 getCanon(uv) 876 UV uv 877 PROTOTYPE: $ 878 ALIAS: 879 getCompat = 1 880 CODE: 881 if (Hangul_IsS(uv)) { 882 U8 tmp[3 * UTF8_MAXLEN + 1]; 883 U8 *t = tmp; 884 U8 *e = pv_cat_decompHangul(aTHX_ t, uv); 885 RETVAL = newSVpvn((char *)t, e - t); 886 } else { 887 U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv); 888 if (!rstr) 889 XSRETURN_UNDEF; 890 RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr)); 891 } 892 SvUTF8_on(RETVAL); 893 OUTPUT: 894 RETVAL 895 896 897 void 898 splitOnLastStarter(src) 899 SV * src 900 PREINIT: 901 SV *svp; 902 STRLEN srclen; 903 U8 *s, *e, *p; 904 PPCODE: 905 s = (U8*)sv_2pvunicode(aTHX_ src,&srclen); 906 e = s + srclen; 907 p = e; 908 while (s < p) { 909 UV uv; 910 p = utf8_hop(p, -1); 911 if (p < s) 912 croak(ErrHopBeforeStart); 913 uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF); 914 if (getCombinClass(uv) == 0) /* Last Starter found */ 915 break; 916 } 917 918 svp = sv_2mortal(newSVpvn((char*)s, p - s)); 919 SvUTF8_on(svp); 920 XPUSHs(svp); 921 922 svp = sv_2mortal(newSVpvn((char*)p, e - p)); 923 SvUTF8_on(svp); 924 XPUSHs(svp); 925 926