1
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3
4 /* private functions which need pTHX_ and aTHX_
5 pv_cat_decompHangul
6 sv_2pvunicode
7 pv_utf8_decompose
8 pv_utf8_reorder
9 pv_utf8_compose
10 */
11
12 #include "EXTERN.h"
13 #include "perl.h"
14 #include "XSUB.h"
15
16 #define NEED_utf8_to_uvchr_buf
17 #include "ppport.h"
18
19 /* These 5 files are prepared by mkheader */
20 #include "unfcmb.h"
21 #include "unfcan.h"
22 #include "unfcpt.h"
23 #include "unfcmp.h"
24 #include "unfexc.h"
25
/* The generated normalization tables since v5.20 are in native character set
 * terms.  Prior to that, they were in Unicode terms.  So we use 'uvchr' for
 * later perls, and redefine that to be 'uvuni' for earlier ones.
 * When uvuni_to_utf8 is not available as a macro either, fall back to
 * uv_to_utf8 (the Perl 5.6.1 case). */
#if PERL_VERSION_LT(5,20,0)
#   undef uvchr_to_utf8
#   ifdef uvuni_to_utf8
#       define uvchr_to_utf8   uvuni_to_utf8
#   else /* Perl 5.6.1 */
#       define uvchr_to_utf8   uv_to_utf8
#   endif
#endif
37
/* Check that the output buffer has room for (need) more bytes before
 * calling uvchr_to_utf8(), and grow it with Renew() when it has not.
 *
 * dstart (buffer start), d (write cursor) and dlen (usable size; the
 * allocation is always dlen+1) must be defined in the enclosing scope.
 * When the buffer is grown, dlen is increased by (need) and d is re-based
 * onto the possibly moved buffer.
 *
 * The expansion begins with a declaration of 'curlen' and ends with '}',
 * so it has to be placed where a declaration is allowed, at most once per
 * scope, and call sites do not write a trailing semicolon. */
#define Renew_d_if_not_enough_to(need)  STRLEN curlen = d - dstart;     \
                if (dlen < curlen + (need)) {   \
                    dlen += (need);     \
                    Renew(dstart, dlen+1, U8);  \
                    d = dstart + curlen;        \
                }
46
/* croak() format used when utf8_to_uvchr_buf() reports a length of 0
 * (possibly broken input); %s receives the name of the operation. */
#define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"

/* croak() message used when utf8_hop() moved to a position before the
 * start of the string.  Maybe broken UTF-8. */
#define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"

/* At present, char > 0x10ffff are unaffected without complaint, right? */
/* The table lookups below return "no entry" for anything OVER_UTF_MAX. */
#define VALID_UTF_MAX    (0x10ffff)
#define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))

/* size of array for combining characters */
/* CC_SEQ_SIZE is the size of the on-stack array (enough as an initial
 * value?); CC_SEQ_STEP is how many elements are added on each extension. */
#define CC_SEQ_SIZE (10)
#define CC_SEQ_STEP  (5)
61
/* HANGUL begin */
/* Code point ranges and counts used for the arithmetic (table-free)
 * decomposition and composition of Hangul syllables. */

/* precomposed syllables: SBase..SFinal */
#define Hangul_SBase  0xAC00
#define Hangul_SFinal 0xD7A3
#define Hangul_SCount  11172

/* divisor that yields the L index from a syllable index
 * (588 == Hangul_VCount * Hangul_TCount) */
#define Hangul_NCount    588

/* leading jamo (L) */
#define Hangul_LBase  0x1100
#define Hangul_LFinal 0x1112
#define Hangul_LCount     19

/* vowel jamo (V) */
#define Hangul_VBase  0x1161
#define Hangul_VFinal 0x1175
#define Hangul_VCount     21

/* trailing jamo (T); T index 0 stands for "no trailing jamo" */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal 0x11C2
#define Hangul_TCount     28

/* range predicates */
#define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
/* offset from SBase is a multiple of TCount; only meaningful with IsS */
#define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
/* note the strict '<': TBase itself is not accepted as a trailing jamo */
#define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
/* HANGUL end */
88
/* One buffered combining character (c.c.), as collected by
 * pv_utf8_reorder() and sorted by compare_cc() for canonical ordering. */
typedef struct {
    U8 cc;	/* combining class */
    UV uv;	/* codepoint */
    STRLEN pos; /* position within the current run, used as a tie-breaker */
} UNF_cc;
95
compare_cc(const void * a,const void * b)96 static int compare_cc(const void *a, const void *b)
97 {
98 int ret_cc;
99 ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
100 if (ret_cc)
101 return ret_cc;
102
103 return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
104 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
105 }
106
dec_canonical(UV uv)107 static U8* dec_canonical(UV uv)
108 {
109 U8 ***plane, **row;
110 if (OVER_UTF_MAX(uv))
111 return NULL;
112 plane = (U8***)UNF_canon[uv >> 16];
113 if (! plane)
114 return NULL;
115 row = plane[(U8) (uv >> 8)];
116 return row ? row[(U8) uv] : NULL;
117 }
118
dec_compat(UV uv)119 static U8* dec_compat(UV uv)
120 {
121 U8 ***plane, **row;
122 if (OVER_UTF_MAX(uv))
123 return NULL;
124 plane = (U8***)UNF_compat[uv >> 16];
125 if (! plane)
126 return NULL;
127 row = plane[(U8) (uv >> 8)];
128 return row ? row[(U8) uv] : NULL;
129 }
130
/* Return the code point that the pair (uv, uv2) composes to, or 0 when
 * there is none.
 * 0 is also returned when uv2 is 0 or either argument is above
 * VALID_UTF_MAX.  Hangul L+V and LV+T pairs are composed arithmetically;
 * everything else is looked up in UNF_compos, whose per-character list is
 * terminated by an entry with nextchar == 0. */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *entry;

    if (!uv2 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
        return 0;

    /* L + V => LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
        UV lv_index = (uv - Hangul_LBase) * Hangul_VCount
                    + (uv2 - Hangul_VBase);
        return Hangul_SBase + lv_index * Hangul_TCount;
    }

    /* LV + T => LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
        return uv + (uv2 - Hangul_TBase);

    plane = UNF_compos[uv >> 16];
    if (plane == NULL)
        return 0;

    row = plane[(U8) (uv >> 8)];
    if (row == NULL)
        return 0;

    entry = row[(U8) uv];
    if (entry == NULL)
        return 0;

    while (entry->nextchar) {
        if (entry->nextchar == uv2)
            return entry->composite;
        ++entry;
    }
    return 0;
}
163
getCombinClass(UV uv)164 static U8 getCombinClass(UV uv)
165 {
166 U8 **plane, *row;
167 if (OVER_UTF_MAX(uv))
168 return 0;
169 plane = (U8**)UNF_combin[uv >> 16];
170 if (! plane)
171 return 0;
172 row = plane[(U8) (uv >> 8)];
173 return row ? row[(U8) uv] : 0;
174 }
175
pv_cat_decompHangul(pTHX_ U8 * d,UV uv)176 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
177 {
178 UV sindex = uv - Hangul_SBase;
179 UV lindex = sindex / Hangul_NCount;
180 UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
181 UV tindex = sindex % Hangul_TCount;
182
183 if (! Hangul_IsS(uv))
184 return d;
185
186 d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
187 d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
188 if (tindex)
189 d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
190 return d;
191 }
192
/* Return the string value of sv as UTF-8 and, when lp is non-NULL, store
 * its byte length in *lp.
 * If sv is not flagged UTF-8, a mortal copy is made and upgraded, and the
 * returned pointer refers to that copy; sv itself is left untouched. */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *pv = SvPV(sv, len);

    if (!SvUTF8(sv)) {
        /* work on a mortal copy so the caller's SV is not upgraded */
        SV* copy = sv_2mortal(newSVpvn(pv, len));
        if (!SvPOK(copy))
            pv = SvPV_force(copy, len);
        sv_utf8_upgrade(copy);
        pv = SvPV(copy, len);
    }

    if (lp != NULL)
        *lp = len;
    return pv;
}
209
210 static
pv_utf8_decompose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscompat)211 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
212 {
213 U8* p = s;
214 U8* e = s + slen;
215 U8* dstart = *dp;
216 U8* d = dstart;
217
218 while (p < e) {
219 STRLEN retlen;
220 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
221 if (!retlen)
222 croak(ErrRetlenIsZero, "decompose");
223 p += retlen;
224
225 if (Hangul_IsS(uv)) {
226 Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
227 d = pv_cat_decompHangul(aTHX_ d, uv);
228 }
229 else {
230 U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
231
232 if (r) {
233 STRLEN len = (STRLEN)strlen((char *)r);
234 Renew_d_if_not_enough_to(len)
235 while (len--)
236 *d++ = *r++;
237 }
238 else {
239 Renew_d_if_not_enough_to(UTF8_MAXLEN)
240 d = uvchr_to_utf8(d, uv);
241 }
242 }
243 }
244 *dp = dstart;
245 return d;
246 }
247
248 static
pv_utf8_reorder(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen)249 U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
250 {
251 U8* p = s;
252 U8* e = s + slen;
253 U8* dstart = *dp;
254 U8* d = dstart;
255
256 UNF_cc seq_ary[CC_SEQ_SIZE];
257 UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
258 UNF_cc* seq_ext = NULL; /* extend if need */
259 STRLEN seq_max = CC_SEQ_SIZE;
260 STRLEN cc_pos = 0;
261
262 while (p < e) {
263 U8 curCC;
264 STRLEN retlen;
265 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
266 if (!retlen)
267 croak(ErrRetlenIsZero, "reorder");
268 p += retlen;
269
270 curCC = getCombinClass(uv);
271
272 if (curCC != 0) {
273 if (seq_max < cc_pos + 1) { /* extend if need */
274 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
275 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
276 STRLEN i;
277 New(0, seq_ext, seq_max, UNF_cc);
278 for (i = 0; i < cc_pos; i++)
279 seq_ext[i] = seq_ary[i];
280 }
281 else {
282 Renew(seq_ext, seq_max, UNF_cc);
283 }
284 seq_ptr = seq_ext; /* use seq_ext from now */
285 }
286
287 seq_ptr[cc_pos].cc = curCC;
288 seq_ptr[cc_pos].uv = uv;
289 seq_ptr[cc_pos].pos = cc_pos;
290 ++cc_pos;
291
292 if (p < e)
293 continue;
294 }
295
296 /* output */
297 if (cc_pos) {
298 STRLEN i;
299
300 if (cc_pos > 1) /* reordered if there are two c.c.'s */
301 qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);
302
303 for (i = 0; i < cc_pos; i++) {
304 Renew_d_if_not_enough_to(UTF8_MAXLEN)
305 d = uvchr_to_utf8(d, seq_ptr[i].uv);
306 }
307 cc_pos = 0;
308 }
309
310 if (curCC == 0) {
311 Renew_d_if_not_enough_to(UTF8_MAXLEN)
312 d = uvchr_to_utf8(d, uv);
313 }
314 }
315 if (seq_ext)
316 Safefree(seq_ext);
317 *dp = dstart;
318 return d;
319 }
320
321 static
pv_utf8_compose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscontig)322 U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
323 {
324 U8* p = s;
325 U8* e = s + slen;
326 U8* dstart = *dp;
327 U8* d = dstart;
328
329 UV uvS = 0; /* code point of the starter */
330 bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
331 U8 preCC = 0;
332
333 UV seq_ary[CC_SEQ_SIZE];
334 UV* seq_ptr = seq_ary; /* use array at the beginning */
335 UV* seq_ext = NULL; /* extend if need */
336 STRLEN seq_max = CC_SEQ_SIZE;
337 STRLEN cc_pos = 0;
338
339 while (p < e) {
340 U8 curCC;
341 STRLEN retlen;
342 UV uv = utf8_to_uvchr_buf(p, e, &retlen);
343 if (!retlen)
344 croak(ErrRetlenIsZero, "compose");
345 p += retlen;
346
347 curCC = getCombinClass(uv);
348
349 if (!valid_uvS) {
350 if (curCC == 0) {
351 uvS = uv; /* the first Starter is found */
352 valid_uvS = TRUE;
353 if (p < e)
354 continue;
355 }
356 else {
357 Renew_d_if_not_enough_to(UTF8_MAXLEN)
358 d = uvchr_to_utf8(d, uv);
359 continue;
360 }
361 }
362 else {
363 bool composed;
364
365 /* blocked */
366 if ((iscontig && cc_pos) || /* discontiguous combination */
367 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
368 (preCC > curCC)) /* blocked by higher CC: revised D2 */
369 composed = FALSE;
370
371 /* not blocked:
372 iscontig && cc_pos == 0 -- contiguous combination
373 curCC == 0 && preCC == 0 -- starter + starter
374 curCC != 0 && preCC < curCC -- lower CC */
375 else {
376 /* try composition */
377 UV uvComp = composite_uv(uvS, uv);
378
379 if (uvComp && !isExclusion(uvComp)) {
380 uvS = uvComp;
381 composed = TRUE;
382
383 /* preCC should not be changed to curCC */
384 /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
385 if (p < e)
386 continue;
387 }
388 else
389 composed = FALSE;
390 }
391
392 if (!composed) {
393 preCC = curCC;
394 if (curCC != 0 || !(p < e)) {
395 if (seq_max < cc_pos + 1) { /* extend if need */
396 seq_max = cc_pos + CC_SEQ_STEP; /* new size */
397 if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
398 New(0, seq_ext, seq_max, UV);
399 Copy(seq_ary, seq_ext, cc_pos, UV);
400 }
401 else {
402 Renew(seq_ext, seq_max, UV);
403 }
404 seq_ptr = seq_ext; /* use seq_ext from now */
405 }
406 seq_ptr[cc_pos] = uv;
407 ++cc_pos;
408 }
409 if (curCC != 0 && p < e)
410 continue;
411 }
412 }
413
414 /* output */
415 {
416 Renew_d_if_not_enough_to(UTF8_MAXLEN)
417 d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
418 }
419
420 if (cc_pos) {
421 STRLEN i;
422
423 for (i = 0; i < cc_pos; i++) {
424 Renew_d_if_not_enough_to(UTF8_MAXLEN)
425 d = uvchr_to_utf8(d, seq_ptr[i]);
426 }
427 cc_pos = 0;
428 }
429
430 uvS = uv;
431 }
432 if (seq_ext)
433 Safefree(seq_ext);
434 *dp = dstart;
435 return d;
436 }
437
438 MODULE = Unicode::Normalize PACKAGE = Unicode::Normalize
439
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return the decomposition of src as a new UTF-8 string; a true
       'compat' selects compatibility decomposition.
       The result SV is created only after pv_utf8_decompose() has
       returned, so that no SV is left behind if it croaks. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
461
462
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return src with its combining characters in canonical order, as a
       new UTF-8 string.
       The result SV is created only after pv_utf8_reorder() has
       returned, so that no SV is left behind if it croaks. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
483
484
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return the composition of src as a new UTF-8 string.  When called
       as composeContiguous (ix == 1) only contiguous composition is done.
       The result SV is created only after pv_utf8_compose() has
       returned, so that no SV is left behind if it croaks. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    RETVAL = newSVpvn((char *)d, dend - d);
    SvUTF8_on(RETVAL);
    Safefree(d);
  OUTPUT:
    RETVAL
507
508
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    U8 *s, *dec, *dec_end, *ord, *ord_end;
    STRLEN slen, dec_len, ord_len;
  CODE:
    /* Decompose src (compatibility decomposition when called as NFKD),
       reorder the result and return it as a new UTF-8 string. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose */
    dec_len = slen;
    New(0, dec, dec_len+1, U8);
    dec_end = pv_utf8_decompose(aTHX_ s, slen, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec; /* bytes used; allocated size is unknown now */

    /* step 2: reorder */
    ord_len = dec_len;
    New(0, ord, ord_len+1, U8);
    ord_end = pv_utf8_reorder(aTHX_ dec, dec_len, &ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord; /* bytes used; allocated size is unknown now */

    /* build the return value, then release the work buffers */
    RETVAL = newSVpvn((char *)ord, ord_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(ord);
  OUTPUT:
    RETVAL
546
547
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    U8 *s, *dec, *dec_end, *ord, *ord_end, *cmp, *cmp_end;
    STRLEN slen, dec_len, ord_len, cmp_len;
  CODE:
    /* Decompose src (compatibility decomposition when called as NFKC),
       reorder, then compose (contiguous composition only when called as
       FCC) and return the result as a new UTF-8 string. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* step 1: decompose */
    dec_len = slen;
    New(0, dec, dec_len+1, U8);
    dec_end = pv_utf8_decompose(aTHX_ s, slen, &dec, dec_len, (bool)(ix==1));
    *dec_end = '\0';
    dec_len = dec_end - dec; /* bytes used; allocated size is unknown now */

    /* step 2: reorder */
    ord_len = dec_len;
    New(0, ord, ord_len+1, U8);
    ord_end = pv_utf8_reorder(aTHX_ dec, dec_len, &ord, ord_len);
    *ord_end = '\0';
    ord_len = ord_end - ord; /* bytes used; allocated size is unknown now */

    /* step 3: compose */
    cmp_len = ord_len;
    New(0, cmp, cmp_len+1, U8);
    cmp_end = pv_utf8_compose(aTHX_ ord, ord_len, &cmp, cmp_len, (bool)(ix==2));
    *cmp_end = '\0';
    cmp_len = cmp_end - cmp; /* bytes used; allocated size is unknown now */

    /* build the return value, then release the work buffers */
    RETVAL = newSVpvn((char *)cmp, cmp_len);
    SvUTF8_on(RETVAL);

    Safefree(dec);
    Safefree(ord);
    Safefree(cmp);
  OUTPUT:
    RETVAL
594
595
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Quick check: returns true when src is already in NFD (NFKD when
       called as checkNFKD), false otherwise.  A string fails as soon as
       a character violates canonical ordering, is a Hangul syllable, or
       has a decomposition mapping. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFD or -NFKD");

        curCC = getCombinClass(uv);
        if ((curCC != 0 && curCC < preCC)   /* canonical ordering violated */
            || Hangul_IsS(uv)
            || (ix ? dec_compat(uv) : dec_canonical(uv)))
        {
            result = FALSE;
            break;
        }
        preCC = curCC;
        p += retlen;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
630
631
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for NFC (NFKC when called as checkNFKC).
       Returns false (NO), undef (MAYBE) or true (YES); NO takes
       precedence over MAYBE. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    p = s;
    while (p < e) {
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkNFC or -NFKC");

        curCC = getCombinClass(uv);
        if (curCC != 0 && preCC > curCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        /* get NFC/NFKC property */
        /* Hangul syllables are canonical composites: YES, nothing to do */
        if (!Hangul_IsS(uv)) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            if (isComp2nd(uv)) {
                isMAYBE = TRUE;
            }
            else if (ix) {
                /* NFKC_NO when having compatibility mapping. */
                char *canon  = (char *) dec_canonical(uv);
                char *compat = (char *) dec_compat(uv);
                if (compat && (!canon || strNE(canon, compat))) {
                    result = FALSE;
                    break;
                }
            }
        } /* end of get NFC/NFKC property */

        preCC = curCC;
        p += retlen;
    }
    if (result && isMAYBE) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
686
687
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check for FCD (FCC when called as checkFCC).
       For each character, the combining class of the first character of
       its canonical decomposition is compared with that of the last
       character of the previous one.  Returns false (NO), undef (MAYBE,
       only possible for checkFCC) or true (YES). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    p = s;
    while (p < e) {
        U8 *canon;
        U8 *canon_end = NULL;
        UV lead;
        UV uv = utf8_to_uvchr_buf(p, e, &retlen);
        if (!retlen)
            croak(ErrRetlenIsZero, "checkFCD or -FCC");

        /* leading character: of the decomposition if any, else uv */
        canon = (U8*) dec_canonical(uv);
        if (canon == NULL) {
            lead = uv;
        }
        else {
            STRLEN leadlen;
            canon_end = canon + (STRLEN)strlen((char *) canon);
            lead = utf8_to_uvchr_buf(canon, canon_end, &leadlen);
            if (!leadlen)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
        }

        curCC = getCombinClass(lead);

        if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
            result = FALSE;
            break;
        }

        if (ix) {
            if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
                result = FALSE;
                break;
            }
            if (isComp2nd(uv))
                isMAYBE = TRUE;
        }

        /* trailing character: of the decomposition if any, else uv */
        if (canon == NULL) {
            preCC = curCC;
        }
        else {
            STRLEN traillen;
            UV trail;
            U8* last = utf8_hop(canon_end, -1);
            if (last < canon)
                croak(ErrHopBeforeStart);
            trail = utf8_to_uvchr_buf(last, canon_end, &traillen);
            if (!traillen)
                croak(ErrRetlenIsZero, "checkFCD or -FCC");
            preCC = getCombinClass(trail);
        }
        p += retlen;
    }
    if (result && isMAYBE) /* NO precedes MAYBE */
        XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
761
762
    # Perl-level getCombinClass(uv).  No CODE: section is given, so xsubpp
    # generates a direct call to the C function of the same name and
    # returns its U8 result.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
767
    # Perl-level isExclusion(uv).  No CODE: section is given, so xsubpp
    # generates a direct call to the C isExclusion(), which is not defined
    # in this part of the file (presumably in one of the generated headers).
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
772
    # Perl-level isSingleton(uv).  No CODE: section is given, so xsubpp
    # generates a direct call to the C isSingleton(), which is not defined
    # in this part of the file (presumably in one of the generated headers).
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
777
    # Perl-level isNonStDecomp(uv).  No CODE: section is given, so xsubpp
    # generates a direct call to the C isNonStDecomp(), which is not defined
    # in this part of the file (presumably in one of the generated headers).
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
782
    # Perl-level isComp2nd(uv), also reachable as isNFC_MAYBE and
    # isNFKC_MAYBE.  The alias index ix is deliberately unused, so all
    # three names give the same answer; PERL_UNUSED_VAR only silences the
    # compiler warning about it.
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE = 1
    isNFKC_MAYBE = 2
  INIT:
    PERL_UNUSED_VAR(ix);
792
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* True when uv is a Hangul syllable or has a canonical decomposition
       (compatibility decomposition when called as isNFKD_NO). */
    if (Hangul_IsS(uv))
        RETVAL = boolSV(TRUE);  /* NFD_NO or NFKD_NO */
    else if (ix)
        RETVAL = boolSV(dec_compat(uv) != NULL);
    else
        RETVAL = boolSV(dec_canonical(uv) != NULL);
  OUTPUT:
    RETVAL
807
808
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO = 0
    isNFKC_NO = 1
  PREINIT:
    bool is_no;
  CODE:
    /* True when uv is an exclusion, a singleton or a non-starter
       decomposition.  When called as isNFKC_NO it is also true when uv
       has a compatibility mapping that differs from its canonical one
       (or has no canonical one). */
    is_no = (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv))
            ? TRUE : FALSE;     /* NFC_NO or NFKC_NO */
    if (!is_no && ix) {
        char *canon  = (char *) dec_canonical(uv);
        char *compat = (char *) dec_compat(uv);
        if (compat && !(canon && strEQ(canon, compat)))
            is_no = TRUE;       /* NFC_NO or NFKC_NO */
    }
    RETVAL = boolSV(is_no);
  OUTPUT:
    RETVAL
831
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV comp;
  CODE:
    /* Return the code point that (uv, uv2) composes to, or undef when
       composite_uv() reports none (0). */
    comp = composite_uv(uv, uv2);
    if (comp)
        RETVAL = newSVuv(comp);
    else
        RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
844
845
846
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Return the canonical decomposition of uv (compatibility
       decomposition when called as getCompat) as a UTF-8 string, or
       undef when uv has none.  Hangul syllables are decomposed
       arithmetically. */
    if (!Hangul_IsS(uv)) {
        U8* mapping = ix ? dec_compat(uv) : dec_canonical(uv);
        if (mapping == NULL)
            XSRETURN_UNDEF;
        RETVAL = newSVpvn((char *)mapping, strlen((char *)mapping));
    }
    else {
        /* room for three jamo */
        U8 jamo[3 * UTF8_MAXLEN + 1];
        U8 *jamo_end = pv_cat_decompHangul(aTHX_ jamo, uv);
        RETVAL = newSVpvn((char *)jamo, jamo_end - jamo);
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
868
869
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *part;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /* Split src in front of its last character of combining class 0 and
       push two UTF-8 strings: everything before that character, and the
       rest starting with it.  When no such character exists the first
       string is empty. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    /* walk backwards one character at a time */
    for (p = e; s < p; ) {
        UV uv;
        p = utf8_hop(p, -1);
        if (p < s)
            croak(ErrHopBeforeStart);
        uv = utf8_to_uvchr_buf(p, e, NULL);
        if (getCombinClass(uv) == 0) /* Last Starter found */
            break;
    }

    /* part before the last starter */
    part = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(part);
    XPUSHs(part);

    /* last starter and everything after it */
    part = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(part);
    XPUSHs(part);
898
899