1 
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3 
4 /* private functions which need pTHX_ and aTHX_
5     pv_cat_decompHangul
6     sv_2pvunicode
7     pv_utf8_decompose
8     pv_utf8_reorder
9     pv_utf8_compose
10 */
11 
12 #include "EXTERN.h"
13 #include "perl.h"
14 #include "XSUB.h"
15 
16 #define NEED_utf8_to_uvchr_buf
17 #include "ppport.h"
18 
19 /* These 5 files are prepared by mkheader */
20 #include "unfcmb.h"
21 #include "unfcan.h"
22 #include "unfcpt.h"
23 #include "unfcmp.h"
24 #include "unfexc.h"
25 
26 /* The generated normalization tables since v5.20 are in native character set
27  * terms.  Prior to that, they were in Unicode terms.  So we use 'uvchr' for
28  * later perls, and redefine that to be 'uvuni' for earlier ones */
29 #if PERL_VERSION_LT(5,20,0)
30 #   undef uvchr_to_utf8
31 #   ifdef uvuni_to_utf8
32 #       define uvchr_to_utf8   uvuni_to_utf8
33 #   else /* Perl 5.6.1 */
34 #       define uvchr_to_utf8   uv_to_utf8
35 #   endif
36 #endif
37 
/*
 * Renew_d_if_not_enough_to(need)
 *
 * Make sure at least (need) more bytes can be written at d before calling
 * uvchr_to_utf8() or copying a mapping.  dstart, d, and dlen must be
 * modifiable variables of the enclosing scope: dstart is the start of the
 * output buffer, d the current write position, and dlen its usable size
 * (the buffer is always allocated with dlen+1 bytes).  When the buffer is
 * too small it is grown by (need) bytes with Renew() and d is re-based,
 * because the buffer may have moved.
 *
 * The expansion is a brace-enclosed block so that the temporary curlen stays
 * private: the macro may follow ordinary statements and may be used more
 * than once in the same scope.  It is deliberately not do { } while (0),
 * because the call sites invoke it without a trailing semicolon.
 */
#define Renew_d_if_not_enough_to(need)	{			\
		STRLEN curlen = d - dstart;			\
		if (dlen < curlen + (need)) {			\
		    dlen += (need);				\
		    Renew(dstart, dlen+1, U8);			\
		    d = dstart + curlen;			\
		}						\
	    }
46 
/* croak message used when utf8_to_uvchr_buf() reports a character length of 0 (malformed input?) */
48 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
49 
/* croak message used when utf8_hop() steps back past the start of the string (probably broken UTF-8) */
51 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
52 
/* Code points above 0x10FFFF are currently passed through unchanged, without complaint. */
54 #define VALID_UTF_MAX    (0x10ffff)
55 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
56 
/* CC_SEQ_SIZE: initial (on-stack) array size for a run of combining characters. */
/* CC_SEQ_STEP: number of extra elements added each time that array must grow. */
59 #define CC_SEQ_SIZE (10)
60 #define CC_SEQ_STEP  (5)
61 
/* HANGUL begin */
/*
 * Constants for the arithmetic composition/decomposition of Hangul
 * syllables.  L, V and T are the leading-consonant, vowel and
 * trailing-consonant jamo ranges; S is the precomposed syllable block.
 * The derived values are spelled out in the trailing comments.
 */
#define Hangul_LCount     19
#define Hangul_VCount     21
#define Hangul_TCount     28
#define Hangul_NCount    (Hangul_VCount * Hangul_TCount)	/* 588 */
#define Hangul_SCount    (Hangul_LCount * Hangul_NCount)	/* 11172 */

#define Hangul_SBase  0xAC00
#define Hangul_SFinal (Hangul_SBase + Hangul_SCount - 1)	/* 0xD7A3 */

#define Hangul_LBase  0x1100
#define Hangul_LFinal (Hangul_LBase + Hangul_LCount - 1)	/* 0x1112 */

#define Hangul_VBase  0x1161
#define Hangul_VFinal (Hangul_VBase + Hangul_VCount - 1)	/* 0x1175 */

/* TBase itself is not a trailing consonant: it stands for "no trailing jamo" */
#define Hangul_TBase  0x11A7
#define Hangul_TFinal (Hangul_TBase + Hangul_TCount - 1)	/* 0x11C2 */

/* syllable block membership */
#define Hangul_IsS(u)  (((u) >= Hangul_SBase) && ((u) <= Hangul_SFinal))
/* offset from SBase is a multiple of TCount, i.e. no trailing consonant */
#define Hangul_IsN(u)  ((((u) - Hangul_SBase) % Hangul_TCount) == 0)
/* syllable consisting of L + V only */
#define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
#define Hangul_IsL(u)  (((u) >= Hangul_LBase) && ((u) <= Hangul_LFinal))
#define Hangul_IsV(u)  (((u) >= Hangul_VBase) && ((u) <= Hangul_VFinal))
/* strictly greater than TBase: see the note on Hangul_TBase above */
#define Hangul_IsT(u)  (((u) >  Hangul_TBase) && ((u) <= Hangul_TFinal))
/* HANGUL end */
88 
/* this is used for canonical ordering of combining characters (c.c.). */
/* One element per combining character collected by pv_utf8_reorder();
 * an array of these is sorted with compare_cc(). */
typedef struct {
    U8 cc;	/* combining class */
    UV uv;	/* codepoint */
    STRLEN pos; /* position within the collected run; tie-breaker that keeps
		   characters of equal combining class in input order */
} UNF_cc;
95 
compare_cc(const void * a,const void * b)96 static int compare_cc(const void *a, const void *b)
97 {
98     int ret_cc;
99     ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
100     if (ret_cc)
101 	return ret_cc;
102 
103     return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
104 	 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
105 }
106 
dec_canonical(UV uv)107 static U8* dec_canonical(UV uv)
108 {
109     U8 ***plane, **row;
110     if (OVER_UTF_MAX(uv))
111 	return NULL;
112     plane = (U8***)UNF_canon[uv >> 16];
113     if (! plane)
114 	return NULL;
115     row = plane[(U8) (uv >> 8)];
116     return row ? row[(U8) uv] : NULL;
117 }
118 
dec_compat(UV uv)119 static U8* dec_compat(UV uv)
120 {
121     U8 ***plane, **row;
122     if (OVER_UTF_MAX(uv))
123 	return NULL;
124     plane = (U8***)UNF_compat[uv >> 16];
125     if (! plane)
126 	return NULL;
127     row = plane[(U8) (uv >> 8)];
128     return row ? row[(U8) uv] : NULL;
129 }
130 
/*
 * Return the primary composite of the pair (uv, uv2), or 0 if there is none.
 *
 * Hangul L+V and LV+T pairs are composed arithmetically; every other pair is
 * looked up in UNF_compos, whose cell for uv is a list of
 * (nextchar, composite) entries terminated by an entry with nextchar == 0.
 * A zero uv2, or either code point above VALID_UTF_MAX, never composes.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane;
    UNF_complist **row;
    UNF_complist *entry;

    if (uv2 == 0 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
	return 0;

    /* leading consonant + vowel => LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
	const UV l_off = uv  - Hangul_LBase;
	const UV v_off = uv2 - Hangul_VBase;
	return Hangul_SBase + (l_off * Hangul_VCount + v_off) * Hangul_TCount;
    }

    /* LV syllable + trailing consonant => LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
	return uv + (uv2 - Hangul_TBase);

    plane = UNF_compos[uv >> 16];
    if (plane == NULL)
	return 0;

    row = plane[(U8) (uv >> 8)];
    if (row == NULL)
	return 0;

    entry = row[(U8) uv];
    if (entry == NULL)
	return 0;

    /* scan the candidate list for the second character */
    while (entry->nextchar) {
	if (entry->nextchar == uv2)
	    return entry->composite;
	++entry;
    }
    return 0;
}
163 
getCombinClass(UV uv)164 static U8 getCombinClass(UV uv)
165 {
166     U8 **plane, *row;
167     if (OVER_UTF_MAX(uv))
168 	return 0;
169     plane = (U8**)UNF_combin[uv >> 16];
170     if (! plane)
171 	return 0;
172     row = plane[(U8) (uv >> 8)];
173     return row ? row[(U8) uv] : 0;
174 }
175 
pv_cat_decompHangul(pTHX_ U8 * d,UV uv)176 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
177 {
178     UV sindex =  uv - Hangul_SBase;
179     UV lindex =  sindex / Hangul_NCount;
180     UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
181     UV tindex =  sindex % Hangul_TCount;
182 
183     if (! Hangul_IsS(uv))
184 	return d;
185 
186     d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
187     d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
188     if (tindex)
189 	d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
190     return d;
191 }
192 
/*
 * Return the string value of sv as UTF-8, storing its byte length in *lp
 * when lp is non-NULL.
 *
 * If sv already has the UTF8 flag on, its own buffer is returned.  Otherwise
 * the bytes are copied into a new mortal SV, which is upgraded to UTF-8, and
 * the mortal's buffer is returned; sv itself is not upgraded.
 */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN len;
    char *s = SvPV(sv,len);

    if (!SvUTF8(sv)) {
	SV* upgraded = sv_2mortal(newSVpvn(s, len));

	if (!SvPOK(upgraded))
	    s = SvPV_force(upgraded,len);
	sv_utf8_upgrade(upgraded);
	s = SvPV(upgraded,len);
    }

    if (lp != NULL)
	*lp = len;
    return s;
}
209 
210 static
pv_utf8_decompose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscompat)211 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
212 {
213     U8* p = s;
214     U8* e = s + slen;
215     U8* dstart = *dp;
216     U8* d = dstart;
217 
218     while (p < e) {
219 	STRLEN retlen;
220 	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
221 	if (!retlen)
222 	    croak(ErrRetlenIsZero, "decompose");
223 	p += retlen;
224 
225 	if (Hangul_IsS(uv)) {
226 	    Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
227 	    d = pv_cat_decompHangul(aTHX_ d, uv);
228 	}
229 	else {
230 	    U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
231 
232 	    if (r) {
233 		STRLEN len = (STRLEN)strlen((char *)r);
234 		Renew_d_if_not_enough_to(len)
235 		while (len--)
236 		    *d++ = *r++;
237 	    }
238 	    else {
239 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
240 		d = uvchr_to_utf8(d, uv);
241 	    }
242 	}
243     }
244     *dp = dstart;
245     return d;
246 }
247 
/*
 * Put the combining characters of the UTF-8 string s (slen bytes) into
 * canonical order, writing the result to the buffer *dp.
 *
 *   dp    in/out: start of the output buffer; it may be reallocated, in
 *         which case *dp is updated on return
 *   dlen  usable size of *dp on entry (the buffer holds dlen+1 bytes)
 *
 * Each maximal run of characters with a non-zero combining class is
 * collected, sorted by combining class (stable, see compare_cc) and then
 * written out; characters with combining class 0 are written unchanged.
 * Returns the position just past the last byte written.  Croaks if a
 * character of length zero is decoded.
 */
static
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UNF_cc  seq_ary[CC_SEQ_SIZE];
    UNF_cc* seq_ptr = seq_ary; /* use array at the beginning */
    UNF_cc* seq_ext = NULL; /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE; /* capacity of the array seq_ptr points to */
    STRLEN cc_pos = 0; /* number of combining characters collected so far */

    while (p < e) {
	U8 curCC;
	STRLEN retlen;
	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
	if (!retlen)
	    croak(ErrRetlenIsZero, "reorder");
	p += retlen;

	curCC = getCombinClass(uv);

	if (curCC != 0) {
	    if (seq_max < cc_pos + 1) { /* extend if need */
		seq_max = cc_pos + CC_SEQ_STEP; /* new size */
		if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
		    /* first overflow: move from the stack array to the heap */
		    STRLEN i;
		    New(0, seq_ext, seq_max, UNF_cc);
		    for (i = 0; i < cc_pos; i++)
			seq_ext[i] = seq_ary[i];
		}
		else {
		    Renew(seq_ext, seq_max, UNF_cc);
		}
		seq_ptr = seq_ext; /* use seq_ext from now */
	    }

	    seq_ptr[cc_pos].cc  = curCC;
	    seq_ptr[cc_pos].uv  = uv;
	    seq_ptr[cc_pos].pos = cc_pos;
	    ++cc_pos;

	    /* keep collecting unless this was the last character */
	    if (p < e)
		continue;
	}

	/* output */
	if (cc_pos) {
	    STRLEN i;

	    if (cc_pos > 1) /* reordered if there are two c.c.'s */
		qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);

	    for (i = 0; i < cc_pos; i++) {
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, seq_ptr[i].uv);
	    }
	    cc_pos = 0;
	}

	/* the character that ended the run (combining class 0) follows it */
	if (curCC == 0) {
	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
	    d = uvchr_to_utf8(d, uv);
	}
    }
    if (seq_ext)
	Safefree(seq_ext);
    *dp = dstart;
    return d;
}
320 
/*
 * Compose the UTF-8 string s (slen bytes) into the buffer *dp.
 *
 *   dp       in/out: start of the output buffer; it may be reallocated, in
 *            which case *dp is updated on return
 *   dlen     usable size of *dp on entry (the buffer holds dlen+1 bytes)
 *   iscontig TRUE to compose only while no uncomposed character has been
 *            set aside since the starter (contiguous composition)
 *
 * The function keeps the current starter in uvS and tries to combine each
 * following character with it via composite_uv(), skipping composites for
 * which isExclusion() is true.  Characters that are blocked or do not
 * compose are set aside in seq_ptr and written after the starter.
 * Returns the position just past the last byte written.  Croaks if a
 * character of length zero is decoded.
 */
static
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UV uvS = 0; /* code point of the starter */
    bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
    U8 preCC = 0; /* combining class of the last uncomposed character */

    UV  seq_ary[CC_SEQ_SIZE];
    UV* seq_ptr = seq_ary; /* use array at the beginning */
    UV* seq_ext = NULL; /* extend if need */
    STRLEN seq_max = CC_SEQ_SIZE; /* capacity of the array seq_ptr points to */
    STRLEN cc_pos = 0; /* number of characters set aside after the starter */

    while (p < e) {
	U8 curCC;
	STRLEN retlen;
	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
	if (!retlen)
	    croak(ErrRetlenIsZero, "compose");
	p += retlen;

	curCC = getCombinClass(uv);

	if (!valid_uvS) {
	    if (curCC == 0) {
		uvS = uv; /* the first Starter is found */
		valid_uvS = TRUE;
		if (p < e)
		    continue;
		/* end of input: fall through to output the starter */
	    }
	    else {
		/* combining characters before any starter are copied as is */
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, uv);
		continue;
	    }
	}
	else {
	    bool composed;

	    /* blocked */
	    if ((iscontig && cc_pos) || /* discontiguous combination */
		 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
		 (preCC > curCC)) /* blocked by higher CC: revised D2 */
		composed = FALSE;

	    /* not blocked:
		 iscontig && cc_pos == 0      -- contiguous combination
		 curCC == 0 && preCC == 0     -- starter + starter
		 curCC != 0 && preCC < curCC  -- lower CC */
	    else {
		/* try composition */
		UV uvComp = composite_uv(uvS, uv);

		if (uvComp && !isExclusion(uvComp))  {
		    uvS = uvComp;
		    composed = TRUE;

		    /* preCC should not be changed to curCC */
		    /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
		    if (p < e)
			continue;
		    /* end of input: fall through to output */
		}
		else
		    composed = FALSE;
	    }

	    if (!composed) {
		preCC = curCC;
		/* set aside a combining character, or the final character
		   of the input so that it is flushed below */
		if (curCC != 0 || !(p < e)) {
		    if (seq_max < cc_pos + 1) { /* extend if need */
			seq_max = cc_pos + CC_SEQ_STEP; /* new size */
			if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
			    New(0, seq_ext, seq_max, UV);
			    Copy(seq_ary, seq_ext, cc_pos, UV);
			}
			else {
			    Renew(seq_ext, seq_max, UV);
			}
			seq_ptr = seq_ext; /* use seq_ext from now */
		    }
		    seq_ptr[cc_pos] = uv;
		    ++cc_pos;
		}
		if (curCC != 0 && p < e)
		    continue;
	    }
	}

	/* output */
	{
	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
	    d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
	}

	if (cc_pos) {
	    STRLEN i;

	    for (i = 0; i < cc_pos; i++) {
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, seq_ptr[i]);
	    }
	    cc_pos = 0;
	}

	/* the current character becomes the starter of the next segment */
	uvS = uv;
    }
    if (seq_ext)
	Safefree(seq_ext);
    *dp = dstart;
    return d;
}
437 
438 MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
439 
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    /* Returns a new UTF-8 string holding the decomposition of src without
     * reordering: compatibility mappings when compat is true, canonical
     * mappings otherwise. */
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    /* start with a buffer as large as the input; the helper grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
461 
462 
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    /* Returns a new UTF-8 string in which the combining characters of src
     * have been put into canonical order. */
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    /* start with a buffer as large as the input; the helper grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
483 
484 
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    /* Returns a new UTF-8 string holding the composition of src.  When
     * called as composeContiguous (ix == 1) only contiguous composition is
     * performed. */
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    /* start with a buffer as large as the input; the helper grows it */
    dlen = slen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
507 
508 
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    /* Returns src normalized by decomposition followed by canonical
     * reordering.  Called as NFKD (ix == 1), compatibility mappings are
     * used for the decomposition step. */
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder */
    dlen = tlen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)d, dlen);
    SvUTF8_on(dst);

    /* the intermediate buffers are no longer needed once dst has its copy */
    Safefree(t);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
546 
547 
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    /* Returns src normalized by decomposition, canonical reordering and
     * composition.  Called as NFKC (ix == 1), compatibility mappings are
     * used for the decomposition step; called as FCC (ix == 2), the
     * composition step is restricted to contiguous composition. */
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder */
    ulen = tlen;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
    *uend = '\0';
    ulen = uend - u; /* no longer know real size of u */

    /* compose */
    dlen = ulen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)d, dlen);
    SvUTF8_on(dst);

    /* the intermediate buffers are no longer needed once dst has its copy */
    Safefree(t);
    Safefree(u);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
594 
595 
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    /* Quick check: returns true if src is already in NFD (or, when called
     * as checkNFKD with ix == 1, in NFKD), false otherwise. */
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFD or -NFKD");

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}
	/* any character that still has a decomposition fails the check */
	if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
	    result = FALSE;
	    break;
	}
	preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
630 
631 
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    /* Quick check for NFC (or NFKC when called as checkNFKC, ix == 1).
     * Returns true for YES, false for NO, and undef for MAYBE, i.e. when no
     * NO character was found but at least one character may combine with a
     * preceding one (isComp2nd). */
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFC or -NFKC");

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}

	/* get NFC/NFKC property */
	if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
	    ; /* YES */
	else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
	    result = FALSE;
	    break;
	}
	else if (isComp2nd(uv))
	    isMAYBE = TRUE;
	else if (ix) {
	    char *canon, *compat;
	  /* NFKC_NO when having compatibility mapping. */
	  /* (a compatibility mapping equal to the canonical one does not count) */
	    canon  = (char *) dec_canonical(uv);
	    compat = (char *) dec_compat(uv);
	    if (compat && !(canon && strEQ(canon, compat))) {
		result = FALSE;
		break;
	    }
	} /* end of get NFC/NFKC property */

	preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
	XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
686 
687 
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    /* Quick check for FCD (or FCC when called as checkFCC, ix == 1).
     * For every character, the combining class of the first character of
     * its canonical decomposition is compared with the combining class of
     * the last character of the previous character's decomposition.
     * Returns true or false; in FCC mode undef is returned for MAYBE. */
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
	U8 *sCan;
	UV uvLead;
	STRLEN canlen = 0;
	UV uv = utf8_to_uvchr_buf(p, e, &retlen);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkFCD or -FCC");

	sCan = (U8*) dec_canonical(uv);

	/* uvLead: first character of the canonical decomposition, or the
	   character itself when it has none */
	if (sCan) {
	    STRLEN canret;
	    canlen = (STRLEN)strlen((char *) sCan);
	    uvLead = utf8_to_uvchr_buf(sCan, sCan + canlen, &canret);
	    if (!canret)
		croak(ErrRetlenIsZero, "checkFCD or -FCC");
	}
	else {
	    uvLead = uv;
	}

	curCC = getCombinClass(uvLead);

	if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}

	/* the composition-related properties are only checked for FCC */
	if (ix) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
		result = FALSE;
		break;
	    }
	    else if (isComp2nd(uv))
		isMAYBE = TRUE;
	}

	/* preCC for the next round: combining class of the last character
	   of the canonical decomposition */
	if (sCan) {
	    STRLEN canret;
	    UV uvTrail;
	    U8* eCan = sCan + canlen;
	    U8* pCan = utf8_hop(eCan, -1);
	    if (pCan < sCan)
		croak(ErrHopBeforeStart);
	    uvTrail = utf8_to_uvchr_buf(pCan, eCan, &canret);
	    if (!canret)
		croak(ErrRetlenIsZero, "checkFCD or -FCC");
	    preCC = getCombinClass(uvTrail);
	}
	else {
	    preCC = curCC;
	}
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
	XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
761 
762 
# Perl-visible wrapper: returns the combining class of the code point uv
# by calling the C function getCombinClass() directly.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $
767 
# Perl-visible wrapper around the C function isExclusion(uv), which is not
# defined in this file (presumably it comes from a generated header).
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $
772 
# Perl-visible wrapper around the C function isSingleton(uv), which is not
# defined in this file (presumably it comes from a generated header).
bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $
777 
# Perl-visible wrapper around the C function isNonStDecomp(uv), which is not
# defined in this file (presumably it comes from a generated header).
bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
782 
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2
  INIT:
    /* Wrapper around the C function isComp2nd(uv), which is not defined in
     * this file.  All three names behave identically: the alias index is
     * not consulted, it is only marked as used to silence warnings. */
    PERL_UNUSED_VAR(ix);
792 
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  CODE:
    /* NFD_NO (or NFKD_NO when called as isNFKD_NO, ix == 1): true for Hangul
     * syllables and for every code point that has a canonical (resp.
     * compatibility) decomposition mapping. */
    RETVAL = boolSV(Hangul_IsS(uv) ||
		    (ix ? dec_compat(uv) : dec_canonical(uv)));
  OUTPUT:
    RETVAL
807 
808 
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  PREINIT:
    /* NFC_NO or NFKC_NO: true for composition exclusions, singletons and
     * non-starter decompositions; when called as isNFKC_NO (ix == 1) also
     * true for code points whose compatibility mapping differs from their
     * canonical mapping (or that have only a compatibility mapping). */
    bool is_no;
  CODE:
    is_no = (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv));
    if (!is_no && ix) {
	char *canon  = (char *) dec_canonical(uv);
	char *compat = (char *) dec_compat(uv);

	is_no = (compat != NULL && (canon == NULL || strNE(canon, compat)));
    }
    RETVAL = boolSV(is_no);
  OUTPUT:
    RETVAL
831 
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    /* Returns the composite code point of the pair (uv, uv2) as an
     * unsigned integer, or undef when the pair does not compose. */
    UV composite;
  CODE:
    composite = composite_uv(uv, uv2);
    if (composite)
	RETVAL = newSVuv(composite);
    else
	RETVAL = &PL_sv_undef;
  OUTPUT:
    RETVAL
844 
845 
846 
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Returns the decomposition mapping of uv as a UTF-8 string: the
     * canonical one, or the compatibility one when called as getCompat
     * (ix == 1).  Returns undef when uv has no mapping. */
    if (Hangul_IsS(uv)) {
	/* room for the three jamo a syllable can decompose into */
	U8 tmp[3 * UTF8_MAXLEN + 1];
	U8 *t = tmp;
	U8 *e = pv_cat_decompHangul(aTHX_ t, uv);
	RETVAL = newSVpvn((char *)t, e - t);
    } else {
	U8* rstr = ix ? dec_compat(uv) : dec_canonical(uv);
	if (!rstr)
	    XSRETURN_UNDEF;
	RETVAL = newSVpvn((char *)rstr, strlen((char *)rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
868 
869 
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    /* Splits src in front of its last character with combining class 0 and
     * pushes two mortal UTF-8 strings: the part before that character and
     * the part starting with it.  If no such character exists, the first
     * string is empty and the second is the whole input. */
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    p = e;
    /* walk backwards one character at a time */
    while (s < p) {
	UV uv;
	p = utf8_hop(p, -1);
	if (p < s)
	    croak(ErrHopBeforeStart);
	uv = utf8_to_uvchr_buf(p, e, NULL);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
898 
899