1 
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3 
4 /* private functions which need pTHX_ and aTHX_
5     pv_cat_decompHangul
6     sv_2pvunicode
7     pv_utf8_decompose
8     pv_utf8_reorder
9     pv_utf8_compose
10 */
11 
12 #include "EXTERN.h"
13 #include "perl.h"
14 #include "XSUB.h"
15 
16 /* These 5 files are prepared by mkheader */
17 #include "unfcmb.h"
18 #include "unfcan.h"
19 #include "unfcpt.h"
20 #include "unfcmp.h"
21 #include "unfexc.h"
22 
23 /* The generated normalization tables since v5.20 are in native character set
24  * terms.  Prior to that, they were in Unicode terms.  So we use 'uvchr' for
25  * later perls, and redefine that to be 'uvuni' for earlier ones */
26 #if PERL_VERSION < 20
27 #   undef uvchr_to_utf8
28 #   ifdef uvuni_to_utf8
29 #       define uvchr_to_utf8   uvuni_to_utf8
30 #   else /* Perl 5.6.1 */
31 #       define uvchr_to_utf8   uv_to_utf8
32 #   endif
33 
34 #   undef utf8n_to_uvchr
35 #   ifdef utf8n_to_uvuni
36 #       define utf8n_to_uvchr   utf8n_to_uvuni
37 #   else /* Perl 5.6.1 */
38 #       define utf8n_to_uvchr   utf8_to_uv
39 #   endif
40 #endif
41 
42 /* UTF8_ALLOW_BOM is used before Perl 5.8.0 */
43 #ifndef UTF8_ALLOW_BOM
44 #define UTF8_ALLOW_BOM  (0)
45 #endif /* UTF8_ALLOW_BOM */
46 
47 #ifndef UTF8_ALLOW_SURROGATE
48 #define UTF8_ALLOW_SURROGATE  (0)
49 #endif /* UTF8_ALLOW_SURROGATE */
50 
51 #ifndef UTF8_ALLOW_FE_FF
52 #define UTF8_ALLOW_FE_FF  (0)
53 #endif /* UTF8_ALLOW_FE_FF */
54 
55 #ifndef UTF8_ALLOW_FFFF
56 #define UTF8_ALLOW_FFFF  (0)
57 #endif /* UTF8_ALLOW_FFFF */
58 
59 #ifndef PERL_UNUSED_VAR
60 #  define PERL_UNUSED_VAR(x) ((void)sizeof(x))
61 #endif
62 
63 #define AllowAnyUTF (UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF)
64 
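/* Make sure the output buffer has room for (need) more bytes before calling
 * uvchr_to_utf8(), growing the buffer when it does not. */
/* dstart, d, and dlen must already be defined by the caller.  The macro
 * declares a local named curlen, so use it where a declaration is allowed. */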
67 #define Renew_d_if_not_enough_to(need)	STRLEN curlen = d - dstart;	\
68 		if (dlen < curlen + (need)) {	\
69 		    dlen += (need);		\
70 		    Renew(dstart, dlen+1, U8);	\
71 		    d = dstart + curlen;	\
72 		}
73 
/* croak message for when utf8n_to_uvchr() sets retlen to 0 (broken input?);
 * %s names the routine that detected it */
75 #define ErrRetlenIsZero "panic (Unicode::Normalize %s): zero-length character"
76 
/* croak message for when utf8_hop() steps back past the start of the string;
 * the input may be broken UTF-8 */
78 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
79 
/* At present, code points above 0x10ffff are passed through unchanged,
 * without complaint. */
81 #define VALID_UTF_MAX    (0x10ffff)
82 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
83 
/* Initial size (CC_SEQ_SIZE) and growth step (CC_SEQ_STEP) of the array that
 * buffers combining characters. */
/* Is this enough as an initial value? */
86 #define CC_SEQ_SIZE (10)
87 #define CC_SEQ_STEP  (5)
88 
89 /* HANGUL begin */
90 #define Hangul_SBase  0xAC00
91 #define Hangul_SFinal 0xD7A3
92 #define Hangul_SCount  11172
93 
94 #define Hangul_NCount    588
95 
96 #define Hangul_LBase  0x1100
97 #define Hangul_LFinal 0x1112
98 #define Hangul_LCount     19
99 
100 #define Hangul_VBase  0x1161
101 #define Hangul_VFinal 0x1175
102 #define Hangul_VCount     21
103 
104 #define Hangul_TBase  0x11A7
105 #define Hangul_TFinal 0x11C2
106 #define Hangul_TCount     28
107 
108 #define Hangul_IsS(u)  ((Hangul_SBase <= (u)) && ((u) <= Hangul_SFinal))
109 #define Hangul_IsN(u)  (((u) - Hangul_SBase) % Hangul_TCount == 0)
110 #define Hangul_IsLV(u) (Hangul_IsS(u) && Hangul_IsN(u))
111 #define Hangul_IsL(u)  ((Hangul_LBase <= (u)) && ((u) <= Hangul_LFinal))
112 #define Hangul_IsV(u)  ((Hangul_VBase <= (u)) && ((u) <= Hangul_VFinal))
113 #define Hangul_IsT(u)  ((Hangul_TBase  < (u)) && ((u) <= Hangul_TFinal))
114 /* HANGUL end */
115 
116 /* this is used for canonical ordering of combining characters (c.c.). */
typedef struct {
    U8 cc;	/* combining class, as returned by getCombinClass(uv) */
    UV uv;	/* codepoint */
    STRLEN pos; /* position: index at which the character was buffered;
		   compare_cc() uses it as a tie-breaker so that entries
		   with equal combining class keep their input order */
} UNF_cc;
122 
compare_cc(const void * a,const void * b)123 static int compare_cc(const void *a, const void *b)
124 {
125     int ret_cc;
126     ret_cc = ((UNF_cc*) a)->cc - ((UNF_cc*) b)->cc;
127     if (ret_cc)
128 	return ret_cc;
129 
130     return ( ((UNF_cc*) a)->pos > ((UNF_cc*) b)->pos )
131 	 - ( ((UNF_cc*) a)->pos < ((UNF_cc*) b)->pos );
132 }
133 
dec_canonical(UV uv)134 static U8* dec_canonical(UV uv)
135 {
136     U8 ***plane, **row;
137     if (OVER_UTF_MAX(uv))
138 	return NULL;
139     plane = (U8***)UNF_canon[uv >> 16];
140     if (! plane)
141 	return NULL;
142     row = plane[(uv >> 8) & 0xff];
143     return row ? row[uv & 0xff] : NULL;
144 }
145 
dec_compat(UV uv)146 static U8* dec_compat(UV uv)
147 {
148     U8 ***plane, **row;
149     if (OVER_UTF_MAX(uv))
150 	return NULL;
151     plane = (U8***)UNF_compat[uv >> 16];
152     if (! plane)
153 	return NULL;
154     row = plane[(uv >> 8) & 0xff];
155     return row ? row[uv & 0xff] : NULL;
156 }
157 
/*
 * Return the code point obtained by composing uv with the following
 * character uv2, or 0 when the pair does not compose.
 *
 * Hangul pairs (L + V, and LV + T) are composed arithmetically; everything
 * else is looked up in the UNF_compos table, whose cells are lists of
 * (nextchar, composite) pairs terminated by an entry with nextchar == 0.
 * A zero uv2, or either argument above VALID_UTF_MAX, never composes.
 */
static UV composite_uv(UV uv, UV uv2)
{
    UNF_complist ***plane, **row, *entry;

    if (uv2 == 0 || OVER_UTF_MAX(uv) || OVER_UTF_MAX(uv2))
	return 0;

    /* Hangul: leading consonant + vowel -> LV syllable */
    if (Hangul_IsL(uv) && Hangul_IsV(uv2)) {
	const UV l_off = uv  - Hangul_LBase;
	const UV v_off = uv2 - Hangul_VBase;

	return Hangul_SBase + (l_off * Hangul_VCount + v_off) * Hangul_TCount;
    }

    /* Hangul: LV syllable + trailing consonant -> LVT syllable */
    if (Hangul_IsLV(uv) && Hangul_IsT(uv2))
	return uv + (uv2 - Hangul_TBase);

    /* table driven: walk plane -> row -> cell, bailing out on any gap */
    plane = UNF_compos[uv >> 16];
    row   = plane ? plane[(uv >> 8) & 0xff] : NULL;
    entry = row   ? row[uv & 0xff]          : NULL;
    if (entry == NULL)
	return 0;

    while (entry->nextchar) {
	if (entry->nextchar == uv2)
	    return entry->composite;
	++entry;
    }
    return 0;
}
190 
getCombinClass(UV uv)191 static U8 getCombinClass(UV uv)
192 {
193     U8 **plane, *row;
194     if (OVER_UTF_MAX(uv))
195 	return 0;
196     plane = (U8**)UNF_combin[uv >> 16];
197     if (! plane)
198 	return 0;
199     row = plane[(uv >> 8) & 0xff];
200     return row ? row[uv & 0xff] : 0;
201 }
202 
pv_cat_decompHangul(pTHX_ U8 * d,UV uv)203 static U8* pv_cat_decompHangul(pTHX_ U8* d, UV uv)
204 {
205     UV sindex =  uv - Hangul_SBase;
206     UV lindex =  sindex / Hangul_NCount;
207     UV vindex = (sindex % Hangul_NCount) / Hangul_TCount;
208     UV tindex =  sindex % Hangul_TCount;
209 
210     if (! Hangul_IsS(uv))
211 	return d;
212 
213     d = uvchr_to_utf8(d, (lindex + Hangul_LBase));
214     d = uvchr_to_utf8(d, (vindex + Hangul_VBase));
215     if (tindex)
216 	d = uvchr_to_utf8(d, (tindex + Hangul_TBase));
217     return d;
218 }
219 
/*
 * Return the string value of sv as UTF-8 bytes.
 *
 * If sv is already flagged UTF-8 its own buffer is returned.  Otherwise the
 * bytes are copied into a mortal temporary, which is upgraded to UTF-8, and
 * the temporary's buffer is returned; sv itself is not upgraded.
 *
 * The byte length is stored through lp when lp is non-NULL.
 */
static char* sv_2pvunicode(pTHX_ SV *sv, STRLEN *lp)
{
    STRLEN bytes;
    char *str = SvPV(sv, bytes);

    if (!SvUTF8(sv)) {
	/* work on a mortal copy so the caller's SV stays untouched */
	SV *upgraded = sv_2mortal(newSVpvn(str, bytes));

	if (!SvPOK(upgraded))
	    str = SvPV_force(upgraded, bytes);
	sv_utf8_upgrade(upgraded);
	str = SvPV(upgraded, bytes);
    }

    if (lp != NULL)
	*lp = bytes;
    return str;
}
236 
237 static
pv_utf8_decompose(pTHX_ U8 * s,STRLEN slen,U8 ** dp,STRLEN dlen,bool iscompat)238 U8* pv_utf8_decompose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscompat)
239 {
240     U8* p = s;
241     U8* e = s + slen;
242     U8* dstart = *dp;
243     U8* d = dstart;
244 
245     while (p < e) {
246 	STRLEN retlen;
247 	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
248 	if (!retlen)
249 	    croak(ErrRetlenIsZero, "decompose");
250 	p += retlen;
251 
252 	if (Hangul_IsS(uv)) {
253 	    Renew_d_if_not_enough_to(UTF8_MAXLEN * 3)
254 	    d = pv_cat_decompHangul(aTHX_ d, uv);
255 	}
256 	else {
257 	    U8* r = iscompat ? dec_compat(uv) : dec_canonical(uv);
258 
259 	    if (r) {
260 		STRLEN len = (STRLEN)strlen((char *)r);
261 		Renew_d_if_not_enough_to(len)
262 		while (len--)
263 		    *d++ = *r++;
264 	    }
265 	    else {
266 		Renew_d_if_not_enough_to(UTF8_MAXLEN)
267 		d = uvchr_to_utf8(d, uv);
268 	    }
269 	}
270     }
271     *dp = dstart;
272     return d;
273 }
274 
/*
 * Copy the UTF-8 string s (slen bytes) into the buffer *dp, sorting every
 * run of consecutive characters with a non-zero combining class by that
 * class.  Characters with class 0 are copied through in place and end the
 * current run.
 *
 *   dp    in/out: start of the output buffer; it may be reallocated, and the
 *         (possibly new) start is stored back on return
 *   dlen  current capacity of *dp (callers allocate dlen+1 bytes)
 *
 * Returns a pointer just past the last byte written; the output is not
 * NUL-terminated here.  Croaks if utf8n_to_uvchr() reports a zero-length
 * character.
 */
static
U8* pv_utf8_reorder(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UNF_cc  seq_ary[CC_SEQ_SIZE];
    UNF_cc* seq_ptr = seq_ary; /* use the on-stack array at the beginning */
    UNF_cc* seq_ext = NULL; /* heap array, allocated only if seq_ary overflows */
    STRLEN seq_max = CC_SEQ_SIZE; /* capacity of the array seq_ptr points at */
    STRLEN cc_pos = 0; /* number of combining characters currently buffered */

    while (p < e) {
	U8 curCC;
	STRLEN retlen;
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "reorder");
	p += retlen;

	curCC = getCombinClass(uv);

	if (curCC != 0) {
	    /* combining character: buffer it instead of writing it out */
	    if (seq_max < cc_pos + 1) { /* extend if needed */
		seq_max = cc_pos + CC_SEQ_STEP; /* new size */
		if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
		    /* first overflow: move the buffered entries to the heap */
		    STRLEN i;
		    New(0, seq_ext, seq_max, UNF_cc);
		    for (i = 0; i < cc_pos; i++)
			seq_ext[i] = seq_ary[i];
		}
		else {
		    Renew(seq_ext, seq_max, UNF_cc);
		}
		seq_ptr = seq_ext; /* use seq_ext from now */
	    }

	    seq_ptr[cc_pos].cc  = curCC;
	    seq_ptr[cc_pos].uv  = uv;
	    seq_ptr[cc_pos].pos = cc_pos; /* tie-breaker for compare_cc() */
	    ++cc_pos;

	    /* keep collecting unless this was the last character of input,
	       in which case fall through and flush the buffer */
	    if (p < e)
		continue;
	}

	/* output the buffered combining characters, sorted */
	if (cc_pos) {
	    STRLEN i;

	    if (cc_pos > 1) /* reorder only if there are two or more c.c.'s */
		qsort((void*)seq_ptr, cc_pos, sizeof(UNF_cc), compare_cc);

	    for (i = 0; i < cc_pos; i++) {
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, seq_ptr[i].uv);
	    }
	    cc_pos = 0;
	}

	/* a character with class 0 is written after the run it terminated */
	if (curCC == 0) {
	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
	    d = uvchr_to_utf8(d, uv);
	}
    }
    if (seq_ext)
	Safefree(seq_ext);
    *dp = dstart; /* hand the (possibly reallocated) buffer back */
    return d;
}
347 
/*
 * Compose the UTF-8 string s (slen bytes) into the buffer *dp.
 *
 * The function keeps the current starter in uvS and tries to combine each
 * following character with it via composite_uv().  Characters that cannot be
 * combined are buffered and written out after the starter once the next
 * starter (or the end of input) is reached.
 *
 *   dp        in/out: start of the output buffer; it may be reallocated, and
 *             the (possibly new) start is stored back on return
 *   dlen      current capacity of *dp (callers allocate dlen+1 bytes)
 *   iscontig  when TRUE, a character is treated as blocked as soon as any
 *             uncomposed character is buffered (cc_pos != 0), i.e. only
 *             contiguous combination is attempted
 *
 * Returns a pointer just past the last byte written; the output is not
 * NUL-terminated here.  Croaks if utf8n_to_uvchr() reports a zero-length
 * character.
 */
static
U8* pv_utf8_compose(pTHX_ U8* s, STRLEN slen, U8** dp, STRLEN dlen, bool iscontig)
{
    U8* p = s;
    U8* e = s + slen;
    U8* dstart = *dp;
    U8* d = dstart;

    UV uvS = 0; /* code point of the starter */
    bool valid_uvS = FALSE; /* if FALSE, uvS isn't initialized yet */
    U8 preCC = 0; /* combining class of the last character that failed to compose */

    UV  seq_ary[CC_SEQ_SIZE];
    UV* seq_ptr = seq_ary; /* use the on-stack array at the beginning */
    UV* seq_ext = NULL; /* heap array, allocated only if seq_ary overflows */
    STRLEN seq_max = CC_SEQ_SIZE; /* capacity of the array seq_ptr points at */
    STRLEN cc_pos = 0; /* number of uncomposed characters currently buffered */

    while (p < e) {
	U8 curCC;
	STRLEN retlen;
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "compose");
	p += retlen;

	curCC = getCombinClass(uv);

	if (!valid_uvS) {
	    if (curCC == 0) {
		uvS = uv; /* the first Starter is found */
		valid_uvS = TRUE;
		/* at end of input fall through so the starter is written */
		if (p < e)
		    continue;
	    }
	    else {
		/* combining character before any starter: copy it through */
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, uv);
		continue;
	    }
	}
	else {
	    bool composed;

	    /* blocked */
	    if ((iscontig && cc_pos) || /* discontiguous combination */
		 (curCC != 0 && preCC == curCC) || /* blocked by same CC */
		 (preCC > curCC)) /* blocked by higher CC: revised D2 */
		composed = FALSE;

	    /* not blocked:
		 iscontig && cc_pos == 0      -- contiguous combination
		 curCC == 0 && preCC == 0     -- starter + starter
		 curCC != 0 && preCC < curCC  -- lower CC */
	    else {
		/* try composition */
		UV uvComp = composite_uv(uvS, uv);

		if (uvComp && !isExclusion(uvComp))  {
		    uvS = uvComp;
		    composed = TRUE;

		    /* preCC should not be changed to curCC */
		    /* e.g. 1E14 = 0045 0304 0300 where CC(0304) == CC(0300) */
		    if (p < e)
			continue;
		}
		else
		    composed = FALSE;
	    }

	    if (!composed) {
		preCC = curCC;
		/* buffer uv if it is a combining character, or if it is the
		   last character of input (so that it is flushed below) */
		if (curCC != 0 || !(p < e)) {
		    if (seq_max < cc_pos + 1) { /* extend if needed */
			seq_max = cc_pos + CC_SEQ_STEP; /* new size */
			if (CC_SEQ_SIZE == cc_pos) { /* seq_ary full */
			    New(0, seq_ext, seq_max, UV);
			    Copy(seq_ary, seq_ext, cc_pos, UV);
			}
			else {
			    Renew(seq_ext, seq_max, UV);
			}
			seq_ptr = seq_ext; /* use seq_ext from now */
		    }
		    seq_ptr[cc_pos] = uv;
		    ++cc_pos;
		}
		/* more input may still combine with the current starter */
		if (curCC != 0 && p < e)
		    continue;
	    }
	}

	/* output */
	{
	    Renew_d_if_not_enough_to(UTF8_MAXLEN)
	    d = uvchr_to_utf8(d, uvS); /* starter (composed or not) */
	}

	/* then the buffered characters that did not compose, in order */
	if (cc_pos) {
	    STRLEN i;

	    for (i = 0; i < cc_pos; i++) {
		Renew_d_if_not_enough_to(UTF8_MAXLEN)
		d = uvchr_to_utf8(d, seq_ptr[i]);
	    }
	    cc_pos = 0;
	}

	uvS = uv; /* the current character becomes the starter for what follows */
    }
    if (seq_ext)
	Safefree(seq_ext);
    *dp = dstart; /* hand the (possibly reallocated) buffer back */
    return d;
}
464 
465 MODULE = Unicode::Normalize	PACKAGE = Unicode::Normalize
466 
SV*
decompose(src, compat = &PL_sv_no)
    SV * src
    SV * compat
  PROTOTYPE: $;$
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Decompose src and return the result as a new UTF-8-flagged string.
     * A true compat selects compatibility decomposition; otherwise (the
     * default) canonical decomposition is used.  No reordering is done. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen; /* initial capacity; pv_utf8_decompose() grows d as needed */
    New(0, d, dlen+1, U8);
    dend = pv_utf8_decompose(aTHX_ s, slen, &d, dlen, (bool)SvTRUE(compat));
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
488 
489 
SV*
reorder(src)
    SV * src
  PROTOTYPE: $
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return a new UTF-8-flagged copy of src in which each run of
     * combining characters has been sorted by combining class
     * (see pv_utf8_reorder()). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen; /* initial capacity; pv_utf8_reorder() grows d as needed */
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ s, slen, &d, dlen);
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
510 
511 
SV*
compose(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    composeContiguous = 1
  PREINIT:
    SV* dst;
    U8 *s, *d, *dend;
    STRLEN slen, dlen;
  CODE:
    /* Return a new UTF-8-flagged string holding the composition of src.
     * ix is 0 for compose() and 1 for composeContiguous(); it is passed as
     * the iscontig flag of pv_utf8_compose(). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);
    dst = newSVpvn("", 0);
    dlen = slen; /* initial capacity; pv_utf8_compose() grows d as needed */
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ s, slen, &d, dlen, (bool)ix);
    sv_setpvn(dst, (char *)d, dend - d);
    SvUTF8_on(dst);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
534 
535 
SV*
NFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKD = 1
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *d, *dend;
    STRLEN slen, tlen, dlen;
  CODE:
    /* Decompose src and then reorder the result, returning a new
     * UTF-8-flagged string.  ix is 0 for NFD (canonical decomposition) and
     * 1 for NFKD (compatibility decomposition). */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder: t is the input, d the output */
    dlen = tlen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_reorder(aTHX_ t, tlen, &d, dlen);
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)d, dlen);
    SvUTF8_on(dst);

    /* dst holds its own copy, so both work buffers can be released */
    Safefree(t);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
573 
574 
SV*
NFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    NFKC = 1
    FCC  = 2
  PREINIT:
    SV *dst;
    U8 *s, *t, *tend, *u, *uend, *d, *dend;
    STRLEN slen, tlen, ulen, dlen;
  CODE:
    /* Decompose, reorder, then compose src, returning a new UTF-8-flagged
     * string.  The alias index selects the variant:
     *   ix == 0 (NFC)   canonical decomposition, ordinary composition
     *   ix == 1 (NFKC)  compatibility decomposition, ordinary composition
     *   ix == 2 (FCC)   canonical decomposition, contiguous composition */
    s = (U8*)sv_2pvunicode(aTHX_ src,&slen);

    /* decompose */
    tlen = slen;
    New(0, t, tlen+1, U8);
    tend = pv_utf8_decompose(aTHX_ s, slen, &t, tlen, (bool)(ix==1));
    *tend = '\0';
    tlen = tend - t; /* no longer know real size of t */

    /* reorder: t is the input, u the output */
    ulen = tlen;
    New(0, u, ulen+1, U8);
    uend = pv_utf8_reorder(aTHX_ t, tlen, &u, ulen);
    *uend = '\0';
    ulen = uend - u; /* no longer know real size of u */

    /* compose: u is the input, d the output */
    dlen = ulen;
    New(0, d, dlen+1, U8);
    dend = pv_utf8_compose(aTHX_ u, ulen, &d, dlen, (bool)(ix==2));
    *dend = '\0';
    dlen = dend - d; /* no longer know real size of d */

    /* return */
    dst = newSVpvn("", 0);
    sv_setpvn(dst, (char *)d, dlen);
    SvUTF8_on(dst);

    /* dst holds its own copy, so all work buffers can be released */
    Safefree(t);
    Safefree(u);
    Safefree(d);
    RETVAL = dst;
  OUTPUT:
    RETVAL
621 
622 
SV*
checkNFD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKD = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
  CODE:
    /* Return true if src is already decomposed and ordered, false otherwise.
     * ix is 0 for checkNFD (canonical table) and 1 for checkNFKD
     * (compatibility table).  Scanning stops at the first violation. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFD or -NFKD");

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}
	/* any character that still has a decomposition fails the check */
	if (Hangul_IsS(uv) || (ix ? dec_compat(uv) : dec_canonical(uv))) {
	    result = FALSE;
	    break;
	}
	preCC = curCC;
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
657 
658 
SV*
checkNFC(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkNFKC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Quick check of src against the composed form.  Returns false when a
     * character or ordering rules it out (NO), undef when no NO was found
     * but some character is flagged by isComp2nd() (MAYBE), and true
     * otherwise.  ix is 0 for checkNFC and 1 for checkNFKC. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;

    preCC = 0;
    for (p = s; p < e; p += retlen) {
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkNFC or -NFKC");

	curCC = getCombinClass(uv);
	if (preCC > curCC && curCC != 0) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}

	/* get NFC/NFKC property */
	if (Hangul_IsS(uv)) /* Hangul syllables are canonical composites */
	    ; /* YES */
	else if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
	    result = FALSE;
	    break;
	}
	else if (isComp2nd(uv))
	    isMAYBE = TRUE;
	else if (ix) {
	    char *canon, *compat;
	  /* NFKC_NO when having compatibility mapping. */
	  /* i.e. a compatibility mapping that differs from the canonical one */
	    canon  = (char *) dec_canonical(uv);
	    compat = (char *) dec_compat(uv);
	    if (compat && !(canon && strEQ(canon, compat))) {
		result = FALSE;
		break;
	    }
	} /* end of get NFC/NFKC property */

	preCC = curCC;
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
	XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
713 
714 
SV*
checkFCD(src)
    SV * src
  PROTOTYPE: $
  ALIAS:
    checkFCC = 1
  PREINIT:
    STRLEN srclen, retlen;
    U8 *s, *e, *p, curCC, preCC;
    bool result = TRUE;
    bool isMAYBE = FALSE;
  CODE:
    /* Check that the combining classes stay in order across character
     * boundaries when each character is viewed through its canonical
     * decomposition: the class of the first character of a decomposition is
     * compared with the class of the last character of the previous one.
     * With ix == 1 (checkFCC) the per-character composition checks are
     * applied as well, which can yield undef (MAYBE).  Returns false on a
     * violation, otherwise true. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    preCC = 0;
    for (p = s; p < e; p += retlen) {
	U8 *sCan;
	UV uvLead;
	STRLEN canlen = 0;
	UV uv = utf8n_to_uvchr(p, e - p, &retlen, AllowAnyUTF);
	if (!retlen)
	    croak(ErrRetlenIsZero, "checkFCD or -FCC");

	sCan = (U8*) dec_canonical(uv);

	/* uvLead: first character of the decomposition, or uv itself */
	if (sCan) {
	    STRLEN canret;
	    canlen = (STRLEN)strlen((char *) sCan);
	    uvLead = utf8n_to_uvchr(sCan, canlen, &canret, AllowAnyUTF);
	    if (!canret)
		croak(ErrRetlenIsZero, "checkFCD or -FCC");
	}
	else {
	    uvLead = uv;
	}

	curCC = getCombinClass(uvLead);

	if (curCC != 0 && curCC < preCC) { /* canonical ordering violated */
	    result = FALSE;
	    break;
	}

	/* checkFCC only */
	if (ix) {
	    if (isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv)) {
		result = FALSE;
		break;
	    }
	    else if (isComp2nd(uv))
		isMAYBE = TRUE;
	}

	/* preCC for the next round: class of the last character of the
	   decomposition, or of uv itself when there is none */
	if (sCan) {
	    STRLEN canret;
	    UV uvTrail;
	    U8* eCan = sCan + canlen;
	    U8* pCan = utf8_hop(eCan, -1);
	    if (pCan < sCan)
		croak(ErrHopBeforeStart);
	    uvTrail = utf8n_to_uvchr(pCan, eCan - pCan, &canret, AllowAnyUTF);
	    if (!canret)
		croak(ErrRetlenIsZero, "checkFCD or -FCC");
	    preCC = getCombinClass(uvTrail);
	}
	else {
	    preCC = curCC;
	}
    }
    if (isMAYBE && result) /* NO precedes MAYBE */
	XSRETURN_UNDEF;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
788 
789 
# Perl-callable wrapper around the C function getCombinClass() defined
# earlier in this file: returns the combining class of code point uv.
U8
getCombinClass(uv)
    UV uv
  PROTOTYPE: $

# The three predicates below have no CODE section, so the generated glue
# calls the C function or macro of the same name.  Those are not defined in
# this file (presumably in the mkheader-generated headers; TODO confirm).
bool
isExclusion(uv)
    UV uv
  PROTOTYPE: $

bool
isSingleton(uv)
    UV uv
  PROTOTYPE: $

bool
isNonStDecomp(uv)
    UV uv
  PROTOTYPE: $
809 
bool
isComp2nd(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_MAYBE  = 1
    isNFKC_MAYBE = 2
  INIT:
    /* All three names run the same isComp2nd(uv) test; the alias index ix
     * is deliberately unused, and this silences the compiler warning. */
    PERL_UNUSED_VAR(ix);
819 
SV*
isNFD_NO(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFKD_NO = 1
  PREINIT:
    bool result = TRUE; /* NFD_NO or NFKD_NO unless shown otherwise below */
  CODE:
    /* A code point is "NO" when it is a Hangul syllable or has an entry in
     * the decomposition table selected by ix (0: canonical for isNFD_NO,
     * 1: compatibility for isNFKD_NO). */
    if (!Hangul_IsS(uv) && !(ix ? dec_compat(uv) : dec_canonical(uv)))
	result = FALSE;
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
834 
835 
SV*
isComp_Ex(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    isNFC_NO  = 0
    isNFKC_NO = 1
  PREINIT:
    bool result;
  CODE:
    /* True when uv is flagged by isExclusion(), isSingleton() or
     * isNonStDecomp().  For isNFKC_NO (ix == 1) it is also true when uv has
     * a compatibility mapping that differs from its canonical mapping
     * (or has no canonical mapping at all). */
    result = (bool)(isExclusion(uv) || isSingleton(uv) || isNonStDecomp(uv));
    if (!result && ix) {
	const char *compat = (const char *) dec_compat(uv);

	if (compat) {
	    const char *canon = (const char *) dec_canonical(uv);

	    result = (bool)(!canon || strNE(canon, compat)); /* NFKC_NO */
	}
    }
    RETVAL = boolSV(result);
  OUTPUT:
    RETVAL
858 
SV*
getComposite(uv, uv2)
    UV uv
    UV uv2
  PROTOTYPE: $$
  PREINIT:
    UV composite;
  CODE:
    /* Return the code point that uv followed by uv2 composes to, or undef
     * when composite_uv() reports no composition (0). */
    composite = composite_uv(uv, uv2);
    if (!composite)
	XSRETURN_UNDEF;
    RETVAL = newSVuv(composite);
  OUTPUT:
    RETVAL
871 
872 
873 
SV*
getCanon(uv)
    UV uv
  PROTOTYPE: $
  ALIAS:
    getCompat = 1
  CODE:
    /* Return the decomposition of uv as a UTF-8-flagged string, or undef
     * when it has none.  ix is 0 for getCanon (canonical table) and 1 for
     * getCompat (compatibility table); Hangul syllables are decomposed
     * arithmetically in both cases. */
    if (Hangul_IsS(uv)) {
	U8 tmp[3 * UTF8_MAXLEN + 1]; /* room for up to three characters */
	U8 *e = pv_cat_decompHangul(aTHX_ tmp, uv);

	RETVAL = newSVpvn((char *)tmp, e - tmp);
    } else {
	const char *rstr = (const char *)(ix ? dec_compat(uv) : dec_canonical(uv));

	if (rstr == NULL)
	    XSRETURN_UNDEF;
	RETVAL = newSVpvn(rstr, strlen(rstr));
    }
    SvUTF8_on(RETVAL);
  OUTPUT:
    RETVAL
895 
896 
void
splitOnLastStarter(src)
    SV * src
  PREINIT:
    SV *svp;
    STRLEN srclen;
    U8 *s, *e, *p;
  PPCODE:
    /* Split src in two at the last character whose combining class is 0
     * and push both parts, as UTF-8-flagged mortal strings, onto the stack:
     * first everything before that character, then the rest starting with
     * it.  If no such character exists the first part is empty. */
    s = (U8*)sv_2pvunicode(aTHX_ src,&srclen);
    e = s + srclen;
    p = e;
    /* walk backwards one character at a time from the end */
    while (s < p) {
	UV uv;
	p = utf8_hop(p, -1);
	if (p < s)
	    croak(ErrHopBeforeStart);
	uv = utf8n_to_uvchr(p, e - p, NULL, AllowAnyUTF);
	if (getCombinClass(uv) == 0) /* Last Starter found */
	    break;
    }

    /* part before the last starter */
    svp = sv_2mortal(newSVpvn((char*)s, p - s));
    SvUTF8_on(svp);
    XPUSHs(svp);

    /* the last starter and everything after it */
    svp = sv_2mortal(newSVpvn((char*)p, e - p));
    SvUTF8_on(svp);
    XPUSHs(svp);
925 
926