1 
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3 
4 /* I guese no private function needs pTHX_ and aTHX_ */
5 
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 
10 /* This file is prepared by mkheader */
11 #include "ucatbl.h"
12 
13 /* At present, char > 0x10ffff are unaffected without complaint, right? */
14 #define VALID_UTF_MAX    (0x10ffff)
15 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
16 
17 #define MAX_DIV_16 (UV_MAX / 16)
18 
19 /* Supported Levels */
20 #define MinLevel	(1)
21 #define MaxLevel	(4)
22 
23 /* Shifted weight at 4th level */
24 #define Shift4Wt	(0xFFFF)
25 
26 #define VCE_Length	(9)
27 
28 #define Hangul_SBase  (0xAC00)
29 #define Hangul_SIni   (0xAC00)
30 #define Hangul_SFin   (0xD7A3)
31 #define Hangul_NCount (588)
32 #define Hangul_TCount (28)
33 #define Hangul_LBase  (0x1100)
34 #define Hangul_LIni   (0x1100)
35 #define Hangul_LFin   (0x1159)
36 #define Hangul_LFill  (0x115F)
37 #define Hangul_LEnd   (0x115F) /* Unicode 5.2 */
38 #define Hangul_VBase  (0x1161)
39 #define Hangul_VIni   (0x1160) /* from Vowel Filler */
40 #define Hangul_VFin   (0x11A2)
41 #define Hangul_VEnd   (0x11A7) /* Unicode 5.2 */
42 #define Hangul_TBase  (0x11A7) /* from "no-final" codepoint */
43 #define Hangul_TIni   (0x11A8)
44 #define Hangul_TFin   (0x11F9)
45 #define Hangul_TEnd   (0x11FF) /* Unicode 5.2 */
46 #define HangulL2Ini   (0xA960) /* Unicode 5.2 */
47 #define HangulL2Fin   (0xA97C) /* Unicode 5.2 */
48 #define HangulV2Ini   (0xD7B0) /* Unicode 5.2 */
49 #define HangulV2Fin   (0xD7C6) /* Unicode 5.2 */
50 #define HangulT2Ini   (0xD7CB) /* Unicode 5.2 */
51 #define HangulT2Fin   (0xD7FB) /* Unicode 5.2 */
52 
53 #define CJK_UidIni    (0x4E00)
54 #define CJK_UidFin    (0x9FA5)
55 #define CJK_UidF41    (0x9FBB) /* Unicode 4.1 */
56 #define CJK_UidF51    (0x9FC3) /* Unicode 5.1 */
57 #define CJK_UidF52    (0x9FCB) /* Unicode 5.2 */
58 #define CJK_UidF61    (0x9FCC) /* Unicode 6.1 */
59 #define CJK_UidF80    (0x9FD5) /* Unicode 8.0 */
60 #define CJK_UidF100   (0x9FEA) /* Unicode 10.0 */
61 
62 #define CJK_ExtAIni   (0x3400) /* Unicode 3.0 */
63 #define CJK_ExtAFin   (0x4DB5) /* Unicode 3.0 */
64 #define CJK_ExtBIni  (0x20000) /* Unicode 3.1 */
65 #define CJK_ExtBFin  (0x2A6D6) /* Unicode 3.1 */
66 #define CJK_ExtCIni  (0x2A700) /* Unicode 5.2 */
67 #define CJK_ExtCFin  (0x2B734) /* Unicode 5.2 */
68 #define CJK_ExtDIni  (0x2B740) /* Unicode 6.0 */
69 #define CJK_ExtDFin  (0x2B81D) /* Unicode 6.0 */
70 #define CJK_ExtEIni  (0x2B820) /* Unicode 8.0 */
71 #define CJK_ExtEFin  (0x2CEA1) /* Unicode 8.0 */
72 #define CJK_ExtFIni  (0x2CEB0) /* Unicode 10.0 */
73 #define CJK_ExtFFin  (0x2EBE0) /* Unicode 10.0 */
74 
75 #define CJK_CompIni  (0xFA0E)
76 #define CJK_CompFin  (0xFA29)
77 static const STDCHAR UnifiedCompat[] = {
78       1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
79 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
80 
81 #define TangIdeoIni  (0x17000) /* Unicode 9.0 */
82 #define TangIdeoFin  (0x187EC) /* Unicode 9.0 */
83 #define TangCompIni  (0x18800) /* Unicode 9.0 */
84 #define TangCompFin  (0x18AF2) /* Unicode 9.0 */
85 #define NushuIni     (0x1B170) /* Unicode 10.0 */
86 #define NushuFin     (0x1B2FB) /* Unicode 10.0 */
87 
88 #define codeRange(bcode, ecode)	((bcode) <= code && code <= (ecode))
89 
90 MODULE = Unicode::Collate	PACKAGE = Unicode::Collate
91 
92 PROTOTYPES: DISABLE
93 
94 void
95 _fetch_rest ()
96   PREINIT:
97     char ** rest;
98   PPCODE:
99     for (rest = (char **)UCA_rest; *rest; ++rest) {
100 	XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
101     }
102 
103 
104 void
105 _fetch_simple (uv)
106     UV uv
107   PREINIT:
108     U8 ***plane, **row;
109     U8* result = NULL;
110   PPCODE:
111     if (!OVER_UTF_MAX(uv)){
112 	plane = (U8***)UCA_simple[uv >> 16];
113 	if (plane) {
114 	    row = plane[(uv >> 8) & 0xff];
115 	    result = row ? row[uv & 0xff] : NULL;
116 	}
117     }
118     if (result) {
119 	int i;
120 	int num = (int)*result;
121 	++result;
122 	EXTEND(SP, num);
123 	for (i = 0; i < num; ++i) {
124 	    PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
125 	    result += VCE_Length;
126 	}
127     } else {
128 	PUSHs(sv_2mortal(newSViv(0)));
129     }
130 
131 SV*
132 _ignorable_simple (uv)
133     UV uv
134   ALIAS:
135     _exists_simple = 1
136   PREINIT:
137     U8 ***plane, **row;
138     int num = -1;
139     U8* result = NULL;
140   CODE:
141     if (!OVER_UTF_MAX(uv)){
142 	plane = (U8***)UCA_simple[uv >> 16];
143 	if (plane) {
144 	    row = plane[(uv >> 8) & 0xff];
145 	    result = row ? row[uv & 0xff] : NULL;
146 	}
147 	if (result)
148 	    num = (int)*result; /* assuming 0 <= num < 128 */
149     }
150 
151     if (ix)
152 	RETVAL = boolSV(num >0);
153     else
154 	RETVAL = boolSV(num==0);
155   OUTPUT:
156     RETVAL
157 
158 
159 void
160 _getHexArray (src)
161     SV* src
162   PREINIT:
163     char *s, *e;
164     STRLEN byte;
165     UV value;
166     bool overflowed = FALSE;
167     const char *hexdigit;
168   PPCODE:
169     s = SvPV(src,byte);
170     for (e = s + byte; s < e;) {
171 	hexdigit = strchr((char *) PL_hexdigit, *s++);
172 	if (! hexdigit)
173 	    continue;
174 	value = (hexdigit - PL_hexdigit) & 0xF;
175 	while (*s) {
176 	    hexdigit = strchr((char *) PL_hexdigit, *s++);
177 	    if (! hexdigit)
178 		break;
179 	    if (overflowed)
180 		continue;
181 	    if (value > MAX_DIV_16) {
182 		overflowed = TRUE;
183 		continue;
184 	    }
185 	    value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
186 	}
187 	XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
188     }
189 
190 
191 SV*
192 _isIllegal (sv)
193     SV* sv
194   PREINIT:
195     UV uv;
196   CODE:
197     if (!sv || !SvIOK(sv))
198 	XSRETURN_YES;
199     uv = SvUVX(sv);
200     RETVAL = boolSV(
201 	   0x10FFFF < uv                   /* out of range */
202 	|| ((uv & 0xFFFE) == 0xFFFE)       /* ??FFF[EF] */
203 	|| (0xD800 <= uv && uv <= 0xDFFF)  /* unpaired surrogates */
204 	|| (0xFDD0 <= uv && uv <= 0xFDEF)  /* other non-characters */
205     );
206 OUTPUT:
207     RETVAL
208 
209 
210 void
211 _decompHangul (code)
212     UV code
213   PREINIT:
214     UV sindex, lindex, vindex, tindex;
215   PPCODE:
216     /* code *must* be in Hangul syllable.
217      * Check it before you enter here. */
218     sindex =  code - Hangul_SBase;
219     lindex =  sindex / Hangul_NCount;
220     vindex = (sindex % Hangul_NCount) / Hangul_TCount;
221     tindex =  sindex % Hangul_TCount;
222 
223     EXTEND(SP, tindex ? 3 : 2);
224     PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
225     PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
226     if (tindex)
227 	PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
228 
229 
230 SV*
231 getHST (code, uca_vers = 0)
232     UV code;
233     IV uca_vers;
234   PREINIT:
235     const char * hangtype;
236     STRLEN typelen;
237   CODE:
238     if (codeRange(Hangul_SIni, Hangul_SFin)) {
239 	if ((code - Hangul_SBase) % Hangul_TCount) {
240 	    hangtype = "LVT"; typelen = 3;
241 	} else {
242 	    hangtype = "LV"; typelen = 2;
243 	}
244     } else if (uca_vers < 20) {
245 	if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
246 	    hangtype = "L"; typelen = 1;
247 	} else if (codeRange(Hangul_VIni, Hangul_VFin)) {
248 	    hangtype = "V"; typelen = 1;
249 	} else if (codeRange(Hangul_TIni, Hangul_TFin)) {
250 	    hangtype = "T"; typelen = 1;
251 	} else {
252 	    hangtype = ""; typelen = 0;
253 	}
254     } else {
255 	if        (codeRange(Hangul_LIni, Hangul_LEnd) ||
256 		   codeRange(HangulL2Ini, HangulL2Fin)) {
257 	    hangtype = "L"; typelen = 1;
258 	} else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
259 		   codeRange(HangulV2Ini, HangulV2Fin)) {
260 	    hangtype = "V"; typelen = 1;
261 	} else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
262 		   codeRange(HangulT2Ini, HangulT2Fin)) {
263 	    hangtype = "T"; typelen = 1;
264 	} else {
265 	    hangtype = ""; typelen = 0;
266 	}
267     }
268 
269     RETVAL = newSVpvn(hangtype, typelen);
270 OUTPUT:
271     RETVAL
272 
273 
274 void
275 _derivCE_9 (code)
276     UV code
277   ALIAS:
278     _derivCE_14 = 1
279     _derivCE_18 = 2
280     _derivCE_20 = 3
281     _derivCE_22 = 4
282     _derivCE_24 = 5
283     _derivCE_32 = 6
284     _derivCE_34 = 7
285     _derivCE_36 = 8
286   PREINIT:
287     UV base, aaaa, bbbb;
288     U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
289     U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
290     bool basic_unified = 0, tangut = 0, nushu = 0;
291   PPCODE:
292     if (codeRange(CJK_UidIni, CJK_CompFin)) {
293 	if (codeRange(CJK_CompIni, CJK_CompFin))
294 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
295 	else
296 	    basic_unified = (ix >= 8 ? (code <= CJK_UidF100) :
297 			     ix >= 6 ? (code <= CJK_UidF80) :
298 			     ix == 5 ? (code <= CJK_UidF61) :
299 			     ix >= 3 ? (code <= CJK_UidF52) :
300 			     ix == 2 ? (code <= CJK_UidF51) :
301 			     ix == 1 ? (code <= CJK_UidF41) :
302 				       (code <= CJK_UidFin));
303     } else {
304 	if (ix >= 7)
305 	    tangut = (codeRange(TangIdeoIni, TangIdeoFin) ||
306 		      codeRange(TangCompIni, TangCompFin));
307 	if (ix >= 8)
308 	    nushu = (codeRange(NushuIni, NushuFin));
309     }
310     base = tangut
311 	    ? 0xFB00 :
312 	   nushu
313 	    ? 0xFB01 :
314 	   basic_unified
315 	    ? 0xFB40 : /* CJK */
316 	   ((codeRange(CJK_ExtAIni, CJK_ExtAFin))
317 		||
318 	    (codeRange(CJK_ExtBIni, CJK_ExtBFin))
319 		||
320 	    (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
321 		||
322 	    (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
323 		||
324 	    (ix >= 6 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
325 		||
326 	    (ix >= 8 && codeRange(CJK_ExtFIni, CJK_ExtFFin)))
327 	    ? 0xFB80   /* CJK ext. */
328 	    : 0xFBC0;  /* others */
329     aaaa = tangut || nushu ? base : base + (code >> 15);
330     bbbb = (tangut ? (code - TangIdeoIni) :
331 	    nushu  ? (code - NushuIni) : (code & 0x7FFF)) | 0x8000;
332     a[1] = (U8)(aaaa >> 8);
333     a[2] = (U8)(aaaa & 0xFF);
334     b[1] = (U8)(bbbb >> 8);
335     b[2] = (U8)(bbbb & 0xFF);
336     a[4] = (U8)(0x20); /* second octet of level 2 */
337     a[6] = (U8)(0x02); /* second octet of level 3 */
338     a[7] = b[7] = (U8)(code >> 8);
339     a[8] = b[8] = (U8)(code & 0xFF);
340     EXTEND(SP, 2);
341     PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
342     PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
343 
344 
345 void
346 _derivCE_8 (code)
347     UV code
348   PREINIT:
349     UV aaaa, bbbb;
350     U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
351     U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
352   PPCODE:
353     aaaa =  0xFF80 + (code >> 15);
354     bbbb = (code & 0x7FFF) | 0x8000;
355     a[1] = (U8)(aaaa >> 8);
356     a[2] = (U8)(aaaa & 0xFF);
357     b[1] = (U8)(bbbb >> 8);
358     b[2] = (U8)(bbbb & 0xFF);
359     a[4] = (U8)(0x02); /* second octet of level 2 */
360     a[6] = (U8)(0x01); /* second octet of level 3 */
361     a[7] = b[7] = (U8)(code >> 8);
362     a[8] = b[8] = (U8)(code & 0xFF);
363     EXTEND(SP, 2);
364     PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
365     PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
366 
367 
368 void
369 _uideoCE_8 (code)
370     UV code
371   PREINIT:
372     U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
373   PPCODE:
374     uice[1] = uice[7] = (U8)(code >> 8);
375     uice[2] = uice[8] = (U8)(code & 0xFF);
376     uice[4] = (U8)(0x20); /* second octet of level 2 */
377     uice[6] = (U8)(0x02); /* second octet of level 3 */
378     PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
379 
380 
381 SV*
382 _isUIdeo (code, uca_vers)
383     UV code;
384     IV uca_vers;
385     bool basic_unified = 0;
386   CODE:
387     /* uca_vers = 0 for _uideoCE_8() */
388     if (CJK_UidIni <= code) {
389 	if (codeRange(CJK_CompIni, CJK_CompFin))
390 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
391 	else
392 	    basic_unified = (uca_vers >= 36 ? (code <= CJK_UidF100) :
393 			     uca_vers >= 32 ? (code <= CJK_UidF80) :
394 			     uca_vers >= 24 ? (code <= CJK_UidF61) :
395 			     uca_vers >= 20 ? (code <= CJK_UidF52) :
396 			     uca_vers >= 18 ? (code <= CJK_UidF51) :
397 			     uca_vers >= 14 ? (code <= CJK_UidF41) :
398 					      (code <= CJK_UidFin));
399     }
400     RETVAL = boolSV(
401 	(basic_unified)
402 		||
403 	(codeRange(CJK_ExtAIni, CJK_ExtAFin))
404 		||
405 	(uca_vers >=  8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
406 		||
407 	(uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
408 		||
409 	(uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
410 		||
411 	(uca_vers >= 32 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
412 		||
413 	(uca_vers >= 36 && codeRange(CJK_ExtFIni, CJK_ExtFFin))
414     );
415 OUTPUT:
416     RETVAL
417 
418 
419 SV*
420 mk_SortKey (self, buf)
421     SV* self;
422     SV* buf;
423   PREINIT:
424     SV *dst, **svp;
425     STRLEN dlen, vlen;
426     U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
427     AV *bufAV;
428     HV *selfHV;
429     UV back_flag;
430     I32 i, buf_len;
431     IV  lv, level, uca_vers;
432     bool upper_lower, kata_hira, v2i, last_is_var;
433   CODE:
434     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
435 	selfHV = (HV*)SvRV(self);
436     else
437 	croak("$self is not a HASHREF.");
438 
439     if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
440 	bufAV = (AV*)SvRV(buf);
441     else
442 	croak("XSUB, not an ARRAYREF.");
443 
444     buf_len = av_len(bufAV);
445 
446     if (buf_len < 0) { /* empty: -1 */
447 	dlen = 2 * (MaxLevel - 1);
448 	dst = newSV(dlen);
449 	(void)SvPOK_only(dst);
450 	d = (U8*)SvPVX(dst);
451 	while (dlen--)
452 	    *d++ = '\0';
453     } else {
454 	svp = hv_fetch(selfHV, "level", 5, FALSE);
455 	level = svp ? SvIV(*svp) : MaxLevel;
456 
457 	for (lv = 0; lv < level; lv++) {
458 	    New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
459 	    s[lv] = eachlevel[lv];
460 	}
461 
462 	svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
463 	upper_lower = svp ? SvTRUE(*svp) : FALSE;
464 	svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
465 	kata_hira = svp ? SvTRUE(*svp) : FALSE;
466 	svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
467 	uca_vers = SvIV(*svp);
468 	svp = hv_fetch(selfHV, "variable", 8, FALSE);
469 	v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
470 	    ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
471 	    : FALSE;
472 
473 	last_is_var = FALSE;
474 	for (i = 0; i <= buf_len; i++) {
475 	    svp = av_fetch(bufAV, i, FALSE);
476 
477 	    if (svp && SvPOK(*svp))
478 		v = (U8*)SvPV(*svp, vlen);
479 	    else
480 		croak("not a vwt.");
481 
482 	    if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
483 		continue;
484 
485 	    /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
486 	    if (v2i) {
487 		if (*v)
488 		    last_is_var = TRUE;
489 		else if (v[1] || v[2]) /* non zero primary weight */
490 		    last_is_var = FALSE;
491 		else if (last_is_var) /* zero primary weight; skipped */
492 		    continue;
493 	    }
494 
495 	    if (v[5] == 0) { /* tert wt < 256 */
496 		if (upper_lower) {
497 		    if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
498 			v[6] -= 6;
499 		    else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
500 			v[6] += 6;
501 		    else if (v[6] == 0x1C) /* square upper */
502 			v[6]++;
503 		    else if (v[6] == 0x1D) /* square lower */
504 			v[6]--;
505 		}
506 		if (kata_hira) {
507 		    if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
508 			v[6] -= 2;
509 		    else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
510 			v[6] += 5;
511 		}
512 	    }
513 
514 	    for (lv = 0; lv < level; lv++) {
515 		if (v[2 * lv + 1] || v[2 * lv + 2]) {
516 		    *s[lv]++ = v[2 * lv + 1];
517 		    *s[lv]++ = v[2 * lv + 2];
518 		}
519 	    }
520 	}
521 
522 	dlen = 2 * (MaxLevel - 1);
523 	for (lv = 0; lv < level; lv++)
524 	    dlen += s[lv] - eachlevel[lv];
525 
526 	dst = newSV(dlen);
527 	(void)SvPOK_only(dst);
528 	d = (U8*)SvPVX(dst);
529 
530 	svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
531 	back_flag = svp ? SvUV(*svp) : (UV)0;
532 
533 	for (lv = 0; lv < level; lv++) {
534 	    if (back_flag & (1 << (lv + 1))) {
535 		p = s[lv];
536 		e = eachlevel[lv];
537 		for ( ; e < p; p -= 2) {
538 		    *d++ = p[-2];
539 		    *d++ = p[-1];
540 		}
541 	    }
542 	    else {
543 		p = eachlevel[lv];
544 		e = s[lv];
545 		while (p < e)
546 		    *d++ = *p++;
547 	    }
548 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
549 		*d++ = '\0';
550 		*d++ = '\0';
551 	    }
552 	}
553 
554 	for (lv = level; lv < MaxLevel; lv++) {
555 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
556 		*d++ = '\0';
557 		*d++ = '\0';
558 	    }
559 	}
560 
561 	for (lv = 0; lv < level; lv++) {
562 	    Safefree(eachlevel[lv]);
563 	}
564     }
565     *d = '\0';
566     SvCUR_set(dst, d - (U8*)SvPVX(dst));
567     RETVAL = dst;
568 OUTPUT:
569     RETVAL
570 
571 
572 SV*
573 varCE (self, vce)
574     SV* self;
575     SV* vce;
576   PREINIT:
577     SV *dst, *vbl, **svp;
578     HV *selfHV;
579     U8 *a, *v, *d;
580     STRLEN alen, vlen;
581     bool ig_l2;
582     IV uca_vers;
583     UV totwt;
584   CODE:
585     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
586 	selfHV = (HV*)SvRV(self);
587     else
588 	croak("$self is not a HASHREF.");
589 
590     svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
591     ig_l2 = svp ? SvTRUE(*svp) : FALSE;
592 
593     svp = hv_fetch(selfHV, "variable", 8, FALSE);
594     vbl = svp ? *svp : &PL_sv_no;
595     a = (U8*)SvPV(vbl, alen);
596     v = (U8*)SvPV(vce, vlen);
597 
598     dst = newSV(vlen);
599     d = (U8*)SvPVX(dst);
600     (void)SvPOK_only(dst);
601     Copy(v, d, vlen, U8);
602     SvCUR_set(dst, vlen);
603     d[vlen] = '\0';
604 
605     /* primary weight == 0 && secondary weight != 0 */
606     if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
607 	d[3] = d[4] = d[5] = d[6] = '\0';
608     }
609 
610     /* variable: checked only the first char and the length,
611        trusting checkCollator() and %VariableOK in Perl ... */
612 
613     if (vlen >= VCE_Length && *a != 'n') {
614 	if (*v) {
615 	    if (*a == 's') { /* shifted or shift-trimmed */
616 		d[7] = d[1]; /* wt level 1 to 4 */
617 		d[8] = d[2];
618 	    } /* else blanked */
619 	    d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
620 	} else if (*a == 's') { /* shifted or shift-trimmed */
621 	    totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
622 	    if (alen == 7 && totwt != 0) { /* shifted */
623 		if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
624 		    d[7] = d[1]; /* wt level 1 to 4 */
625 		    d[8] = d[2];
626 		} else {
627 		    svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
628 		    if (!svp)
629 			croak("Panic: no $self->{UCA_Version} in varCE");
630 		    uca_vers = SvIV(*svp);
631 
632 		    /* completely ignorable or the second derived CE */
633 		    if (uca_vers >= 36 && d[3] + d[4] + d[5] + d[6] == 0) {
634 			d[7] = d[8] = '\0';
635 		    } else {
636 			d[7] = (U8)(Shift4Wt >> 8);
637 			d[8] = (U8)(Shift4Wt & 0xFF);
638 		    }
639 		}
640 	    } else { /* shift-trimmed or completely ignorable */
641 		d[7] = d[8] = '\0';
642 	    }
643 	} /* else blanked */
644     } /* else non-ignorable */
645     RETVAL = dst;
646 OUTPUT:
647     RETVAL
648 
649 
650 
651 SV*
652 visualizeSortKey (self, key)
653     SV * self
654     SV * key
655   PREINIT:
656     HV *selfHV;
657     SV **svp, *dst;
658     U8 *s, *e, *d;
659     STRLEN klen, dlen;
660     UV uv;
661     IV uca_vers, sep = 0;
662     const char *upperhex = "0123456789ABCDEF";
663   CODE:
664     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
665 	selfHV = (HV*)SvRV(self);
666     else
667 	croak("$self is not a HASHREF.");
668 
669     svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
670     if (!svp)
671 	croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
672     uca_vers = SvIV(*svp);
673 
674     s = (U8*)SvPV(key, klen);
675 
676    /* slightly *longer* than the need, but I'm afraid of miscounting;
677       = (klen / 2) * 5 - 1
678              # FFFF and ' ' for each 16bit units but ' ' is less by 1;
679              # ' ' and '|' for level boundaries including the identical level
680        + 2   # '[' and ']'
681        + 1   # '\0'
682        (a) if klen is odd (not expected), maybe more 5 bytes.
683        (b) there is not always the identical level.
684    */
685     dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
686     dst = newSV(dlen);
687     (void)SvPOK_only(dst);
688     d = (U8*)SvPVX(dst);
689 
690     *d++ = '[';
691     for (e = s + klen; s < e; s += 2) {
692 	uv = (U16)(*s << 8 | s[1]);
693 	if (uv || sep >= MaxLevel) {
694 	    if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
695 		*d++ = ' ';
696 	    *d++ = upperhex[ (s[0] >> 4) & 0xF ];
697 	    *d++ = upperhex[  s[0]       & 0xF ];
698 	    *d++ = upperhex[ (s[1] >> 4) & 0xF ];
699 	    *d++ = upperhex[  s[1]       & 0xF ];
700 	} else {
701 	    if ((9 <= uca_vers) && (d[-1] != '['))
702 		*d++ = ' ';
703 	    *d++ = '|';
704 	    ++sep;
705 	}
706     }
707     *d++ = ']';
708     *d   = '\0';
709     SvCUR_set(dst, d - (U8*)SvPVX(dst));
710     RETVAL = dst;
711 OUTPUT:
712     RETVAL
713