1 
2 #define PERL_NO_GET_CONTEXT /* we want efficiency */
3 
4 /* I guese no private function needs pTHX_ and aTHX_ */
5 
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 
10 /* This file is prepared by mkheader */
11 #include "ucatbl.h"
12 
13 /* At present, char > 0x10ffff are unaffected without complaint, right? */
14 #define VALID_UTF_MAX    (0x10ffff)
15 #define OVER_UTF_MAX(uv) (VALID_UTF_MAX < (uv))
16 
17 #define MAX_DIV_16 (UV_MAX / 16)
18 
19 /* Supported Levels */
20 #define MinLevel	(1)
21 #define MaxLevel	(4)
22 
23 /* Shifted weight at 4th level */
24 #define Shift4Wt	(0xFFFF)
25 
26 #define VCE_Length	(9)
27 
28 #define Hangul_SBase  (0xAC00)
29 #define Hangul_SIni   (0xAC00)
30 #define Hangul_SFin   (0xD7A3)
31 #define Hangul_NCount (588)
32 #define Hangul_TCount (28)
33 #define Hangul_LBase  (0x1100)
34 #define Hangul_LIni   (0x1100)
35 #define Hangul_LFin   (0x1159)
36 #define Hangul_LFill  (0x115F)
37 #define Hangul_LEnd   (0x115F) /* Unicode 5.2 */
38 #define Hangul_VBase  (0x1161)
39 #define Hangul_VIni   (0x1160) /* from Vowel Filler */
40 #define Hangul_VFin   (0x11A2)
41 #define Hangul_VEnd   (0x11A7) /* Unicode 5.2 */
42 #define Hangul_TBase  (0x11A7) /* from "no-final" codepoint */
43 #define Hangul_TIni   (0x11A8)
44 #define Hangul_TFin   (0x11F9)
45 #define Hangul_TEnd   (0x11FF) /* Unicode 5.2 */
46 #define HangulL2Ini   (0xA960) /* Unicode 5.2 */
47 #define HangulL2Fin   (0xA97C) /* Unicode 5.2 */
48 #define HangulV2Ini   (0xD7B0) /* Unicode 5.2 */
49 #define HangulV2Fin   (0xD7C6) /* Unicode 5.2 */
50 #define HangulT2Ini   (0xD7CB) /* Unicode 5.2 */
51 #define HangulT2Fin   (0xD7FB) /* Unicode 5.2 */
52 
53 #define CJK_UidIni    (0x4E00)
54 #define CJK_UidFin    (0x9FA5)
55 #define CJK_UidF41    (0x9FBB) /* Unicode 4.1 */
56 #define CJK_UidF51    (0x9FC3) /* Unicode 5.1 */
57 #define CJK_UidF52    (0x9FCB) /* Unicode 5.2 */
58 #define CJK_UidF61    (0x9FCC) /* Unicode 6.1 */
59 #define CJK_UidF80    (0x9FD5) /* Unicode 8.0 */
60 #define CJK_UidF100   (0x9FEA) /* Unicode 10.0 */
61 #define CJK_UidF110   (0x9FEF) /* Unicode 11.0 */
62 #define CJK_UidF130   (0x9FFC) /* Unicode 13.0 */
63 
64 #define CJK_ExtAIni   (0x3400) /* Unicode 3.0 */
65 #define CJK_ExtAFin   (0x4DB5) /* Unicode 3.0 */
66 #define CJK_ExtA130   (0x4DBF) /* Unicode 13.0 */
67 #define CJK_ExtBIni  (0x20000) /* Unicode 3.1 */
68 #define CJK_ExtBFin  (0x2A6D6) /* Unicode 3.1 */
69 #define CJK_ExtB130  (0x2A6DD) /* Unicode 13.0 */
70 #define CJK_ExtCIni  (0x2A700) /* Unicode 5.2 */
71 #define CJK_ExtCFin  (0x2B734) /* Unicode 5.2 */
72 #define CJK_ExtDIni  (0x2B740) /* Unicode 6.0 */
73 #define CJK_ExtDFin  (0x2B81D) /* Unicode 6.0 */
74 #define CJK_ExtEIni  (0x2B820) /* Unicode 8.0 */
75 #define CJK_ExtEFin  (0x2CEA1) /* Unicode 8.0 */
76 #define CJK_ExtFIni  (0x2CEB0) /* Unicode 10.0 */
77 #define CJK_ExtFFin  (0x2EBE0) /* Unicode 10.0 */
78 #define CJK_ExtGIni  (0x30000) /* Unicode 13.0 */
79 #define CJK_ExtGFin  (0x3134A) /* Unicode 13.0 */
80 
81 #define CJK_CompIni  (0xFA0E)
82 #define CJK_CompFin  (0xFA29)
83 static const STDCHAR UnifiedCompat[] = {
84       1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1
85 }; /* E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 */
86 
87 #define TangIdeoIni  (0x17000) /* Unicode 9.0 */
88 #define TangIdeoFin  (0x187EC) /* Unicode 9.0 */
89 #define TangIdeo110  (0x187F1) /* Unicode 11.0 */
90 #define TangIdeo120  (0x187F7) /* Unicode 12.0 */
91 #define TangCompIni  (0x18800) /* Unicode 9.0 */
92 #define TangCompFin  (0x18AF2) /* Unicode 9.0 */
93 #define TangComp130  (0x18AFF) /* Unicode 13.0 */
94 #define TangSuppIni  (0x18D00) /* Unicode 13.0 */
95 #define TangSuppFin  (0x18D08) /* Unicode 13.0 */
96 #define NushuIni     (0x1B170) /* Unicode 10.0 */
97 #define NushuFin     (0x1B2FB) /* Unicode 10.0 */
98 #define KhitanIni    (0x18B00) /* Unicode 13.0 */
99 #define KhitanFin    (0x18CD5) /* Unicode 13.0 */
100 
101 #define codeRange(bcode, ecode)	((bcode) <= code && code <= (ecode))
102 
103 MODULE = Unicode::Collate	PACKAGE = Unicode::Collate
104 
105 PROTOTYPES: DISABLE
106 
107 void
108 _fetch_rest ()
109   PREINIT:
110     char ** rest;
111   PPCODE:
112     for (rest = (char **)UCA_rest; *rest; ++rest) {
113 	XPUSHs(sv_2mortal(newSVpv((char *) *rest, 0)));
114     }
115 
116 
117 void
118 _fetch_simple (uv)
119     UV uv
120   PREINIT:
121     U8 ***plane, **row;
122     U8* result = NULL;
123   PPCODE:
124     if (!OVER_UTF_MAX(uv)){
125 	plane = (U8***)UCA_simple[uv >> 16];
126 	if (plane) {
127 	    row = plane[(uv >> 8) & 0xff];
128 	    result = row ? row[uv & 0xff] : NULL;
129 	}
130     }
131     if (result) {
132 	int i;
133 	int num = (int)*result;
134 	++result;
135 	EXTEND(SP, num);
136 	for (i = 0; i < num; ++i) {
137 	    PUSHs(sv_2mortal(newSVpvn((char *) result, VCE_Length)));
138 	    result += VCE_Length;
139 	}
140     } else {
141 	PUSHs(sv_2mortal(newSViv(0)));
142     }
143 
144 SV*
145 _ignorable_simple (uv)
146     UV uv
147   ALIAS:
148     _exists_simple = 1
149   PREINIT:
150     U8 ***plane, **row;
151     int num = -1;
152     U8* result = NULL;
153   CODE:
154     if (!OVER_UTF_MAX(uv)){
155 	plane = (U8***)UCA_simple[uv >> 16];
156 	if (plane) {
157 	    row = plane[(uv >> 8) & 0xff];
158 	    result = row ? row[uv & 0xff] : NULL;
159 	}
160 	if (result)
161 	    num = (int)*result; /* assuming 0 <= num < 128 */
162     }
163 
164     if (ix)
165 	RETVAL = boolSV(num >0);
166     else
167 	RETVAL = boolSV(num==0);
168   OUTPUT:
169     RETVAL
170 
171 
172 void
173 _getHexArray (src)
174     SV* src
175   PREINIT:
176     char *s, *e;
177     STRLEN byte;
178     UV value;
179     bool overflowed = FALSE;
180     const char *hexdigit;
181   PPCODE:
182     s = SvPV(src,byte);
183     for (e = s + byte; s < e;) {
184 	hexdigit = strchr((char *) PL_hexdigit, *s++);
185 	if (! hexdigit)
186 	    continue;
187 	value = (hexdigit - PL_hexdigit) & 0xF;
188 	while (*s) {
189 	    hexdigit = strchr((char *) PL_hexdigit, *s++);
190 	    if (! hexdigit)
191 		break;
192 	    if (overflowed)
193 		continue;
194 	    if (value > MAX_DIV_16) {
195 		overflowed = TRUE;
196 		continue;
197 	    }
198 	    value = (value << 4) | ((hexdigit - PL_hexdigit) & 0xF);
199 	}
200 	XPUSHs(sv_2mortal(newSVuv(overflowed ? UV_MAX : value)));
201     }
202 
203 
204 SV*
205 _isIllegal (sv)
206     SV* sv
207   PREINIT:
208     UV uv;
209   CODE:
210     if (!sv || !SvIOK(sv))
211 	XSRETURN_YES;
212     uv = SvUVX(sv);
213     RETVAL = boolSV(
214 	   0x10FFFF < uv                   /* out of range */
215 	|| ((uv & 0xFFFE) == 0xFFFE)       /* ??FFF[EF] */
216 	|| (0xD800 <= uv && uv <= 0xDFFF)  /* unpaired surrogates */
217 	|| (0xFDD0 <= uv && uv <= 0xFDEF)  /* other non-characters */
218     );
219 OUTPUT:
220     RETVAL
221 
222 
223 void
224 _decompHangul (code)
225     UV code
226   PREINIT:
227     UV sindex, lindex, vindex, tindex;
228   PPCODE:
229     /* code *must* be in Hangul syllable.
230      * Check it before you enter here. */
231     sindex =  code - Hangul_SBase;
232     lindex =  sindex / Hangul_NCount;
233     vindex = (sindex % Hangul_NCount) / Hangul_TCount;
234     tindex =  sindex % Hangul_TCount;
235 
236     EXTEND(SP, tindex ? 3 : 2);
237     PUSHs(sv_2mortal(newSVuv(lindex + Hangul_LBase)));
238     PUSHs(sv_2mortal(newSVuv(vindex + Hangul_VBase)));
239     if (tindex)
240 	PUSHs(sv_2mortal(newSVuv(tindex + Hangul_TBase)));
241 
242 
243 SV*
244 getHST (code, uca_vers = 0)
245     UV code;
246     IV uca_vers;
247   PREINIT:
248     const char * hangtype;
249     STRLEN typelen;
250   CODE:
251     if (codeRange(Hangul_SIni, Hangul_SFin)) {
252 	if ((code - Hangul_SBase) % Hangul_TCount) {
253 	    hangtype = "LVT"; typelen = 3;
254 	} else {
255 	    hangtype = "LV"; typelen = 2;
256 	}
257     } else if (uca_vers < 20) {
258 	if (codeRange(Hangul_LIni, Hangul_LFin) || code == Hangul_LFill) {
259 	    hangtype = "L"; typelen = 1;
260 	} else if (codeRange(Hangul_VIni, Hangul_VFin)) {
261 	    hangtype = "V"; typelen = 1;
262 	} else if (codeRange(Hangul_TIni, Hangul_TFin)) {
263 	    hangtype = "T"; typelen = 1;
264 	} else {
265 	    hangtype = ""; typelen = 0;
266 	}
267     } else {
268 	if        (codeRange(Hangul_LIni, Hangul_LEnd) ||
269 		   codeRange(HangulL2Ini, HangulL2Fin)) {
270 	    hangtype = "L"; typelen = 1;
271 	} else if (codeRange(Hangul_VIni, Hangul_VEnd) ||
272 		   codeRange(HangulV2Ini, HangulV2Fin)) {
273 	    hangtype = "V"; typelen = 1;
274 	} else if (codeRange(Hangul_TIni, Hangul_TEnd) ||
275 		   codeRange(HangulT2Ini, HangulT2Fin)) {
276 	    hangtype = "T"; typelen = 1;
277 	} else {
278 	    hangtype = ""; typelen = 0;
279 	}
280     }
281 
282     RETVAL = newSVpvn(hangtype, typelen);
283 OUTPUT:
284     RETVAL
285 
286 
287 void
288 _derivCE_9 (code)
289     UV code
290   ALIAS:
291     _derivCE_14 = 1
292     _derivCE_18 = 2
293     _derivCE_20 = 3
294     _derivCE_22 = 4
295     _derivCE_24 = 5
296     _derivCE_32 = 6
297     _derivCE_34 = 7
298     _derivCE_36 = 8
299     _derivCE_38 = 9
300     _derivCE_40 = 10
301     _derivCE_43 = 11
302   PREINIT:
303     UV base, aaaa, bbbb;
304     U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
305     U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
306     bool basic_unified = 0, tangut = 0, nushu = 0, khitan = 0;
307   PPCODE:
308     if (codeRange(CJK_UidIni, CJK_CompFin)) {
309 	if (codeRange(CJK_CompIni, CJK_CompFin))
310 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
311 	else
312 	    basic_unified = (ix >= 11 ? (code <= CJK_UidF130) :
313 			     ix >= 9  ? (code <= CJK_UidF110) :
314 			     ix == 8  ? (code <= CJK_UidF100) :
315 			     ix >= 6  ? (code <= CJK_UidF80) :
316 			     ix == 5  ? (code <= CJK_UidF61) :
317 			     ix >= 3  ? (code <= CJK_UidF52) :
318 			     ix == 2  ? (code <= CJK_UidF51) :
319 			     ix == 1  ? (code <= CJK_UidF41) :
320 				        (code <= CJK_UidFin));
321     } else {
322 	if (ix >= 7) {
323 	    tangut = (ix >= 11) ? (codeRange(TangIdeoIni, TangIdeo120) ||
324 				   codeRange(TangCompIni, TangComp130) ||
325 				   codeRange(TangSuppIni, TangSuppFin)) :
326 		     (ix == 10) ? (codeRange(TangIdeoIni, TangIdeo120) ||
327 				   codeRange(TangCompIni, TangCompFin)) :
328 		     (ix == 9)  ? (codeRange(TangIdeoIni, TangIdeo110) ||
329 				   codeRange(TangCompIni, TangCompFin)) :
330 				  (codeRange(TangIdeoIni, TangIdeoFin) ||
331 				   codeRange(TangCompIni, TangCompFin));
332 	}
333 	if (ix >= 8)
334 	    nushu = (codeRange(NushuIni, NushuFin));
335 	if (ix >= 11)
336 	    khitan = (codeRange(KhitanIni, KhitanFin));
337     }
338     base = tangut
339 	    ? 0xFB00 :
340 	   nushu
341 	    ? 0xFB01 :
342 	   khitan
343 	    ? 0xFB02 :
344 	   basic_unified
345 	    ? 0xFB40 : /* CJK */
346 	   ((ix >= 11 ? codeRange(CJK_ExtAIni, CJK_ExtA130)
347 		      : codeRange(CJK_ExtAIni, CJK_ExtAFin))
348 		||
349 	    (ix >= 11 ? codeRange(CJK_ExtBIni, CJK_ExtB130)
350 		      : codeRange(CJK_ExtBIni, CJK_ExtBFin))
351 		||
352 	    (ix >= 3 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
353 		||
354 	    (ix >= 4 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
355 		||
356 	    (ix >= 6 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
357 		||
358 	    (ix >= 8 && codeRange(CJK_ExtFIni, CJK_ExtFFin))
359 		||
360 	   (ix >= 11 && codeRange(CJK_ExtGIni, CJK_ExtGFin)))
361 	    ? 0xFB80   /* CJK ext. */
362 	    : 0xFBC0;  /* others */
363     aaaa = tangut || nushu || khitan ? base : base + (code >> 15);
364     bbbb = (tangut ? (code - TangIdeoIni) :
365 	    nushu  ? (code - NushuIni) :
366 	    khitan ? (code - KhitanIni) : (code & 0x7FFF)) | 0x8000;
367     a[1] = (U8)(aaaa >> 8);
368     a[2] = (U8)(aaaa & 0xFF);
369     b[1] = (U8)(bbbb >> 8);
370     b[2] = (U8)(bbbb & 0xFF);
371     a[4] = (U8)(0x20); /* second octet of level 2 */
372     a[6] = (U8)(0x02); /* second octet of level 3 */
373     a[7] = b[7] = (U8)(code >> 8);
374     a[8] = b[8] = (U8)(code & 0xFF);
375     EXTEND(SP, 2);
376     PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
377     PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
378 
379 
380 void
381 _derivCE_8 (code)
382     UV code
383   PREINIT:
384     UV aaaa, bbbb;
385     U8 a[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
386     U8 b[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
387   PPCODE:
388     aaaa =  0xFF80 + (code >> 15);
389     bbbb = (code & 0x7FFF) | 0x8000;
390     a[1] = (U8)(aaaa >> 8);
391     a[2] = (U8)(aaaa & 0xFF);
392     b[1] = (U8)(bbbb >> 8);
393     b[2] = (U8)(bbbb & 0xFF);
394     a[4] = (U8)(0x02); /* second octet of level 2 */
395     a[6] = (U8)(0x01); /* second octet of level 3 */
396     a[7] = b[7] = (U8)(code >> 8);
397     a[8] = b[8] = (U8)(code & 0xFF);
398     EXTEND(SP, 2);
399     PUSHs(sv_2mortal(newSVpvn((char *) a, VCE_Length)));
400     PUSHs(sv_2mortal(newSVpvn((char *) b, VCE_Length)));
401 
402 
403 void
404 _uideoCE_8 (code)
405     UV code
406   PREINIT:
407     U8 uice[VCE_Length + 1] = "\x00\x00\x00\x00\x00\x00\x00\x00\x00";
408   PPCODE:
409     uice[1] = uice[7] = (U8)(code >> 8);
410     uice[2] = uice[8] = (U8)(code & 0xFF);
411     uice[4] = (U8)(0x20); /* second octet of level 2 */
412     uice[6] = (U8)(0x02); /* second octet of level 3 */
413     PUSHs(sv_2mortal(newSVpvn((char *) uice, VCE_Length)));
414 
415 
416 SV*
417 _isUIdeo (code, uca_vers)
418     UV code;
419     IV uca_vers;
420     bool basic_unified = 0;
421   CODE:
422     /* uca_vers = 0 for _uideoCE_8() */
423     if (CJK_UidIni <= code) {
424 	if (codeRange(CJK_CompIni, CJK_CompFin))
425 	    basic_unified = (bool)UnifiedCompat[code - CJK_CompIni];
426 	else
427 	    basic_unified = (uca_vers >= 43 ? (code <= CJK_UidF130) :
428 			     uca_vers >= 38 ? (code <= CJK_UidF110) :
429 			     uca_vers >= 36 ? (code <= CJK_UidF100) :
430 			     uca_vers >= 32 ? (code <= CJK_UidF80) :
431 			     uca_vers >= 24 ? (code <= CJK_UidF61) :
432 			     uca_vers >= 20 ? (code <= CJK_UidF52) :
433 			     uca_vers >= 18 ? (code <= CJK_UidF51) :
434 			     uca_vers >= 14 ? (code <= CJK_UidF41) :
435 					      (code <= CJK_UidFin));
436     }
437     RETVAL = boolSV(
438 	(basic_unified)
439 		||
440 	(codeRange(CJK_ExtAIni, CJK_ExtAFin))
441 		||
442 	(uca_vers >= 43 && codeRange(CJK_ExtAIni, CJK_ExtA130))
443 		||
444 	(uca_vers >=  8 && codeRange(CJK_ExtBIni, CJK_ExtBFin))
445 		||
446 	(uca_vers >= 43 && codeRange(CJK_ExtBIni, CJK_ExtB130))
447 		||
448 	(uca_vers >= 20 && codeRange(CJK_ExtCIni, CJK_ExtCFin))
449 		||
450 	(uca_vers >= 22 && codeRange(CJK_ExtDIni, CJK_ExtDFin))
451 		||
452 	(uca_vers >= 32 && codeRange(CJK_ExtEIni, CJK_ExtEFin))
453 		||
454 	(uca_vers >= 36 && codeRange(CJK_ExtFIni, CJK_ExtFFin))
455 		||
456 	(uca_vers >= 43 && codeRange(CJK_ExtGIni, CJK_ExtGFin))
457     );
458 OUTPUT:
459     RETVAL
460 
461 
462 SV*
463 mk_SortKey (self, buf)
464     SV* self;
465     SV* buf;
466   PREINIT:
467     SV *dst, **svp;
468     STRLEN dlen, vlen;
469     U8 *d, *p, *e, *v, *s[MaxLevel], *eachlevel[MaxLevel];
470     AV *bufAV;
471     HV *selfHV;
472     UV back_flag;
473     I32 i, buf_len;
474     IV  lv, level, uca_vers;
475     bool upper_lower, kata_hira, v2i, last_is_var;
476   CODE:
477     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
478 	selfHV = (HV*)SvRV(self);
479     else
480 	croak("$self is not a HASHREF.");
481 
482     if (SvROK(buf) && SvTYPE(SvRV(buf)) == SVt_PVAV)
483 	bufAV = (AV*)SvRV(buf);
484     else
485 	croak("XSUB, not an ARRAYREF.");
486 
487     buf_len = av_len(bufAV);
488 
489     if (buf_len < 0) { /* empty: -1 */
490 	dlen = 2 * (MaxLevel - 1);
491 	dst = newSV(dlen);
492 	(void)SvPOK_only(dst);
493 	d = (U8*)SvPVX(dst);
494 	while (dlen--)
495 	    *d++ = '\0';
496     } else {
497 	svp = hv_fetch(selfHV, "level", 5, FALSE);
498 	level = svp ? SvIV(*svp) : MaxLevel;
499 
500 	for (lv = 0; lv < level; lv++) {
501 	    New(0, eachlevel[lv], 2 * (1 + buf_len) + 1, U8);
502 	    s[lv] = eachlevel[lv];
503 	}
504 
505 	svp = hv_fetch(selfHV, "upper_before_lower", 18, FALSE);
506 	upper_lower = svp ? SvTRUE(*svp) : FALSE;
507 	svp = hv_fetch(selfHV, "katakana_before_hiragana", 24, FALSE);
508 	kata_hira = svp ? SvTRUE(*svp) : FALSE;
509 	svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
510 	uca_vers = SvIV(*svp);
511 	svp = hv_fetch(selfHV, "variable", 8, FALSE);
512 	v2i = uca_vers >= 9 && svp /* (vers >= 9) and not (non-ignorable) */
513 	    ? !(SvCUR(*svp) == 13 && memEQ(SvPVX(*svp), "non-ignorable", 13))
514 	    : FALSE;
515 
516 	last_is_var = FALSE;
517 	for (i = 0; i <= buf_len; i++) {
518 	    svp = av_fetch(bufAV, i, FALSE);
519 
520 	    if (svp && SvPOK(*svp))
521 		v = (U8*)SvPV(*svp, vlen);
522 	    else
523 		croak("not a vwt.");
524 
525 	    if (vlen < VCE_Length) /* ignore short VCE (unexpected) */
526 		continue;
527 
528 	    /* "Ignorable (L1, L2) after Variable" since track. v. 9 */
529 	    if (v2i) {
530 		if (*v)
531 		    last_is_var = TRUE;
532 		else if (v[1] || v[2]) /* non zero primary weight */
533 		    last_is_var = FALSE;
534 		else if (last_is_var) /* zero primary weight; skipped */
535 		    continue;
536 	    }
537 
538 	    if (v[5] == 0) { /* tert wt < 256 */
539 		if (upper_lower) {
540 		    if (0x8 <= v[6] && v[6] <= 0xC) /* lower */
541 			v[6] -= 6;
542 		    else if (0x2 <= v[6] && v[6] <= 0x6) /* upper */
543 			v[6] += 6;
544 		    else if (v[6] == 0x1C) /* square upper */
545 			v[6]++;
546 		    else if (v[6] == 0x1D) /* square lower */
547 			v[6]--;
548 		}
549 		if (kata_hira) {
550 		    if (0x0F <= v[6] && v[6] <= 0x13) /* katakana */
551 			v[6] -= 2;
552 		    else if (0xD <= v[6] && v[6] <= 0xE) /* hiragana */
553 			v[6] += 5;
554 		}
555 	    }
556 
557 	    for (lv = 0; lv < level; lv++) {
558 		if (v[2 * lv + 1] || v[2 * lv + 2]) {
559 		    *s[lv]++ = v[2 * lv + 1];
560 		    *s[lv]++ = v[2 * lv + 2];
561 		}
562 	    }
563 	}
564 
565 	dlen = 2 * (MaxLevel - 1);
566 	for (lv = 0; lv < level; lv++)
567 	    dlen += s[lv] - eachlevel[lv];
568 
569 	dst = newSV(dlen);
570 	(void)SvPOK_only(dst);
571 	d = (U8*)SvPVX(dst);
572 
573 	svp = hv_fetch(selfHV, "backwardsFlag", 13, FALSE);
574 	back_flag = svp ? SvUV(*svp) : (UV)0;
575 
576 	for (lv = 0; lv < level; lv++) {
577 	    if (back_flag & (1 << (lv + 1))) {
578 		p = s[lv];
579 		e = eachlevel[lv];
580 		for ( ; e < p; p -= 2) {
581 		    *d++ = p[-2];
582 		    *d++ = p[-1];
583 		}
584 	    }
585 	    else {
586 		p = eachlevel[lv];
587 		e = s[lv];
588 		while (p < e)
589 		    *d++ = *p++;
590 	    }
591 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
592 		*d++ = '\0';
593 		*d++ = '\0';
594 	    }
595 	}
596 
597 	for (lv = level; lv < MaxLevel; lv++) {
598 	    if (lv + 1 < MaxLevel) { /* lv + 1 == real level */
599 		*d++ = '\0';
600 		*d++ = '\0';
601 	    }
602 	}
603 
604 	for (lv = 0; lv < level; lv++) {
605 	    Safefree(eachlevel[lv]);
606 	}
607     }
608     *d = '\0';
609     SvCUR_set(dst, d - (U8*)SvPVX(dst));
610     RETVAL = dst;
611 OUTPUT:
612     RETVAL
613 
614 
615 SV*
616 varCE (self, vce)
617     SV* self;
618     SV* vce;
619   PREINIT:
620     SV *dst, *vbl, **svp;
621     HV *selfHV;
622     U8 *a, *v, *d;
623     STRLEN alen, vlen;
624     bool ig_l2;
625     IV uca_vers;
626     UV totwt;
627   CODE:
628     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
629 	selfHV = (HV*)SvRV(self);
630     else
631 	croak("$self is not a HASHREF.");
632 
633     svp = hv_fetch(selfHV, "ignore_level2", 13, FALSE);
634     ig_l2 = svp ? SvTRUE(*svp) : FALSE;
635 
636     svp = hv_fetch(selfHV, "variable", 8, FALSE);
637     vbl = svp ? *svp : &PL_sv_no;
638     a = (U8*)SvPV(vbl, alen);
639     v = (U8*)SvPV(vce, vlen);
640 
641     dst = newSV(vlen);
642     d = (U8*)SvPVX(dst);
643     (void)SvPOK_only(dst);
644     Copy(v, d, vlen, U8);
645     SvCUR_set(dst, vlen);
646     d[vlen] = '\0';
647 
648     /* primary weight == 0 && secondary weight != 0 */
649     if (ig_l2 && !d[1] && !d[2] && (d[3] || d[4])) {
650 	d[3] = d[4] = d[5] = d[6] = '\0';
651     }
652 
653     /* variable: checked only the first char and the length,
654        trusting checkCollator() and %VariableOK in Perl ... */
655 
656     if (vlen >= VCE_Length && *a != 'n') {
657 	if (*v) {
658 	    if (*a == 's') { /* shifted or shift-trimmed */
659 		d[7] = d[1]; /* wt level 1 to 4 */
660 		d[8] = d[2];
661 	    } /* else blanked */
662 	    d[1] = d[2] = d[3] = d[4] = d[5] = d[6] = '\0';
663 	} else if (*a == 's') { /* shifted or shift-trimmed */
664 	    totwt = d[1] + d[2] + d[3] + d[4] + d[5] + d[6];
665 	    if (alen == 7 && totwt != 0) { /* shifted */
666 		if (d[1] == 0 && d[2] == 1) { /* XXX: CollationAuxiliary-6.2.0 */
667 		    d[7] = d[1]; /* wt level 1 to 4 */
668 		    d[8] = d[2];
669 		} else {
670 		    svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
671 		    if (!svp)
672 			croak("Panic: no $self->{UCA_Version} in varCE");
673 		    uca_vers = SvIV(*svp);
674 
675 		    /* completely ignorable or the second derived CE */
676 		    if (uca_vers >= 36 && d[3] + d[4] + d[5] + d[6] == 0) {
677 			d[7] = d[8] = '\0';
678 		    } else {
679 			d[7] = (U8)(Shift4Wt >> 8);
680 			d[8] = (U8)(Shift4Wt & 0xFF);
681 		    }
682 		}
683 	    } else { /* shift-trimmed or completely ignorable */
684 		d[7] = d[8] = '\0';
685 	    }
686 	} /* else blanked */
687     } /* else non-ignorable */
688     RETVAL = dst;
689 OUTPUT:
690     RETVAL
691 
692 
693 
694 SV*
695 visualizeSortKey (self, key)
696     SV * self
697     SV * key
698   PREINIT:
699     HV *selfHV;
700     SV **svp, *dst;
701     U8 *s, *e, *d;
702     STRLEN klen, dlen;
703     UV uv;
704     IV uca_vers, sep = 0;
705     const char *upperhex = "0123456789ABCDEF";
706   CODE:
707     if (SvROK(self) && SvTYPE(SvRV(self)) == SVt_PVHV)
708 	selfHV = (HV*)SvRV(self);
709     else
710 	croak("$self is not a HASHREF.");
711 
712     svp = hv_fetch(selfHV, "UCA_Version", 11, FALSE);
713     if (!svp)
714 	croak("Panic: no $self->{UCA_Version} in visualizeSortKey");
715     uca_vers = SvIV(*svp);
716 
717     s = (U8*)SvPV(key, klen);
718 
719    /* slightly *longer* than the need, but I'm afraid of miscounting;
720       = (klen / 2) * 5 - 1
721              # FFFF and ' ' for each 16bit units but ' ' is less by 1;
722              # ' ' and '|' for level boundaries including the identical level
723        + 2   # '[' and ']'
724        + 1   # '\0'
725        (a) if klen is odd (not expected), maybe more 5 bytes.
726        (b) there is not always the identical level.
727    */
728     dlen = (klen / 2) * 5 + MaxLevel * 2 + 2;
729     dst = newSV(dlen);
730     (void)SvPOK_only(dst);
731     d = (U8*)SvPVX(dst);
732 
733     *d++ = '[';
734     for (e = s + klen; s < e; s += 2) {
735 	uv = (U16)(*s << 8 | s[1]);
736 	if (uv || sep >= MaxLevel) {
737 	    if ((d[-1] != '[') && ((9 <= uca_vers) || (d[-1] != '|')))
738 		*d++ = ' ';
739 	    *d++ = upperhex[ (s[0] >> 4) & 0xF ];
740 	    *d++ = upperhex[  s[0]       & 0xF ];
741 	    *d++ = upperhex[ (s[1] >> 4) & 0xF ];
742 	    *d++ = upperhex[  s[1]       & 0xF ];
743 	} else {
744 	    if ((9 <= uca_vers) && (d[-1] != '['))
745 		*d++ = ' ';
746 	    *d++ = '|';
747 	    ++sep;
748 	}
749     }
750     *d++ = ']';
751     *d   = '\0';
752     SvCUR_set(dst, d - (U8*)SvPVX(dst));
753     RETVAL = dst;
754 OUTPUT:
755     RETVAL
756