1 /*
2  $Id: Unicode.xs,v 2.20 2021/07/23 02:26:54 dankogai Exp $
3  */
4 
5 #define IN_UNICODE_XS
6 
7 #define PERL_NO_GET_CONTEXT
8 #include "EXTERN.h"
9 #include "perl.h"
10 #include "XSUB.h"
11 #include "../Encode/encode.h"
12 
13 #define FBCHAR			0xFFFd
14 #define BOM_BE			0xFeFF
15 #define BOM16LE			0xFFFe
16 #define BOM32LE			0xFFFe0000
17 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
18 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
19 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
20 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
21 
22 #ifndef SVfARG
23 #define SVfARG(p) ((void*)(p))
24 #endif
25 
26 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
27 
28 /* Avoid wasting too much space in the result buffer */
29 /* static void */
30 /* shrink_buffer(SV *result) */
31 /* { */
32 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
33 /* 	char *buf; */
34 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
35 /* 	New(0, buf, len, char); */
36 /* 	Copy(SvPVX(result), buf, len, char); */
37 /* 	Safefree(SvPVX(result)); */
38 /* 	SvPV_set(result, buf); */
39 /* 	SvLEN_set(result, len); */
40 /*     } */
41 /* } */
42 
43 #define shrink_buffer(result) { \
44     if (SvLEN(result) > 42 + SvCUR(result)) { \
45 	char *newpv; \
46 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
47 	New(0, newpv, newlen, char); \
48 	Copy(SvPVX(result), newpv, newlen, char); \
49 	Safefree(SvPVX(result)); \
50 	SvPV_set(result, newpv); \
51 	SvLEN_set(result, newlen); \
52     } \
53 }
54 
55 static UV
56 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
57 {
58     U8 *s = *sp;
59     UV v = 0;
60     if (s+size > e) {
61 	croak("Partial character %c",(char) endian);
62     }
63     switch(endian) {
64     case 'N':
65 	v = *s++;
66 	v = (v << 8) | *s++;
67         /* FALLTHROUGH */
68     case 'n':
69 	v = (v << 8) | *s++;
70 	v = (v << 8) | *s++;
71 	break;
72     case 'V':
73     case 'v':
74 	v |= *s++;
75 	v |= (*s++ << 8);
76 	if (endian == 'v')
77 	    break;
78 	v |= (*s++ << 16);
79 	v |= ((UV)*s++ << 24);
80 	break;
81     default:
82 	croak("Unknown endian %c",(char) endian);
83 	break;
84     }
85     *sp = s;
86     return v;
87 }
88 
89 static void
90 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
91 {
92     U8 *d = (U8 *) SvPV_nolen(result);
93 
94     switch(endian) {
95     case 'v':
96     case 'V':
97 	d += SvCUR(result);
98 	SvCUR_set(result,SvCUR(result)+size);
99 	while (size--) {
100 	    *d++ = (U8)(value & 0xFF);
101 	    value >>= 8;
102 	}
103 	break;
104     case 'n':
105     case 'N':
106 	SvCUR_set(result,SvCUR(result)+size);
107 	d += SvCUR(result);
108 	while (size--) {
109 	    *--d = (U8)(value & 0xFF);
110 	    value >>= 8;
111 	}
112 	break;
113     default:
114 	croak("Unknown endian %c",(char) endian);
115 	break;
116     }
117 }
118 
119 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
120 
121 PROTOTYPES: DISABLE
122 
123 #define attr(k)  (hv_exists((HV *)SvRV(obj),"" k "",sizeof(k)-1) ? \
124     *hv_fetch((HV *)SvRV(obj),"" k "",sizeof(k)-1,0) : &PL_sv_undef)
125 
126 void
127 decode(obj, str, check = 0)
128 SV *	obj
129 SV *	str
130 IV	check
131 CODE:
132 {
133     SV *name     = attr("Name");
134     SV *sve      = attr("endian");
135     U8 endian    = *((U8 *)SvPV_nolen(sve));
136     SV *svs      = attr("size");
137     int size     = SvIV(svs);
138     int ucs2     = -1; /* only needed in the event of surrogate pairs */
139     SV *result   = newSVpvn("",0);
140     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
141     STRLEN ulen;
142     STRLEN resultbuflen;
143     U8 *resultbuf;
144     U8 *s;
145     U8 *e;
146     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
147     bool temp_result;
148 
149     SvGETMAGIC(str);
150     if (!SvOK(str))
151         XSRETURN_UNDEF;
152     s = modify ? (U8 *)SvPV_force_nomg(str, ulen) : (U8 *)SvPV_nomg(str, ulen);
153     if (SvUTF8(str)) {
154         if (!modify) {
155             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
156             SvUTF8_on(tmp);
157             if (SvTAINTED(str))
158                 SvTAINTED_on(tmp);
159             str = tmp;
160             s = (U8 *)SvPVX(str);
161         }
162         if (ulen) {
163             if (!utf8_to_bytes(s, &ulen))
164                 croak("Wide character");
165             SvCUR_set(str, ulen);
166         }
167         SvUTF8_off(str);
168     }
169     e = s+ulen;
170 
171     /* Optimise for the common case of being called from PerlIOEncode_fill()
172        with a standard length buffer. In this case the result SV's buffer is
173        only used temporarily, so we can afford to allocate the maximum needed
174        and not care about unused space. */
175     temp_result = (ulen == PERLIO_BUFSIZ);
176 
177     ST(0) = sv_2mortal(result);
178     SvUTF8_on(result);
179 
180     if (!endian && s+size <= e) {
181 	SV *sv;
182 	UV bom;
183 	endian = (size == 4) ? 'N' : 'n';
184 	bom = enc_unpack(aTHX_ &s,e,size,endian);
185 	if (bom != BOM_BE) {
186 	    if (bom == BOM16LE) {
187 		endian = 'v';
188 	    }
189 	    else if (bom == BOM32LE) {
190 		endian = 'V';
191 	    }
192 	    else {
193                /* No BOM found, use big-endian fallback as specified in
194                 * RFC2781 and the Unicode Standard version 8.0:
195                 *
196                 *  The UTF-16 encoding scheme may or may not begin with
197                 *  a BOM. However, when there is no BOM, and in the
198                 *  absence of a higher-level protocol, the byte order
199                 *  of the UTF-16 encoding scheme is big-endian.
200                 *
201                 *  If the first two octets of the text is not 0xFE
202                 *  followed by 0xFF, and is not 0xFF followed by 0xFE,
203                 *  then the text SHOULD be interpreted as big-endian.
204                 */
205                 s -= size;
206 	    }
207 	}
208 #if 1
209 	/* Update endian for next sequence */
210 	sv = attr("renewed");
211 	if (SvTRUE(sv)) {
212 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
213 	}
214 #endif
215     }
216 
217     if (temp_result) {
218 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
219     } else {
220 	/* Preallocate the buffer to the minimum possible space required. */
221 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
222     }
223     resultbuf = (U8 *) SvGROW(result, resultbuflen);
224 
225     while (s < e && s+size <= e) {
226 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
227 	U8 *d;
228 	HV *hv = NULL;
229 	if (issurrogate(ord)) {
230 	    if (ucs2 == -1) {
231 		SV *sv = attr("ucs2");
232 		ucs2 = SvTRUE(sv);
233 	    }
234 	    if (ucs2 || size == 4) {
235 		if (check & ENCODE_DIE_ON_ERR) {
236 		    croak("%" SVf ":no surrogates allowed %" UVxf,
237 			  SVfARG(name), ord);
238 		}
239 		if (encode_ckWARN(check, WARN_SURROGATE)) {
240 		    warner(packWARN(WARN_SURROGATE),
241 			  "%" SVf ":no surrogates allowed %" UVxf,
242 			  SVfARG(name), ord);
243 		}
244 		ord = FBCHAR;
245 	    }
246 	    else {
247 		UV lo;
248 		if (!isHiSurrogate(ord)) {
249 		    if (check & ENCODE_DIE_ON_ERR) {
250 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
251 			      SVfARG(name), ord);
252 		    }
253 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
254 			warner(packWARN(WARN_SURROGATE),
255 			      "%" SVf ":Malformed HI surrogate %" UVxf,
256 			      SVfARG(name), ord);
257 		    }
258 		    ord = FBCHAR;
259 		}
260 		else if (s+size > e) {
261 		    if (check & ENCODE_STOP_AT_PARTIAL) {
262 		        s -= size;
263 		        break;
264 		    }
265 		    if (check & ENCODE_DIE_ON_ERR) {
266 			croak("%" SVf ":Malformed HI surrogate %" UVxf,
267 			      SVfARG(name), ord);
268 		    }
269 		    if (encode_ckWARN(check, WARN_SURROGATE)) {
270 			warner(packWARN(WARN_SURROGATE),
271 			      "%" SVf ":Malformed HI surrogate %" UVxf,
272 			      SVfARG(name), ord);
273 		    }
274 		    ord = FBCHAR;
275 		}
276 		else {
277 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
278 		    if (!isLoSurrogate(lo)) {
279 			if (check & ENCODE_DIE_ON_ERR) {
280 			    croak("%" SVf ":Malformed LO surrogate %" UVxf,
281 				  SVfARG(name), ord);
282 			}
283 			if (encode_ckWARN(check, WARN_SURROGATE)) {
284 			    warner(packWARN(WARN_SURROGATE),
285 				  "%" SVf ":Malformed LO surrogate %" UVxf,
286 				  SVfARG(name), ord);
287 			}
288 			s -= size;
289 			ord = FBCHAR;
290 		    }
291 		    else {
292 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
293 		    }
294 		}
295 	    }
296 	}
297 
298 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
299 	    if (check & ENCODE_DIE_ON_ERR) {
300 		croak("%" SVf ":Unicode character %" UVxf " is illegal",
301 		      SVfARG(name), ord);
302 	    }
303 	    if (encode_ckWARN(check, WARN_NONCHAR)) {
304 	        warner(packWARN(WARN_NONCHAR),
305 		      "%" SVf ":Unicode character %" UVxf " is illegal",
306 		      SVfARG(name), ord);
307 	    }
308 	    ord = FBCHAR;
309 	}
310 
311 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
312 	    /* Do not allocate >8Mb more than the minimum needed.
313 	       This prevents allocating too much in the rogue case of a large
314 	       input consisting initially of long sequence uft8-byte unicode
315 	       chars followed by single utf8-byte chars. */
316             /* +1
317                fixes  Unicode.xs!decode_xs n-byte heap-overflow
318               */
319 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
320 	    STRLEN max_alloc = remaining + (8*1024*1024);
321 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
322 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
323 		(est_alloc > max_alloc ? max_alloc : est_alloc);
324 	    resultbuf = (U8 *) SvGROW(result, newlen);
325 	    resultbuflen = SvLEN(result);
326 	}
327 
328         d = uvchr_to_utf8_flags_msgs(resultbuf+SvCUR(result), ord, UNICODE_DISALLOW_ILLEGAL_INTERCHANGE | UNICODE_WARN_ILLEGAL_INTERCHANGE, &hv);
329         if (hv) {
330             SV *message = *hv_fetch(hv, "text", 4, 0);
331             U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
332             sv_2mortal((SV *)hv);
333             if (check & ENCODE_DIE_ON_ERR)
334                 croak("%" SVf, SVfARG(message));
335             if (encode_ckWARN_packed(check, categories))
336                 warner(categories, "%" SVf, SVfARG(message));
337             d = uvchr_to_utf8_flags(resultbuf+SvCUR(result), FBCHAR, 0);
338         }
339 
340 	SvCUR_set(result, d - (U8 *)SvPVX(result));
341     }
342 
343     if (s < e) {
344 	/* unlikely to happen because it's fixed-length -- dankogai */
345         if (check & ENCODE_DIE_ON_ERR)
346             croak("%" SVf ":Partial character", SVfARG(name));
347         if (encode_ckWARN(check, WARN_UTF8)) {
348             warner(packWARN(WARN_UTF8),"%" SVf ":Partial character", SVfARG(name));
349 	}
350     }
351     if (check && !(check & ENCODE_LEAVE_SRC)) {
352 	if (s < e) {
353 	    Move(s,SvPVX(str),e-s,U8);
354 	    SvCUR_set(str,(e-s));
355 	}
356 	else {
357 	    SvCUR_set(str,0);
358 	}
359 	*SvEND(str) = '\0';
360 	SvSETMAGIC(str);
361     }
362 
363     if (!temp_result) shrink_buffer(result);
364 
365     /* Make sure we have a trailing NUL: */
366     *SvEND(result) = '\0';
367 
368     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
369     XSRETURN(1);
370 }
371 
372 void
373 encode(obj, utf8, check = 0)
374 SV *	obj
375 SV *	utf8
376 IV	check
377 CODE:
378 {
379     SV *name = attr("Name");
380     SV *sve = attr("endian");
381     U8 endian = *((U8 *)SvPV_nolen(sve));
382     SV *svs = attr("size");
383     const int size = SvIV(svs);
384     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
385     const STRLEN usize = (size > 0 ? size : 1);
386     SV *result = newSVpvn("", 0);
387     STRLEN ulen;
388     U8 *s;
389     U8 *e;
390     bool modify = (check && !(check & ENCODE_LEAVE_SRC));
391     bool temp_result;
392 
393     SvGETMAGIC(utf8);
394     if (!SvOK(utf8))
395         XSRETURN_UNDEF;
396     s = modify ? (U8 *)SvPV_force_nomg(utf8, ulen) : (U8 *)SvPV_nomg(utf8, ulen);
397     if (!SvUTF8(utf8)) {
398         if (!modify) {
399             SV *tmp = sv_2mortal(newSVpvn((char *)s, ulen));
400             if (SvTAINTED(utf8))
401                 SvTAINTED_on(tmp);
402             utf8 = tmp;
403         }
404         sv_utf8_upgrade_nomg(utf8);
405         s = (U8 *)SvPV_nomg(utf8, ulen);
406     }
407     e = s+ulen;
408 
409     /* Optimise for the common case of being called from PerlIOEncode_flush()
410        with a standard length buffer. In this case the result SV's buffer is
411        only used temporarily, so we can afford to allocate the maximum needed
412        and not care about unused space. */
413     temp_result = (ulen == PERLIO_BUFSIZ);
414 
415     ST(0) = sv_2mortal(result);
416 
417     /* Preallocate the result buffer to the maximum possible size.
418        ie. assume each UTF8 byte is 1 character.
419        Then shrink the result's buffer if necesary at the end. */
420     SvGROW(result, ((ulen+1) * usize));
421 
422     if (!endian) {
423 	SV *sv;
424 	endian = (size == 4) ? 'N' : 'n';
425 	enc_pack(aTHX_ result,size,endian,BOM_BE);
426 #if 1
427 	/* Update endian for next sequence */
428 	sv = attr("renewed");
429 	if (SvTRUE(sv)) {
430 	    (void)hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
431 	}
432 #endif
433     }
434     while (s < e && s+UTF8SKIP(s) <= e) {
435         STRLEN len;
436         AV *msgs = NULL;
437         UV ord = utf8n_to_uvchr_msgs(s, e-s, &len, UTF8_DISALLOW_ILLEGAL_INTERCHANGE | UTF8_WARN_ILLEGAL_INTERCHANGE, NULL, &msgs);
438         if (msgs) {
439             SSize_t i;
440             SSize_t len = av_len(msgs)+1;
441             sv_2mortal((SV *)msgs);
442             for (i = 0; i < len; ++i) {
443                 SV *sv = *av_fetch(msgs, i, 0);
444                 HV *hv = (HV *)SvRV(sv);
445                 SV *message = *hv_fetch(hv, "text", 4, 0);
446                 U32 categories = SvUVx(*hv_fetch(hv, "warn_categories", 15, 0));
447                 if (check & ENCODE_DIE_ON_ERR)
448                     croak("%" SVf, SVfARG(message));
449                 if (encode_ckWARN_packed(check, categories))
450                     warner(categories, "%" SVf, SVfARG(message));
451             }
452         }
453 	if ((size != 4 && invalid_ucs2(ord)) || (ord == 0 && *s != 0)) {
454 	    if (!issurrogate(ord)) {
455 		if (ucs2 == -1) {
456 		    SV *sv = attr("ucs2");
457 		    ucs2 = SvTRUE(sv);
458 		}
459 		if (ucs2 || ord > 0x10FFFF) {
460 		    if (check & ENCODE_DIE_ON_ERR) {
461 			croak("%" SVf ":code point \"\\x{%" UVxf "}\" too high",
462 				  SVfARG(name),ord);
463 		    }
464 		    if (encode_ckWARN(check, WARN_NON_UNICODE)) {
465 			warner(packWARN(WARN_NON_UNICODE),
466 				  "%" SVf ":code point \"\\x{%" UVxf "}\" too high",
467 				  SVfARG(name),ord);
468 		    }
469 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
470 		} else if (ord == 0) {
471 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
472 		} else {
473 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
474 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
475 		    enc_pack(aTHX_ result,size,endian,hi);
476 		    enc_pack(aTHX_ result,size,endian,lo);
477 		}
478 	    }
479 	    else {
480 		/* not supposed to happen */
481 		enc_pack(aTHX_ result,size,endian,FBCHAR);
482 	    }
483 	}
484 	else {
485 	    enc_pack(aTHX_ result,size,endian,ord);
486 	}
487 	s += len;
488     }
489     if (s < e) {
490 	/* UTF-8 partial char happens often on PerlIO.
491 	   Since this is okay and normal, we do not warn.
492 	   But this is critical when you choose to LEAVE_SRC
493 	   in which case we die */
494 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
495 	    Perl_croak(aTHX_ "%" SVf ":partial character is not allowed "
496 		       "when CHECK = 0x%" UVuf,
497 		       SVfARG(name), check);
498 	}
499     }
500     if (check && !(check & ENCODE_LEAVE_SRC)) {
501 	if (s < e) {
502 	    Move(s,SvPVX(utf8),e-s,U8);
503 	    SvCUR_set(utf8,(e-s));
504 	}
505 	else {
506 	    SvCUR_set(utf8,0);
507 	}
508 	*SvEND(utf8) = '\0';
509 	SvSETMAGIC(utf8);
510     }
511 
512     if (!temp_result) shrink_buffer(result);
513     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
514 
515     XSRETURN(1);
516 }
517