1 /*
2  $Id: Unicode.xs,v 2.11 2014/04/29 16:25:06 dankogai Exp dankogai $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #include "../Encode/encode.h"
10 
11 #define FBCHAR			0xFFFd
12 #define BOM_BE			0xFeFF
13 #define BOM16LE			0xFFFe
14 #define BOM32LE			0xFFFe0000
15 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
16 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
17 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
18 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
19 
20 /* For pre-5.14 source compatibility */
21 #ifndef UNICODE_WARN_ILLEGAL_INTERCHANGE
22 #   define UNICODE_WARN_ILLEGAL_INTERCHANGE 0
23 #   define UTF8_DISALLOW_SURROGATE 0
24 #   define UTF8_WARN_SURROGATE 0
25 #   define UTF8_DISALLOW_FE_FF 0
26 #   define UTF8_WARN_FE_FF 0
27 #   define UTF8_WARN_NONCHAR 0
28 #endif
29 
30 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
31 
32 /* Avoid wasting too much space in the result buffer */
33 /* static void */
34 /* shrink_buffer(SV *result) */
35 /* { */
36 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
37 /* 	char *buf; */
38 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
39 /* 	New(0, buf, len, char); */
40 /* 	Copy(SvPVX(result), buf, len, char); */
41 /* 	Safefree(SvPVX(result)); */
42 /* 	SvPV_set(result, buf); */
43 /* 	SvLEN_set(result, len); */
44 /*     } */
45 /* } */
46 
47 #define shrink_buffer(result) { \
48     if (SvLEN(result) > 42 + SvCUR(result)) { \
49 	char *newpv; \
50 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
51 	New(0, newpv, newlen, char); \
52 	Copy(SvPVX(result), newpv, newlen, char); \
53 	Safefree(SvPVX(result)); \
54 	SvPV_set(result, newpv); \
55 	SvLEN_set(result, newlen); \
56     } \
57 }
58 
59 static UV
60 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
61 {
62     U8 *s = *sp;
63     UV v = 0;
64     if (s+size > e) {
65 	croak("Partial character %c",(char) endian);
66     }
67     switch(endian) {
68     case 'N':
69 	v = *s++;
70 	v = (v << 8) | *s++;
71     case 'n':
72 	v = (v << 8) | *s++;
73 	v = (v << 8) | *s++;
74 	break;
75     case 'V':
76     case 'v':
77 	v |= *s++;
78 	v |= (*s++ << 8);
79 	if (endian == 'v')
80 	    break;
81 	v |= (*s++ << 16);
82 	v |= ((UV)*s++ << 24);
83 	break;
84     default:
85 	croak("Unknown endian %c",(char) endian);
86 	break;
87     }
88     *sp = s;
89     return v;
90 }
91 
92 void
93 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
94 {
95     U8 *d = (U8 *) SvPV_nolen(result);
96 
97     switch(endian) {
98     case 'v':
99     case 'V':
100 	d += SvCUR(result);
101 	SvCUR_set(result,SvCUR(result)+size);
102 	while (size--) {
103 	    *d++ = (U8)(value & 0xFF);
104 	    value >>= 8;
105 	}
106 	break;
107     case 'n':
108     case 'N':
109 	SvCUR_set(result,SvCUR(result)+size);
110 	d += SvCUR(result);
111 	while (size--) {
112 	    *--d = (U8)(value & 0xFF);
113 	    value >>= 8;
114 	}
115 	break;
116     default:
117 	croak("Unknown endian %c",(char) endian);
118 	break;
119     }
120 }
121 
122 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
123 
124 PROTOTYPES: DISABLE
125 
126 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
127     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
128 
129 void
130 decode_xs(obj, str, check = 0)
131 SV *	obj
132 SV *	str
133 IV	check
134 CODE:
135 {
136     U8 endian    = *((U8 *)SvPV_nolen(attr("endian", 6)));
137     int size     = SvIV(attr("size", 4));
138     int ucs2     = -1; /* only needed in the event of surrogate pairs */
139     SV *result   = newSVpvn("",0);
140     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
141     STRLEN ulen;
142     STRLEN resultbuflen;
143     U8 *resultbuf;
144     U8 *s = (U8 *)SvPVbyte(str,ulen);
145     U8 *e = (U8 *)SvEND(str);
146     /* Optimise for the common case of being called from PerlIOEncode_fill()
147        with a standard length buffer. In this case the result SV's buffer is
148        only used temporarily, so we can afford to allocate the maximum needed
149        and not care about unused space. */
150     const bool temp_result = (ulen == PERLIO_BUFSIZ);
151 
152     ST(0) = sv_2mortal(result);
153     SvUTF8_on(result);
154 
155     if (!endian && s+size <= e) {
156 	UV bom;
157 	endian = (size == 4) ? 'N' : 'n';
158 	bom = enc_unpack(aTHX_ &s,e,size,endian);
159 	if (bom != BOM_BE) {
160 	    if (bom == BOM16LE) {
161 		endian = 'v';
162 	    }
163 	    else if (bom == BOM32LE) {
164 		endian = 'V';
165 	    }
166 	    else {
167 		croak("%"SVf":Unrecognised BOM %"UVxf,
168 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
169 		      bom);
170 	    }
171 	}
172 #if 1
173 	/* Update endian for next sequence */
174 	if (SvTRUE(attr("renewed", 7))) {
175 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
176 	}
177 #endif
178     }
179 
180     if (temp_result) {
181 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
182     } else {
183 	/* Preallocate the buffer to the minimum possible space required. */
184 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
185     }
186     resultbuf = (U8 *) SvGROW(result, resultbuflen);
187 
188     while (s < e && s+size <= e) {
189 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
190 	U8 *d;
191 	if (issurrogate(ord)) {
192 	    if (ucs2 == -1) {
193 		ucs2 = SvTRUE(attr("ucs2", 4));
194 	    }
195 	    if (ucs2 || size == 4) {
196 		if (check) {
197 		    croak("%"SVf":no surrogates allowed %"UVxf,
198 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
199 			  ord);
200 		}
201 		ord = FBCHAR;
202 	    }
203 	    else {
204 		UV lo;
205 		if (!isHiSurrogate(ord)) {
206 		    if (check) {
207 			croak("%"SVf":Malformed HI surrogate %"UVxf,
208 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
209 			      ord);
210 		    }
211 		    else {
212 			ord = FBCHAR;
213 		    }
214 		}
215 		else if (s+size > e) {
216 		    if (check) {
217 		        if (check & ENCODE_STOP_AT_PARTIAL) {
218 		             s -= size;
219 		             break;
220 		        }
221 		        else {
222 		             croak("%"SVf":Malformed HI surrogate %"UVxf,
223 				   *hv_fetch((HV *)SvRV(obj),"Name",4,0),
224 				   ord);
225 		        }
226 		    }
227 		    else {
228 		        ord = FBCHAR;
229 		    }
230 		}
231 		else {
232 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
233 		    if (!isLoSurrogate(lo)) {
234 			if (check) {
235 			    croak("%"SVf":Malformed LO surrogate %"UVxf,
236 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
237 				  ord);
238 			}
239 			else {
240 			    s -= size;
241 			    ord = FBCHAR;
242 			}
243 		    }
244 		    else {
245 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
246 		    }
247 		}
248 	    }
249 	}
250 
251 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
252 	    if (check) {
253 		croak("%"SVf":Unicode character %"UVxf" is illegal",
254 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
255 		      ord);
256 	    } else {
257 		ord = FBCHAR;
258 	    }
259 	}
260 
261 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
262 	    /* Do not allocate >8Mb more than the minimum needed.
263 	       This prevents allocating too much in the rogue case of a large
264 	       input consisting initially of long sequence uft8-byte unicode
265 	       chars followed by single utf8-byte chars. */
266             /* +1
267                fixes  Unicode.xs!decode_xs n-byte heap-overflow
268               */
269 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
270 	    STRLEN max_alloc = remaining + (8*1024*1024);
271 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
272 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
273 		(est_alloc > max_alloc ? max_alloc : est_alloc);
274 	    resultbuf = (U8 *) SvGROW(result, newlen);
275 	    resultbuflen = SvLEN(result);
276 	}
277 
278 	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord,
279                                             UNICODE_WARN_ILLEGAL_INTERCHANGE);
280 	SvCUR_set(result, d - (U8 *)SvPVX(result));
281     }
282 
283     if (s < e) {
284 	/* unlikely to happen because it's fixed-length -- dankogai */
285 	if (check & ENCODE_WARN_ON_ERR) {
286 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
287 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
288 	}
289     }
290     if (check && !(check & ENCODE_LEAVE_SRC)) {
291 	if (s < e) {
292 	    Move(s,SvPVX(str),e-s,U8);
293 	    SvCUR_set(str,(e-s));
294 	}
295 	else {
296 	    SvCUR_set(str,0);
297 	}
298 	*SvEND(str) = '\0';
299     }
300 
301     if (!temp_result) shrink_buffer(result);
302     if (SvTAINTED(str)) SvTAINTED_on(result); /* propagate taintedness */
303     XSRETURN(1);
304 }
305 
306 void
307 encode_xs(obj, utf8, check = 0)
308 SV *	obj
309 SV *	utf8
310 IV	check
311 CODE:
312 {
313     U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
314     const int size = SvIV(attr("size", 4));
315     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
316     const STRLEN usize = (size > 0 ? size : 1);
317     SV *result = newSVpvn("", 0);
318     STRLEN ulen;
319     U8 *s = (U8 *) SvPVutf8(utf8, ulen);
320     const U8 *e = (U8 *) SvEND(utf8);
321     /* Optimise for the common case of being called from PerlIOEncode_flush()
322        with a standard length buffer. In this case the result SV's buffer is
323        only used temporarily, so we can afford to allocate the maximum needed
324        and not care about unused space. */
325     const bool temp_result = (ulen == PERLIO_BUFSIZ);
326 
327     ST(0) = sv_2mortal(result);
328 
329     /* Preallocate the result buffer to the maximum possible size.
330        ie. assume each UTF8 byte is 1 character.
331        Then shrink the result's buffer if necesary at the end. */
332     SvGROW(result, ((ulen+1) * usize));
333 
334     if (!endian) {
335 	endian = (size == 4) ? 'N' : 'n';
336 	enc_pack(aTHX_ result,size,endian,BOM_BE);
337 #if 1
338 	/* Update endian for next sequence */
339 	if (SvTRUE(attr("renewed", 7))) {
340 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
341 	}
342 #endif
343     }
344     while (s < e && s+UTF8SKIP(s) <= e) {
345 	STRLEN len;
346 	UV ord = utf8n_to_uvuni(s, e-s, &len, (UTF8_DISALLOW_SURROGATE
347                                                |UTF8_WARN_SURROGATE
348                                                |UTF8_DISALLOW_FE_FF
349                                                |UTF8_WARN_FE_FF
350                                                |UTF8_WARN_NONCHAR));
351 	s += len;
352 	if (size != 4 && invalid_ucs2(ord)) {
353 	    if (!issurrogate(ord)) {
354 		if (ucs2 == -1) {
355 		    ucs2 = SvTRUE(attr("ucs2", 4));
356 		}
357 		if (ucs2 || ord > 0x10FFFF) {
358 		    if (check) {
359 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
360 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
361 		    }
362 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
363 		} else {
364 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
365 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
366 		    enc_pack(aTHX_ result,size,endian,hi);
367 		    enc_pack(aTHX_ result,size,endian,lo);
368 		}
369 	    }
370 	    else {
371 		/* not supposed to happen */
372 		enc_pack(aTHX_ result,size,endian,FBCHAR);
373 	    }
374 	}
375 	else {
376 	    enc_pack(aTHX_ result,size,endian,ord);
377 	}
378     }
379     if (s < e) {
380 	/* UTF-8 partial char happens often on PerlIO.
381 	   Since this is okay and normal, we do not warn.
382 	   But this is critical when you choose to LEAVE_SRC
383 	   in which case we die */
384 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
385 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
386 		       "when CHECK = 0x%" UVuf,
387 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
388 	}
389     }
390     if (check && !(check & ENCODE_LEAVE_SRC)) {
391 	if (s < e) {
392 	    Move(s,SvPVX(utf8),e-s,U8);
393 	    SvCUR_set(utf8,(e-s));
394 	}
395 	else {
396 	    SvCUR_set(utf8,0);
397 	}
398 	*SvEND(utf8) = '\0';
399     }
400 
401     if (!temp_result) shrink_buffer(result);
402     if (SvTAINTED(utf8)) SvTAINTED_on(result); /* propagate taintedness */
403 
404     SvSETMAGIC(utf8);
405 
406     XSRETURN(1);
407 }
408