1 /*
2  $Id: Unicode.xs,v 2.6 2009/11/16 14:08:13 dankogai Exp $
3  */
4 
5 #define PERL_NO_GET_CONTEXT
6 #include "EXTERN.h"
7 #include "perl.h"
8 #include "XSUB.h"
9 #define U8 U8
10 #include "../Encode/encode.h"
11 
12 #define FBCHAR			0xFFFd
13 #define BOM_BE			0xFeFF
14 #define BOM16LE			0xFFFe
15 #define BOM32LE			0xFFFe0000
16 #define issurrogate(x)		(0xD800 <= (x)  && (x) <= 0xDFFF )
17 #define isHiSurrogate(x)	(0xD800 <= (x)  && (x) <  0xDC00 )
18 #define isLoSurrogate(x)	(0xDC00 <= (x)  && (x) <= 0xDFFF )
19 #define invalid_ucs2(x)         ( issurrogate(x) || 0xFFFF < (x) )
20 
21 #define PERLIO_BUFSIZ 1024 /* XXX value comes from PerlIOEncode_get_base */
22 
23 /* Avoid wasting too much space in the result buffer */
24 /* static void */
25 /* shrink_buffer(SV *result) */
26 /* { */
27 /*     if (SvLEN(result) > 42 + SvCUR(result)) { */
28 /* 	char *buf; */
29 /* 	STRLEN len = 1 + SvCUR(result); /\* include the NUL byte *\/ */
30 /* 	New(0, buf, len, char); */
31 /* 	Copy(SvPVX(result), buf, len, char); */
32 /* 	Safefree(SvPVX(result)); */
33 /* 	SvPV_set(result, buf); */
34 /* 	SvLEN_set(result, len); */
35 /*     } */
36 /* } */
37 
38 #define shrink_buffer(result) { \
39     if (SvLEN(result) > 42 + SvCUR(result)) { \
40 	char *newpv; \
41 	STRLEN newlen = 1 + SvCUR(result); /* include the NUL byte */ \
42 	New(0, newpv, newlen, char); \
43 	Copy(SvPVX(result), newpv, newlen, char); \
44 	Safefree(SvPVX(result)); \
45 	SvPV_set(result, newpv); \
46 	SvLEN_set(result, newlen); \
47     } \
48 }
49 
50 static UV
51 enc_unpack(pTHX_ U8 **sp, U8 *e, STRLEN size, U8 endian)
52 {
53     U8 *s = *sp;
54     UV v = 0;
55     if (s+size > e) {
56 	croak("Partial character %c",(char) endian);
57     }
58     switch(endian) {
59     case 'N':
60 	v = *s++;
61 	v = (v << 8) | *s++;
62     case 'n':
63 	v = (v << 8) | *s++;
64 	v = (v << 8) | *s++;
65 	break;
66     case 'V':
67     case 'v':
68 	v |= *s++;
69 	v |= (*s++ << 8);
70 	if (endian == 'v')
71 	    break;
72 	v |= (*s++ << 16);
73 	v |= (*s++ << 24);
74 	break;
75     default:
76 	croak("Unknown endian %c",(char) endian);
77 	break;
78     }
79     *sp = s;
80     return v;
81 }
82 
83 void
84 enc_pack(pTHX_ SV *result, STRLEN size, U8 endian, UV value)
85 {
86     U8 *d = (U8 *) SvPV_nolen(result);
87 
88     switch(endian) {
89     case 'v':
90     case 'V':
91 	d += SvCUR(result);
92 	SvCUR_set(result,SvCUR(result)+size);
93 	while (size--) {
94 	    *d++ = (U8)(value & 0xFF);
95 	    value >>= 8;
96 	}
97 	break;
98     case 'n':
99     case 'N':
100 	SvCUR_set(result,SvCUR(result)+size);
101 	d += SvCUR(result);
102 	while (size--) {
103 	    *--d = (U8)(value & 0xFF);
104 	    value >>= 8;
105 	}
106 	break;
107     default:
108 	croak("Unknown endian %c",(char) endian);
109 	break;
110     }
111 }
112 
113 MODULE = Encode::Unicode PACKAGE = Encode::Unicode
114 
115 PROTOTYPES: DISABLE
116 
117 #define attr(k, l)  (hv_exists((HV *)SvRV(obj),k,l) ? \
118     *hv_fetch((HV *)SvRV(obj),k,l,0) : &PL_sv_undef)
119 
120 void
121 decode_xs(obj, str, check = 0)
122 SV *	obj
123 SV *	str
124 IV	check
125 CODE:
126 {
127     U8 endian    = *((U8 *)SvPV_nolen(attr("endian", 6)));
128     int size     = SvIV(attr("size", 4));
129     int ucs2     = -1; /* only needed in the event of surrogate pairs */
130     SV *result   = newSVpvn("",0);
131     STRLEN usize = (size > 0 ? size : 1); /* protect against rogue size<=0 */
132     STRLEN ulen;
133     STRLEN resultbuflen;
134     U8 *resultbuf;
135     U8 *s = (U8 *)SvPVbyte(str,ulen);
136     U8 *e = (U8 *)SvEND(str);
137     /* Optimise for the common case of being called from PerlIOEncode_fill()
138        with a standard length buffer. In this case the result SV's buffer is
139        only used temporarily, so we can afford to allocate the maximum needed
140        and not care about unused space. */
141     const bool temp_result = (ulen == PERLIO_BUFSIZ);
142 
143     ST(0) = sv_2mortal(result);
144     SvUTF8_on(result);
145 
146     if (!endian && s+size <= e) {
147 	UV bom;
148 	endian = (size == 4) ? 'N' : 'n';
149 	bom = enc_unpack(aTHX_ &s,e,size,endian);
150 	if (bom != BOM_BE) {
151 	    if (bom == BOM16LE) {
152 		endian = 'v';
153 	    }
154 	    else if (bom == BOM32LE) {
155 		endian = 'V';
156 	    }
157 	    else {
158 		croak("%"SVf":Unrecognised BOM %"UVxf,
159 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
160 		      bom);
161 	    }
162 	}
163 #if 1
164 	/* Update endian for next sequence */
165 	if (SvTRUE(attr("renewed", 7))) {
166 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
167 	}
168 #endif
169     }
170 
171     if (temp_result) {
172 	resultbuflen = 1 + ulen/usize * UTF8_MAXLEN;
173     } else {
174 	/* Preallocate the buffer to the minimum possible space required. */
175 	resultbuflen = ulen/usize + UTF8_MAXLEN + 1;
176     }
177     resultbuf = (U8 *) SvGROW(result, resultbuflen);
178 
179     while (s < e && s+size <= e) {
180 	UV ord = enc_unpack(aTHX_ &s,e,size,endian);
181 	U8 *d;
182 	if (issurrogate(ord)) {
183 	    if (ucs2 == -1) {
184 		ucs2 = SvTRUE(attr("ucs2", 4));
185 	    }
186 	    if (ucs2 || size == 4) {
187 		if (check) {
188 		    croak("%"SVf":no surrogates allowed %"UVxf,
189 			  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
190 			  ord);
191 		}
192 		if (s+size <= e) {
193 		    /* skip the next one as well */
194 		    enc_unpack(aTHX_ &s,e,size,endian);
195 		}
196 		ord = FBCHAR;
197 	    }
198 	    else {
199 		UV lo;
200 		if (!isHiSurrogate(ord)) {
201 		    if (check) {
202 			croak("%"SVf":Malformed HI surrogate %"UVxf,
203 			      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
204 			      ord);
205 		    }
206 		    else {
207 			ord = FBCHAR;
208 		    }
209 		}
210 		else {
211 		    if (s+size > e) {
212 			/* Partial character */
213 			s -= size;   /* back up to 1st half */
214 			break;       /* And exit loop */
215 		    }
216 		    lo = enc_unpack(aTHX_ &s,e,size,endian);
217 		    if (!isLoSurrogate(lo)) {
218 			if (check) {
219 			    croak("%"SVf":Malformed LO surrogate %"UVxf,
220 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),
221 				  ord);
222 			}
223 			else {
224 			    ord = FBCHAR;
225 			}
226 		    }
227 		    else {
228 			ord = 0x10000 + ((ord - 0xD800) << 10) + (lo - 0xDC00);
229 		    }
230 		}
231 	    }
232 	}
233 
234 	if ((ord & 0xFFFE) == 0xFFFE || (ord >= 0xFDD0 && ord <= 0xFDEF)) {
235 	    if (check) {
236 		croak("%"SVf":Unicode character %"UVxf" is illegal",
237 		      *hv_fetch((HV *)SvRV(obj),"Name",4,0),
238 		      ord);
239 	    } else {
240 		ord = FBCHAR;
241 	    }
242 	}
243 
244 	if (resultbuflen < SvCUR(result) + UTF8_MAXLEN + 1) {
245 	    /* Do not allocate >8Mb more than the minimum needed.
246 	       This prevents allocating too much in the rogue case of a large
247 	       input consisting initially of long sequence uft8-byte unicode
248 	       chars followed by single utf8-byte chars. */
249 	    /* +1
250                fixes  Unicode.xs!decode_xs n-byte heap-overflow
251               */
252 	    STRLEN remaining = (e - s)/usize + 1; /* +1 to avoid the leak */
253 	    STRLEN max_alloc = remaining + (8*1024*1024);
254 	    STRLEN est_alloc = remaining * UTF8_MAXLEN;
255 	    STRLEN newlen = SvLEN(result) + /* min(max_alloc, est_alloc) */
256 		(est_alloc > max_alloc ? max_alloc : est_alloc);
257 	    resultbuf = (U8 *) SvGROW(result, newlen);
258 	    resultbuflen = SvLEN(result);
259 	}
260 
261 	d = uvuni_to_utf8_flags(resultbuf+SvCUR(result), ord, 0);
262 	SvCUR_set(result, d - (U8 *)SvPVX(result));
263     }
264 
265     if (s < e) {
266 	/* unlikely to happen because it's fixed-length -- dankogai */
267 	if (check & ENCODE_WARN_ON_ERR) {
268 	    Perl_warner(aTHX_ packWARN(WARN_UTF8),"%"SVf":Partial character",
269 			*hv_fetch((HV *)SvRV(obj),"Name",4,0));
270 	}
271     }
272     if (check && !(check & ENCODE_LEAVE_SRC)) {
273 	if (s < e) {
274 	    Move(s,SvPVX(str),e-s,U8);
275 	    SvCUR_set(str,(e-s));
276 	}
277 	else {
278 	    SvCUR_set(str,0);
279 	}
280 	*SvEND(str) = '\0';
281     }
282 
283     if (!temp_result)
284 	shrink_buffer(result);
285 
286     XSRETURN(1);
287 }
288 
289 void
290 encode_xs(obj, utf8, check = 0)
291 SV *	obj
292 SV *	utf8
293 IV	check
294 CODE:
295 {
296     U8 endian = *((U8 *)SvPV_nolen(attr("endian", 6)));
297     const int size = SvIV(attr("size", 4));
298     int ucs2 = -1; /* only needed if there is invalid_ucs2 input */
299     const STRLEN usize = (size > 0 ? size : 1);
300     SV *result = newSVpvn("", 0);
301     STRLEN ulen;
302     U8 *s = (U8 *) SvPVutf8(utf8, ulen);
303     const U8 *e = (U8 *) SvEND(utf8);
304     /* Optimise for the common case of being called from PerlIOEncode_flush()
305        with a standard length buffer. In this case the result SV's buffer is
306        only used temporarily, so we can afford to allocate the maximum needed
307        and not care about unused space. */
308     const bool temp_result = (ulen == PERLIO_BUFSIZ);
309 
310     ST(0) = sv_2mortal(result);
311 
312     /* Preallocate the result buffer to the maximum possible size.
313        ie. assume each UTF8 byte is 1 character.
314        Then shrink the result's buffer if necesary at the end. */
315     SvGROW(result, ((ulen+1) * usize));
316 
317     if (!endian) {
318 	endian = (size == 4) ? 'N' : 'n';
319 	enc_pack(aTHX_ result,size,endian,BOM_BE);
320 #if 1
321 	/* Update endian for next sequence */
322 	if (SvTRUE(attr("renewed", 7))) {
323 	    hv_store((HV *)SvRV(obj),"endian",6,newSVpv((char *)&endian,1),0);
324 	}
325 #endif
326     }
327     while (s < e && s+UTF8SKIP(s) <= e) {
328 	STRLEN len;
329 	UV ord = utf8n_to_uvuni(s, e-s, &len, 0);
330 	s += len;
331 	if (size != 4 && invalid_ucs2(ord)) {
332 	    if (!issurrogate(ord)) {
333 		if (ucs2 == -1) {
334 		    ucs2 = SvTRUE(attr("ucs2", 4));
335 		}
336 		if (ucs2) {
337 		    if (check) {
338 			croak("%"SVf":code point \"\\x{%"UVxf"}\" too high",
339 				  *hv_fetch((HV *)SvRV(obj),"Name",4,0),ord);
340 		    }
341 		    enc_pack(aTHX_ result,size,endian,FBCHAR);
342 		} else {
343 		    UV hi = ((ord - 0x10000) >> 10)   + 0xD800;
344 		    UV lo = ((ord - 0x10000) & 0x3FF) + 0xDC00;
345 		    enc_pack(aTHX_ result,size,endian,hi);
346 		    enc_pack(aTHX_ result,size,endian,lo);
347 		}
348 	    }
349 	    else {
350 		/* not supposed to happen */
351 		enc_pack(aTHX_ result,size,endian,FBCHAR);
352 	    }
353 	}
354 	else {
355 	    enc_pack(aTHX_ result,size,endian,ord);
356 	}
357     }
358     if (s < e) {
359 	/* UTF-8 partial char happens often on PerlIO.
360 	   Since this is okay and normal, we do not warn.
361 	   But this is critical when you choose to LEAVE_SRC
362 	   in which case we die */
363 	if (check & (ENCODE_DIE_ON_ERR|ENCODE_LEAVE_SRC)) {
364 	    Perl_croak(aTHX_ "%"SVf":partial character is not allowed "
365 		       "when CHECK = 0x%" UVuf,
366 		       *hv_fetch((HV *)SvRV(obj),"Name",4,0), check);
367 	}
368     }
369     if (check && !(check & ENCODE_LEAVE_SRC)) {
370 	if (s < e) {
371 	    Move(s,SvPVX(utf8),e-s,U8);
372 	    SvCUR_set(utf8,(e-s));
373 	}
374 	else {
375 	    SvCUR_set(utf8,0);
376 	}
377 	*SvEND(utf8) = '\0';
378     }
379 
380     if (!temp_result)
381 	shrink_buffer(result);
382 
383     SvSETMAGIC(utf8);
384 
385     XSRETURN(1);
386 }
387