1 #include "EXTERN.h"
2 #include "perl.h"
3 #include "XSUB.h"
4 #include "perliol.h"
5 #include "ppport.h"
6
7 #define UTF8_MAX_BYTES 4
8
9 static const U8 xs_utf8_sequence_len[0x100] = {
10 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
11 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
12 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
13 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
14 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
15 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
16 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
17 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
18 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
19 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
20 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
22 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
23 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
24 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
25 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
26 };
27
28
29 typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;
30
31
skip_sequence(const U8 * cur,const STRLEN len)32 static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
33 STRLEN i, n = xs_utf8_sequence_len[*cur];
34
35 if (n < 1 || len < 2)
36 return 1;
37
38 switch (cur[0]) {
39 case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
40 case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
41 case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
42 case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
43 default: if ((cur[1] & 0xC0) != 0x80) return 1; break;
44 }
45
46 if (n > len)
47 n = len;
48 for (i = 2; i < n; i++)
49 if ((cur[i] & 0xC0) != 0x80)
50 break;
51 return i;
52 }
53
report_illformed(pTHX_ const U8 * cur,STRLEN len,bool eof)54 static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
55 static const char *hex = "0123456789ABCDEF";
56 const char *fmt;
57 char seq[UTF8_MAX_BYTES * 3];
58 char *d = seq;
59
60 if (eof)
61 fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
62 else
63 fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";
64
65 while (len-- > 0) {
66 const U8 c = *cur++;
67 *d++ = hex[c >> 4];
68 *d++ = hex[c & 15];
69 if (len)
70 *d++ = ' ';
71 }
72 *d = 0;
73 Perl_croak(aTHX_ fmt, seq);
74 }
75
report_noncharacter(pTHX_ UV usv)76 static void report_noncharacter(pTHX_ UV usv) {
77 static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
78 Perl_croak(aTHX_ fmt, usv);
79 }
80
validate(pTHX_ const U8 * buf,const U8 * end,const int flags,PerlIO * handle)81 static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
82 const bool eof = PerlIO_eof(handle);
83 const U8 *cur = buf;
84 const U8 *end4 = end - UTF8_MAX_BYTES;
85 STRLEN skip = 0;
86 U32 v;
87
88 while (cur < end4) {
89 while (cur < end4 && *cur < 0x80)
90 cur++;
91
92 check:
93 switch (xs_utf8_sequence_len[*cur]) {
94 case 0:
95 goto illformed;
96 case 1:
97 cur += 1;
98 break;
99 case 2:
100 /* 110xxxxx 10xxxxxx */
101 if ((cur[1] & 0xC0) != 0x80)
102 goto illformed;
103 cur += 2;
104 break;
105 case 3:
106 v = ((U32)cur[0] << 16)
107 | ((U32)cur[1] << 8)
108 | ((U32)cur[2]);
109 /* 1110xxxx 10xxxxxx 10xxxxxx */
110 if ((v & 0x00F0C0C0) != 0x00E08080 ||
111 /* Non-shortest form */
112 v < 0x00E0A080)
113 goto illformed;
114 /* Surrogates U+D800..U+DFFF */
115 if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
116 goto illformed;
117 /* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
118 if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
119 goto noncharacter;
120 cur += 3;
121 break;
122 case 4:
123 v = ((U32)cur[0] << 24)
124 | ((U32)cur[1] << 16)
125 | ((U32)cur[2] << 8)
126 | ((U32)cur[3]);
127 /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
128 if ((v & 0xF8C0C0C0) != 0xF0808080 ||
129 /* Non-shortest form */
130 v < 0xF0908080 ||
131 /* Greater than U+10FFFF */
132 v > 0xF48FBFBF)
133 goto illformed;
134 /* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
135 if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
136 goto noncharacter;
137 cur += 4;
138 break;
139 }
140 }
141
142 if (cur < end) {
143 if (cur + xs_utf8_sequence_len[*cur] <= end)
144 goto check;
145 skip = skip_sequence(cur, end - cur);
146 if (eof || cur + skip < end)
147 goto illformed;
148 }
149 return cur - buf;
150
151 illformed:
152 if (!skip)
153 skip = skip_sequence(cur, end - cur);
154 PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
155 report_illformed(aTHX_ cur, skip, eof);
156
157 noncharacter:
158 if (v < 0xF0808080)
159 v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x0F0000) >> 4;
160 else
161 v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x3F0000) >> 4 | (v & 0x07000000) >> 6;
162 PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
163 report_noncharacter(aTHX_ v);
164 }
165
166 typedef struct {
167 PerlIOBuf buf;
168 STDCHAR leftovers[UTF8_MAX_BYTES];
169 size_t leftover_length;
170 utf8_flags flags;
171 } PerlIOUnicode;
172
173 static struct {
174 const char* name;
175 size_t length;
176 utf8_flags value;
177 } map[] = {
178 { STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
179 { STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
180 { STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
181 { STR_WITH_LEN("strict"), 0 },
182 { STR_WITH_LEN("loose"), ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST },
183 };
184
lookup_parameter(pTHX_ const char * ptr,size_t len)185 static utf8_flags lookup_parameter(pTHX_ const char* ptr, size_t len) {
186 unsigned i;
187 for (i = 0; i < sizeof map / sizeof *map; ++i) {
188 if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
189 return map[i].value;
190 }
191 Perl_croak(aTHX_ "Unknown argument to :utf8_strict: %*s", (int)len, ptr);
192 }
parse_parameters(pTHX_ SV * param)193 static utf8_flags parse_parameters(pTHX_ SV* param) {
194 STRLEN len;
195 const char *begin, *delim;
196 if (!param || !SvOK(param))
197 return 0;
198
199 begin = SvPV(param, len);
200 delim = strchr(begin, ',');
201 if(delim) {
202 utf8_flags ret = 0;
203 const char* end = begin + len;
204 do {
205 ret |= lookup_parameter(aTHX_ begin, delim - begin);
206 begin = delim + 1;
207 delim = strchr(begin, ',');
208 } while (delim);
209 if (begin < end)
210 ret |= lookup_parameter(aTHX_ begin, end - begin);
211 return ret;
212 }
213 else {
214 return lookup_parameter(aTHX_ begin, len);
215 }
216 }
217
218 #define line_buffered(flags) ((flags & (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE)) == (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE))
219
PerlIOBase_flush_linebuf(pTHX)220 void PerlIOBase_flush_linebuf(pTHX) {
221 #ifdef dVAR
222 dVAR;
223 #endif
224 PerlIOl **table = &PL_perlio;
225 PerlIOl *f;
226 while ((f = *table)) {
227 int i;
228 table = (PerlIOl **) (f++);
229 for (i = 1; i < 64; i++) {
230 if (f->next && line_buffered(PerlIOBase(&(f->next))->flags))
231 PerlIO_flush(&(f->next));
232 f++;
233 }
234 }
235 }
236
PerlIOUnicode_pushed(pTHX_ PerlIO * f,const char * mode,SV * arg,PerlIO_funcs * tab)237 static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
238 utf8_flags flags = parse_parameters(aTHX_ arg);
239 if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
240 PerlIOBase(f)->flags |= PERLIO_F_UTF8;
241 PerlIOSelf(f, PerlIOUnicode)->flags = flags;
242 return 0;
243 }
244 return -1;
245 }
246
PerlIOUnicode_fill(pTHX_ PerlIO * f)247 static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
248 PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
249 PerlIOBuf * const b = &u->buf;
250 PerlIO *n = PerlIONext(f);
251 SSize_t avail;
252 Size_t read_bytes = 0;
253 STDCHAR *end;
254 SSize_t fit;
255
256 if (PerlIO_flush(f) != 0)
257 return -1;
258 if (PerlIOBase(f)->flags & PERLIO_F_TTY)
259 PerlIOBase_flush_linebuf(aTHX);
260
261 if (!b->buf)
262 PerlIO_get_base(f);
263
264 assert(b->buf);
265
266 if (u->leftover_length) {
267 Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
268 b->end = b->buf + u->leftover_length;
269 read_bytes = u->leftover_length;
270 u->leftover_length = 0;
271 }
272 else {
273 b->ptr = b->end = b->buf;
274 }
275 fit = (SSize_t)b->bufsiz - (b->end - b->buf);
276
277 if (!PerlIOValid(n)) {
278 PerlIOBase(f)->flags |= PERLIO_F_EOF;
279 return -1;
280 }
281
282 if (PerlIO_fast_gets(n)) {
283 /*
284 * Layer below is also buffered. We do _NOT_ want to call its
285 * ->Read() because that will loop till it gets what we asked for
286 * which may hang on a pipe etc. Instead take anything it has to
287 * hand, or ask it to fill _once_.
288 */
289 avail = PerlIO_get_cnt(n);
290 if (avail <= 0) {
291 avail = PerlIO_fill(n);
292 if (avail == 0)
293 avail = PerlIO_get_cnt(n);
294 else {
295 if (!PerlIO_error(n) && PerlIO_eof(n))
296 avail = 0;
297 }
298 }
299 if (avail > 0) {
300 STDCHAR *ptr = PerlIO_get_ptr(n);
301 const SSize_t cnt = avail;
302 if (avail > fit)
303 avail = fit;
304 Copy(ptr, b->end, avail, STDCHAR);
305 PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
306 read_bytes += avail;
307 }
308 }
309 else {
310 avail = PerlIO_read(n, b->end, fit);
311 if (avail > 0)
312 read_bytes += avail;
313 }
314 if (avail <= 0) {
315 if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
316 PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
317 return -1;
318 }
319 }
320 end = b->buf + read_bytes;
321 b->end = b->buf + validate(aTHX_ (const U8 *)b->buf, (const U8 *)end, u->flags, n);
322 if (b->end < end) {
323 size_t len = b->buf + read_bytes - b->end;
324 Copy(b->end, u->leftovers, len, char);
325 u->leftover_length = len;
326 }
327 PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
328
329 return 0;
330 }
331
332 PERLIO_FUNCS_DECL(PerlIO_utf8_strict) = {
333 sizeof(PerlIO_funcs),
334 "utf8_strict",
335 sizeof(PerlIOUnicode),
336 PERLIO_K_BUFFERED|PERLIO_K_UTF8,
337 PerlIOUnicode_pushed,
338 PerlIOBuf_popped,
339 PerlIOBuf_open,
340 PerlIOBase_binmode,
341 NULL,
342 PerlIOBase_fileno,
343 PerlIOBuf_dup,
344 PerlIOBuf_read,
345 PerlIOBase_unread,
346 PerlIOBuf_write,
347 PerlIOBuf_seek,
348 PerlIOBuf_tell,
349 PerlIOBuf_close,
350 PerlIOBuf_flush,
351 PerlIOUnicode_fill,
352 PerlIOBase_eof,
353 PerlIOBase_error,
354 PerlIOBase_clearerr,
355 PerlIOBase_setlinebuf,
356 PerlIOBuf_get_base,
357 PerlIOBuf_bufsiz,
358 PerlIOBuf_get_ptr,
359 PerlIOBuf_get_cnt,
360 PerlIOBuf_set_ptrcnt,
361 };
362
363 MODULE = PerlIO::utf8_strict
364
365 PROTOTYPES: DISABLE
366
367 BOOT:
368 PerlIO_define_layer(aTHX_ (PerlIO_funcs*)&PerlIO_utf8_strict);
369
370