1 #include "EXTERN.h"
2 #include "perl.h"
3 #include "XSUB.h"
4 #include "perliol.h"
5 #include "ppport.h"
6 
7 #define UTF8_MAX_BYTES 4
8 
9 static const U8 xs_utf8_sequence_len[0x100] = {
10     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x00-0x0F */
11     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x10-0x1F */
12     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x20-0x2F */
13     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x30-0x3F */
14     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x40-0x4F */
15     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x50-0x5F */
16     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x60-0x6F */
17     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 0x70-0x7F */
18     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x80-0x8F */
19     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0x90-0x9F */
20     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xA0-0xAF */
21     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 0xB0-0xBF */
22     0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xC0-0xCF */
23     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* 0xD0-0xDF */
24     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* 0xE0-0xEF */
25     4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, /* 0xF0-0xFF */
26 };
27 
28 
29 typedef enum { STRICT_UTF8=0, ALLOW_SURROGATES=1, ALLOW_NONCHARACTERS=2, ALLOW_NONSHORTEST=4 } utf8_flags;
30 
31 
skip_sequence(const U8 * cur,const STRLEN len)32 static STRLEN skip_sequence(const U8 *cur, const STRLEN len) {
33 	STRLEN i, n = xs_utf8_sequence_len[*cur];
34 
35 	if (n < 1 || len < 2)
36 		return 1;
37 
38 	switch (cur[0]) {
39 		case 0xE0: if ((cur[1] & 0xE0) != 0xA0) return 1; break;
40 		case 0xED: if ((cur[1] & 0xE0) != 0x80) return 1; break;
41 		case 0xF4: if ((cur[1] & 0xF0) != 0x80) return 1; break;
42 		case 0xF0: if ((cur[1] & 0xF0) == 0x80) return 1; /* FALLTROUGH */
43 		default:   if ((cur[1] & 0xC0) != 0x80) return 1; break;
44 	}
45 
46 	if (n > len)
47 		n = len;
48 	for (i = 2; i < n; i++)
49 		if ((cur[i] & 0xC0) != 0x80)
50 			break;
51 	return i;
52 }
53 
report_illformed(pTHX_ const U8 * cur,STRLEN len,bool eof)54 static void report_illformed(pTHX_ const U8 *cur, STRLEN len, bool eof) {
55 	static const char *hex = "0123456789ABCDEF";
56 	const char *fmt;
57 	char seq[UTF8_MAX_BYTES * 3];
58 	char *d = seq;
59 
60 	if (eof)
61 		fmt = "Can't decode ill-formed UTF-8 octet sequence <%s> at end of file";
62 	else
63 		fmt = "Can't decode ill-formed UTF-8 octet sequence <%s>";
64 
65 	while (len-- > 0) {
66 		const U8 c = *cur++;
67 		*d++ = hex[c >> 4];
68 		*d++ = hex[c & 15];
69 		if (len)
70 			*d++ = ' ';
71 	}
72 	*d = 0;
73 	Perl_croak(aTHX_ fmt, seq);
74 }
75 
report_noncharacter(pTHX_ UV usv)76 static void report_noncharacter(pTHX_ UV usv) {
77 	static const char *fmt = "Can't interchange noncharacter code point U+%"UVXf;
78 	Perl_croak(aTHX_ fmt, usv);
79 }
80 
validate(pTHX_ const U8 * buf,const U8 * end,const int flags,PerlIO * handle)81 static STRLEN validate(pTHX_ const U8 *buf, const U8 *end, const int flags, PerlIO* handle) {
82 	const bool eof = PerlIO_eof(handle);
83 	const U8 *cur = buf;
84 	const U8 *end4 = end - UTF8_MAX_BYTES;
85 	STRLEN skip = 0;
86 	U32 v;
87 
88 	while (cur < end4) {
89 		while (cur < end4 && *cur < 0x80)
90 			cur++;
91 
92 	  check:
93 		switch (xs_utf8_sequence_len[*cur]) {
94 			case 0:
95 				goto illformed;
96 			case 1:
97 				cur += 1;
98 				break;
99 			case 2:
100 				/* 110xxxxx 10xxxxxx */
101 				if ((cur[1] & 0xC0) != 0x80)
102 					goto illformed;
103 				cur += 2;
104 				break;
105 			case 3:
106 				v = ((U32)cur[0] << 16)
107 				  | ((U32)cur[1] <<  8)
108 				  | ((U32)cur[2]);
109 				/* 1110xxxx 10xxxxxx 10xxxxxx */
110 				if ((v & 0x00F0C0C0) != 0x00E08080 ||
111 					/* Non-shortest form */
112 					v < 0x00E0A080)
113 					goto illformed;
114 				/* Surrogates U+D800..U+DFFF */
115 				if (!(flags & ALLOW_SURROGATES) && (v & 0x00EFA080) == 0x00EDA080)
116 					goto illformed;
117 				/* Non-characters U+FDD0..U+FDEF, U+FFFE..U+FFFF */
118 				if (!(flags & ALLOW_NONCHARACTERS) && v >= 0x00EFB790 && (v <= 0x00EFB7AF || v >= 0x00EFBFBE))
119 					goto noncharacter;
120 				cur += 3;
121 				break;
122 			case 4:
123 				v = ((U32)cur[0] << 24)
124 				  | ((U32)cur[1] << 16)
125 				  | ((U32)cur[2] <<  8)
126 				  | ((U32)cur[3]);
127 				/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
128 				if ((v & 0xF8C0C0C0) != 0xF0808080 ||
129 					/* Non-shortest form */
130 					v < 0xF0908080 ||
131 					/* Greater than U+10FFFF */
132 					v > 0xF48FBFBF)
133 					goto illformed;
134 				/* Non-characters U+nFFFE..U+nFFFF on plane 1-16 */
135 				if (!(flags & ALLOW_NONCHARACTERS) && (v & 0x000FBFBE) == 0x000FBFBE)
136 					goto noncharacter;
137 				cur += 4;
138 				break;
139 		}
140 	}
141 
142 	if (cur < end) {
143 		if (cur + xs_utf8_sequence_len[*cur] <= end)
144 			goto check;
145 		skip = skip_sequence(cur, end - cur);
146 		if (eof || cur + skip < end)
147 			goto illformed;
148 	}
149 	return cur - buf;
150 
151   illformed:
152 	if (!skip)
153 		skip = skip_sequence(cur, end - cur);
154 	PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
155 	report_illformed(aTHX_ cur, skip, eof);
156 
157   noncharacter:
158 	if (v < 0xF0808080)
159 		v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x0F0000) >> 4;
160 	else
161 		v = (v & 0x3F) | (v & 0x3F00) >> 2 | (v & 0x3F0000) >> 4 | (v & 0x07000000) >> 6;
162 	PerlIOBase(handle)->flags |= PERLIO_F_ERROR;
163 	report_noncharacter(aTHX_ v);
164 }
165 
166 typedef struct {
167 	PerlIOBuf buf;
168 	STDCHAR leftovers[UTF8_MAX_BYTES];
169 	size_t leftover_length;
170 	utf8_flags flags;
171 } PerlIOUnicode;
172 
173 static struct {
174 	const char* name;
175 	size_t length;
176 	utf8_flags value;
177 } map[] = {
178 	{ STR_WITH_LEN("allow_surrogates"), ALLOW_SURROGATES },
179 	{ STR_WITH_LEN("allow_noncharacters"), ALLOW_NONCHARACTERS },
180 	{ STR_WITH_LEN("allow_nonshortest"), ALLOW_NONSHORTEST },
181 	{ STR_WITH_LEN("strict"), 0 },
182 	{ STR_WITH_LEN("loose"), ALLOW_SURROGATES | ALLOW_NONCHARACTERS | ALLOW_NONSHORTEST },
183 };
184 
lookup_parameter(pTHX_ const char * ptr,size_t len)185 static utf8_flags lookup_parameter(pTHX_ const char* ptr, size_t len) {
186 	unsigned i;
187 	for (i = 0; i < sizeof map / sizeof *map; ++i) {
188 		if (map[i].length == len && memcmp(ptr, map[i].name, len) == 0)
189 			return map[i].value;
190 	}
191 	Perl_croak(aTHX_ "Unknown argument to :utf8_strict: %*s", (int)len, ptr);
192 }
parse_parameters(pTHX_ SV * param)193 static utf8_flags parse_parameters(pTHX_ SV* param) {
194 	STRLEN len;
195 	const char *begin, *delim;
196 	if (!param || !SvOK(param))
197 		return 0;
198 
199 	begin = SvPV(param, len);
200 	delim = strchr(begin, ',');
201 	if(delim) {
202 		utf8_flags ret = 0;
203 		const char* end = begin + len;
204 		do {
205 			ret |= lookup_parameter(aTHX_ begin, delim - begin);
206 			begin = delim + 1;
207 			delim = strchr(begin, ',');
208 		} while (delim);
209 		if (begin < end)
210 			ret |= lookup_parameter(aTHX_ begin, end - begin);
211 		return ret;
212 	}
213 	else {
214 		return lookup_parameter(aTHX_ begin, len);
215 	}
216 }
217 
218 #define line_buffered(flags) ((flags & (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE)) == (PERLIO_F_LINEBUF | PERLIO_F_CANWRITE))
219 
PerlIOBase_flush_linebuf(pTHX)220 void PerlIOBase_flush_linebuf(pTHX) {
221 #ifdef dVAR
222 	dVAR;
223 #endif
224 	PerlIOl **table = &PL_perlio;
225 	PerlIOl *f;
226 	while ((f = *table)) {
227 		int i;
228 		table = (PerlIOl **) (f++);
229 		for (i = 1; i < 64; i++) {
230 			if (f->next && line_buffered(PerlIOBase(&(f->next))->flags))
231 				PerlIO_flush(&(f->next));
232 			f++;
233 		}
234 	}
235 }
236 
PerlIOUnicode_pushed(pTHX_ PerlIO * f,const char * mode,SV * arg,PerlIO_funcs * tab)237 static IV PerlIOUnicode_pushed(pTHX_ PerlIO* f, const char* mode, SV* arg, PerlIO_funcs* tab) {
238 	utf8_flags flags = parse_parameters(aTHX_ arg);
239 	if (PerlIOBuf_pushed(aTHX_ f, mode, arg, tab) == 0) {
240 		PerlIOBase(f)->flags |= PERLIO_F_UTF8;
241 		PerlIOSelf(f, PerlIOUnicode)->flags = flags;
242 		return 0;
243 	}
244 	return -1;
245 }
246 
PerlIOUnicode_fill(pTHX_ PerlIO * f)247 static IV PerlIOUnicode_fill(pTHX_ PerlIO* f) {
248 	PerlIOUnicode * const u = PerlIOSelf(f, PerlIOUnicode);
249 	PerlIOBuf * const b = &u->buf;
250 	PerlIO *n = PerlIONext(f);
251 	SSize_t avail;
252 	Size_t read_bytes = 0;
253 	STDCHAR *end;
254 	SSize_t fit;
255 
256 	if (PerlIO_flush(f) != 0)
257 		return -1;
258 	if (PerlIOBase(f)->flags & PERLIO_F_TTY)
259 		PerlIOBase_flush_linebuf(aTHX);
260 
261 	if (!b->buf)
262 		PerlIO_get_base(f);
263 
264 	assert(b->buf);
265 
266 	if (u->leftover_length) {
267 		Copy(u->leftovers, b->buf, u->leftover_length, STDCHAR);
268 		b->end = b->buf + u->leftover_length;
269 		read_bytes = u->leftover_length;
270 		u->leftover_length = 0;
271 	}
272 	else {
273 		b->ptr = b->end = b->buf;
274 	}
275 	fit = (SSize_t)b->bufsiz - (b->end - b->buf);
276 
277 	if (!PerlIOValid(n)) {
278 		PerlIOBase(f)->flags |= PERLIO_F_EOF;
279 		return -1;
280 	}
281 
282 	if (PerlIO_fast_gets(n)) {
283 		/*
284 		 * Layer below is also buffered. We do _NOT_ want to call its
285 		 * ->Read() because that will loop till it gets what we asked for
286 		 * which may hang on a pipe etc. Instead take anything it has to
287 		 * hand, or ask it to fill _once_.
288 		 */
289 		avail = PerlIO_get_cnt(n);
290 		if (avail <= 0) {
291 			avail = PerlIO_fill(n);
292 			if (avail == 0)
293 				avail = PerlIO_get_cnt(n);
294 			else {
295 				if (!PerlIO_error(n) && PerlIO_eof(n))
296 					avail = 0;
297 			}
298 		}
299 		if (avail > 0) {
300 			STDCHAR *ptr = PerlIO_get_ptr(n);
301 			const SSize_t cnt = avail;
302 			if (avail > fit)
303 				avail = fit;
304 			Copy(ptr, b->end, avail, STDCHAR);
305 			PerlIO_set_ptrcnt(n, ptr + avail, cnt - avail);
306 			read_bytes += avail;
307 		}
308 	}
309 	else {
310 		avail = PerlIO_read(n, b->end, fit);
311 		if (avail > 0)
312 			read_bytes += avail;
313 	}
314 	if (avail <= 0) {
315 		if (avail < 0 || (read_bytes == 0 && PerlIO_eof(n))) {
316 			PerlIOBase(f)->flags |= (avail == 0) ? PERLIO_F_EOF : PERLIO_F_ERROR;
317 			return -1;
318 		}
319 	}
320 	end = b->buf + read_bytes;
321 	b->end = b->buf + validate(aTHX_ (const U8 *)b->buf, (const U8 *)end, u->flags, n);
322 	if (b->end < end) {
323 		size_t len = b->buf + read_bytes - b->end;
324 		Copy(b->end, u->leftovers, len, char);
325 		u->leftover_length = len;
326 	}
327 	PerlIOBase(f)->flags |= PERLIO_F_RDBUF;
328 
329 	return 0;
330 }
331 
332 PERLIO_FUNCS_DECL(PerlIO_utf8_strict) = {
333 	sizeof(PerlIO_funcs),
334 	"utf8_strict",
335 	sizeof(PerlIOUnicode),
336 	PERLIO_K_BUFFERED|PERLIO_K_UTF8,
337 	PerlIOUnicode_pushed,
338 	PerlIOBuf_popped,
339 	PerlIOBuf_open,
340 	PerlIOBase_binmode,
341 	NULL,
342 	PerlIOBase_fileno,
343 	PerlIOBuf_dup,
344 	PerlIOBuf_read,
345 	PerlIOBase_unread,
346 	PerlIOBuf_write,
347 	PerlIOBuf_seek,
348 	PerlIOBuf_tell,
349 	PerlIOBuf_close,
350 	PerlIOBuf_flush,
351 	PerlIOUnicode_fill,
352 	PerlIOBase_eof,
353 	PerlIOBase_error,
354 	PerlIOBase_clearerr,
355 	PerlIOBase_setlinebuf,
356 	PerlIOBuf_get_base,
357 	PerlIOBuf_bufsiz,
358 	PerlIOBuf_get_ptr,
359 	PerlIOBuf_get_cnt,
360 	PerlIOBuf_set_ptrcnt,
361 };
362 
363 MODULE = PerlIO::utf8_strict
364 
365 PROTOTYPES: DISABLE
366 
367 BOOT:
368 	PerlIO_define_layer(aTHX_ (PerlIO_funcs*)&PerlIO_utf8_strict);
369 
370