1 /*
2 *	Selene Unicode/UTF-8
3 *	This additions
4 *	Copyright (c) 2005 Malete Partner, Berlin, partner@malete.org
5 *	Available under "Lua 5.0 license", see http://www.lua.org/license.html#5
6 *	$Id: slnunico.c,v 1.5 2006/07/26 17:20:04 paul Exp $
7 *
8 *	contains code from
9 ** lstrlib.c,v 1.109 2004/12/01 15:46:06 roberto Exp
10 ** Standard library for string operations and pattern-matching
11 ** See Copyright Notice in lua.h
12 *
13 *	uses the udata table and a couple of expressions from Tcl 8.4.x UTF-8
14 * which comes with the following license.terms:
15 
16 This software is copyrighted by the Regents of the University of
17 California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
18 Corporation and other parties.  The following terms apply to all files
19 associated with the software unless explicitly disclaimed in
20 individual files.
21 
22 The authors hereby grant permission to use, copy, modify, distribute,
23 and license this software and its documentation for any purpose, provided
24 that existing copyright notices are retained in all copies and that this
25 notice is included verbatim in any distributions. No written agreement,
26 license, or royalty fee is required for any of the authorized uses.
27 Modifications to this software may be copyrighted by their authors
28 and need not follow the licensing terms described here, provided that
29 the new terms are clearly indicated on the first page of each file where
30 they apply.
31 
32 IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
33 FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
34 ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
35 DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 
38 THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
39 INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
40 FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
41 IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
42 NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
43 MODIFICATIONS.
44 
45 GOVERNMENT USE: If you are acquiring this software on behalf of the
46 U.S. government, the Government shall have only "Restricted Rights"
47 in the software and related documentation as defined in the Federal
48 Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
49 are acquiring the software on behalf of the Department of Defense, the
50 software shall be classified as "Commercial Computer Software" and the
51 Government shall have only "Restricted Rights" as defined in Clause
52 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
53 authors grant the U.S. Government and others acting in its behalf
54 permission to use and distribute the software in accordance with the
55 terms specified in this license.
56 
57 (end of Tcl license terms)
58 */
59 
60 /*
61 According to http://ietf.org/rfc/rfc3629.txt we support up to 4-byte
62 (21 bit) sequences encoding the UTF-16 reachable 0-0x10FFFF.
63 Any byte not part of a 2-4 byte sequence in that range decodes to itself.
64 Ill formed (non-shortest) "C0 80" will be decoded as two code points C0 and 80,
65 not code point 0; see security considerations in the RFC.
66 However, UTF-16 surrogates (D800-DFFF) are accepted.
67 
68 See http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
69 for default grapheme clusters.
70 Lazy westerners we are (and lacking the Hangul_Syllable_Type data),
71 we care for base char + Grapheme_Extend, but not for Hangul syllable sequences.
72 
73 For http://unicode.org/Public/UNIDATA/UCD.html#Grapheme_Extend
74 we use Mn (NON_SPACING_MARK) + Me (ENCLOSING_MARK),
75 ignoring the 18 mostly south asian Other_Grapheme_Extend (16 Mc, 2 Cf) from
76 http://www.unicode.org/Public/UNIDATA/PropList.txt
77 */
78 
79 #include <ctype.h>
80 #include <stddef.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 #include <stdint.h>
85 
86 #define lstrlib_c
87 #define LUA_LIB
88 
89 #include "lua.h"
90 
91 #ifdef LuajitTeX
92 #include "lua/lauxlib_bridge.h"
93 #else
94 #include "lauxlib.h"
95 #endif
96 #include "lualib.h"
97 
98 #ifndef SLN_UNICODENAME /* unless set it luaconf */
99 # define SLN_UNICODENAME "unicode"
100 #endif
101 
102 #define LUA_MAXCAPTURES 32
103 #if defined(LUA_USELONGLONG)
104 
105 #define LUA_INTFRMLEN           "ll"
106 #define LUA_INTFRM_T            long long
107 
108 #else
109 
110 #define LUA_INTFRMLEN           "l"
111 #define LUA_INTFRM_T            long
112 
113 #endif
114 
/*
UTF-8 Bit Distribution, page 103 of Unicode 5.0
First byte  Length
00..7f       1 byte
c0..df       2 bytes
e0..ef       3 bytes
f0..f7       4 bytes

U8_LENGTH(c) maps a first byte to the sequence length from the table above.
The macro only compares against the upper bound of each row, so:
  - continuation bytes 80..bf also yield 2,
  - bytes f8..ff yield -1.
Callers must be prepared for both. The argument is evaluated up to four times.
*/
#define U8_LENGTH(c) ((unsigned char)(c)<=0x7f ? 1 : ((unsigned char)(c)<=0xdf ? 2 : ((unsigned char)(c)<=0xef ? 3 :  ((unsigned char)(c)<=0xf7 ? 4:-1))))
124 
125 #include "slnudata.c"
/*
 * charinfo(c): property word for code point c as returned by GetUniCharInfo
 * (provided by slnudata.c, not shown here); any c with a bit set above the
 * low 16 bits yields 0 instead of a table lookup.
 */
#define charinfo(c) (~0xFFFF&(c) ? 0 : GetUniCharInfo(c)) /* BMP only */
/* charcat(c): the category field of the property word of c */
#define charcat(c) (UNICODE_CATEGORY_MASK & charinfo(c))
/*
 * Grapheme_Extend(code): 1 if the category of code is NON_SPACING_MARK or
 * ENCLOSING_MARK, else 0 (a two-bit set shifted right by the category).
 */
#define Grapheme_Extend(code) \
	(1 & (((1<<NON_SPACING_MARK)|(1<<ENCLOSING_MARK)) >> charcat(code)))
130 
/*
 * Operation modes, numbered 0..3 in declaration order. The two multi-byte
 * modes are exactly the ones with bit 1 set, which is what MODE_MBYTE tests.
 */
enum { /* operation modes */
	MODE_ASCII, /* single byte 7bit */
	MODE_LATIN, /* single byte 8859-1 */
	MODE_UTF8,	/* UTF-8 by code points */
	MODE_GRAPH	/* UTF-8 by grapheme clusters */
/* MODE_MBYTE(mode): clears bit 0; non-zero for MODE_UTF8 and MODE_GRAPH only */
#define MODE_MBYTE(mode) (~1&(mode))
};
138 
139 
140 /* macro to `unsign' a character */
141 #define uchar(c)				((unsigned char)(c))
142 
143 typedef const unsigned char cuc; /* it's just toooo long :) */
144 
145 
/*
 * Append the UTF-8 encoding of code point c to buffer b.
 * One byte below 0x80, two below 0x800, three below 0x10000, four otherwise.
 * The value of c is not range-checked here.
 */
static void utf8_enco (luaL_Buffer *b, unsigned c)
{
	if (c < 0x80) {
		/* plain 7-bit value */
		luaL_addchar(b, c);
	} else if (c < 0x800) {
		/* 5 + 6 bits */
		luaL_addchar(b, 0xC0 | (c >> 6));
		luaL_addchar(b, 0x80 | (c & 0x3F));
	} else if (c < 0x10000) {
		/* 4 + 6 + 6 bits */
		luaL_addchar(b, 0xE0 | (c >> 12));
		luaL_addchar(b, 0x80 | ((c >> 6) & 0x3F));
		luaL_addchar(b, 0x80 | (c & 0x3F));
	} else {
		/* 3 + 6 + 6 + 6 bits */
		luaL_addchar(b, 0xF0 | (c >> 18));
		luaL_addchar(b, 0x80 | ((c >> 12) & 0x3F));
		luaL_addchar(b, 0x80 | ((c >> 6) & 0x3F));
		luaL_addchar(b, 0x80 | (c & 0x3F));
	}
}	/* utf8_enco */
165 
166 
/*
 * Decode one code point starting at *pp and advance *pp past it.
 *
 * pp   in/out: current read position; always advanced by at least one byte.
 * end  one past the last readable byte; end must be > *pp.
 *
 * Returns the decoded code point when *pp starts a well-formed 2-4 byte
 * sequence. Otherwise returns the first byte as is and consumes only that
 * byte: ASCII, a dangling continuation byte, a non-shortest form, a truncated
 * sequence, or a 4-byte value that is out of range.
 */
static unsigned utf8_deco (const char **pp, const char *end)
{
	register cuc *p = (cuc*)*pp, * const e = (cuc*)end;
	unsigned first = *p, code;

	*pp = (const char*)++p; /* eat one */
	/* check ASCII, dangling cont., non-shortest or not continued */
	if (0xC2 > first || e == p || 0x80 != (0xC0&*p)) return first;
	code = 0x3F&*p++; /* save 1st cont. */
	/* check 2 byte (5+6 = 11 bit) sequence up to 0x7FF */
	if (0xE0 > first) { /* any >= C2 is valid */
		code |= (0x1F&first)<<6;
		goto seq;
	}
	if (e != p && 0x80 == (0xC0&*p)) { /* is continued */
		code = code<<6 | (0x3F&*p++); /* save 2nd */
		if (0xF0 > first) { /* 3 byte (4+6+6 = 16 bit) seq -- want 2nd cont. */
			if ( 0xF800&(code |= (0x0F&first)<<12) /* >0x7FF: not non-shortest */
				/* && 0xD800 != (0xD800 & code) -- nah, let surrogates pass */
			)
				goto seq;
		} else if (e != p && 0x80 == (0xC0&*p) /* check 3rd */
			/* catch 0xF4 < first and other out-of-bounds */
			/* TH: add the 256 out-of-range glyphs in 'plane 18' */
			&& 0x110100 > (code = (0x0F&first)<<18 | code<<6 | (0x3F&*p++))
			&& 0xFFFF < code /* not a 16 bitty */
		)
			goto seq;
	}
	/* ill-formed: *pp was already set to just after the first byte */
	return first;
seq:
	/* well-formed: commit the position after the whole sequence */
	*pp = (const char*)p;
	return code;
}	/* utf8_deco */
202 
203 
/*
 * Reverse decode: decode the code point that ends just before *pp and move
 * *pp back to its first byte.
 *
 * pp     in/out: position just past the unit to decode; must be > start.
 * start  first byte of the string; nothing before it is read.
 *
 * Returns the code point when the bytes before *pp form a well-formed 2-4
 * byte sequence; otherwise returns the single last byte and steps back by
 * exactly one byte.
 *
 * A 4-byte sequence is only accepted when its first byte is an F0..FF
 * starter, so that a run of continuation bytes after an unrelated byte is
 * not folded into one code point. The pointer is never moved below start.
 */
static unsigned utf8_oced (const char **pp, const char *start)
{
	const unsigned char *p = (const unsigned char*)*pp;
	const unsigned char * const s = (const unsigned char*)start;
	unsigned last = *--p, code;

	*pp = (const char*)p; /* eat one */
	/* check non-continuer or at the edge */
	if (0x80 != (0xC0&last) || s == p) return last;
	code = 0x3F&last; /* save last cont. */
	if (0xC0 == (0xE0&*--p)) { /* preceded by 2 byte seq starter */
		/* C0 and C1 would be non-shortest forms */
		if (0xC2 <= *p) { code |= (0x1F&*p)<<6; goto seq; }
	} else if (0x80 == (0xC0&*p) && s<p) {
		code |= (0x3F&*p)<<6;
		if (0xE0 == (0xF0&*--p)) { /* 3 byte starter */
			/* reject non-shortest forms (value <= 0x7FF) */
			if (0xF800&(code |= (0x0F&*p)<<12)) goto seq;
		} else if (0x80 == (0xC0&*p) && s<p /* room for a 4th byte ? */
			&& 0xF0 == (0xF0&*--p) /* and it really is a 4 byte starter */
			/* TH: add the 256 out-of-range glyphs in 'plane 18' */
			&& 0x110100 > (code |= (0x0F&*p)<<18 | (0x3F&p[1])<<12)
			&& 0xFFFF < code
		)
			goto seq;
	}
	/* ill-formed: *pp already points at the last byte */
	return last;
seq:
	*pp = (const char*)p;
	return code;
}	/* utf8_oced */
232 
233 
/*
 * Advance *pp over any run of Grapheme_Extend code points that starts at
 * *pp, never going beyond end. *pp is left on the first code point that
 * does not extend (or at end).
 */
static void utf8_graphext (const char **pp, const char *end)
{
	const char *cursor = *pp;
	while (cursor < end) {
		unsigned code = utf8_deco(&cursor, end);
		if (!Grapheme_Extend(code))
			return;	/* keep *pp in front of the non-extending code */
		*pp = cursor;
	}
}	/* utf8_graphext */
243 
244 
/*
 * Count units in the bytes-long range starting at *pp, advancing *pp.
 *
 * graph  non-zero: count grapheme clusters (an extending code point after
 *        the first unit does not add to the count); zero: count code points.
 * max    stop once this many units were counted; a negative value never
 *        matches, so the whole range is scanned.
 *
 * When stopping at max in graph mode, trailing extenders of the last
 * cluster are consumed too. Returns the number of units counted.
 */
static int utf8_count (const char **pp, int bytes, int graph, int max)
{
	const char *const limit = *pp + bytes;
	int units = 0;
	while (units != max && *pp < limit) {
		unsigned code = utf8_deco(pp, limit);
		units++;
		if (graph && units > 1 && Grapheme_Extend(code))
			units--;	/* belongs to the previous cluster */
	}
	if (graph && units == max)
		utf8_graphext(pp, limit);	/* gather remaining extenders */
	return units;
}	/* utf8_count */
259 
260 
261 
/*
 * Lua: len(s). Pushes the length of s: bytes in the single-byte modes,
 * code points in MODE_UTF8, grapheme clusters in MODE_GRAPH.
 * The mode is read from upvalue 1.
 */
static int unic_len (lua_State *L) {
	size_t nbytes;
	const char *str = luaL_checklstring(L, 1, &nbytes);
	int mode = lua_tointeger(L, lua_upvalueindex(1));
	size_t result = nbytes;
	if (MODE_MBYTE(mode))
		result = (size_t)utf8_count(&str, nbytes, mode-2, -1);
	lua_pushinteger(L, result);
	return 1;
}
270 
271 
/*
 * Translate a relative string position: a negative pos counts back from
 * the end (-1 is the last unit of a string of len units); non-negative
 * values are returned unchanged. The result is not clamped.
 */
static ptrdiff_t posrelat (ptrdiff_t pos, size_t len) {
	if (pos < 0)
		return (ptrdiff_t)len + pos + 1;
	return pos;
}
276 
277 
/*
 * Lua: sub(s, i [, j]). Pushes the units i..j of s (j defaults to -1).
 * Positions count bytes in the single-byte modes and code points or
 * grapheme clusters in the multi-byte modes; negative positions are
 * relative to the end. The mode is read from upvalue 1.
 */
static int unic_sub (lua_State *L) {
	size_t len;
	const char *str = luaL_checklstring(L, 1, &len);
	const char *const str_end = str + len;
	ptrdiff_t first = luaL_checkinteger(L, 2);
	ptrdiff_t last = luaL_optinteger(L, 3, -1);
	int mode = lua_tointeger(L, lua_upvalueindex(1));
	int mb = MODE_MBYTE(mode);

	if (mb) {
		/* positions are in units, so measure the string in units first */
		const char *scan = str;
		len = (size_t)utf8_count(&scan, len, mode-2, -1);
	}
	first = posrelat(first, len);
	last = posrelat(last, len);
	if (first < 1) first = 1;
	if (last > (ptrdiff_t)len) last = (ptrdiff_t)len;
	if (first > last) {
		lua_pushliteral(L, "");
		return 1;
	}
	first--;	/* now the number of units to skip */
	len = last - first; /* #units */
	if (!mb)
		str += first;
	else {
		const char *stop;
		if (first) utf8_count(&str, str_end-str, mode-2, first); /* skip */
		stop = str;
		utf8_count(&stop, str_end-stop, mode-2, len);
		len = stop - str;	/* back to a byte count */
	}
	lua_pushlstring(L, str, len);
	return 1;
}
306 
307 
/*
 * Lua: reverse(s). Pushes s with its units in reverse order. A unit is a
 * byte in the single-byte modes, a code point in MODE_UTF8 and a base
 * character with its preceding-scan extenders in MODE_GRAPH; the bytes
 * inside one unit keep their order. The mode is read from upvalue 1.
 */
static int str_reverse (lua_State *L) { /* TODO? whatfor? */
	size_t len;
	luaL_Buffer out;
	const char *head = luaL_checklstring(L, 1, &len);
	const char *tail = head + len;
	int mode = lua_tointeger(L, lua_upvalueindex(1));

	luaL_buffinit(L, &out);
	if (!MODE_MBYTE(mode)) {
		while (tail > head) {
			tail--;
			luaL_addchar(&out, *tail);
		}
	} else {
		while (tail > head) {
			const char *unit_end = tail;
			unsigned code = utf8_oced(&tail, head);
			if (MODE_GRAPH == mode) {
				/* walk back over extenders to reach the base character */
				while (Grapheme_Extend(code) && tail > head)
					code = utf8_oced(&tail, head);
			}
			luaL_addlstring(&out, tail, unit_end - tail);
		}
	}
	luaL_pushresult(&out);
	return 1;
}
330 
331 
332 
/*
 * Lua: lower(s). Pushes a copy of s in which every character whose case
 * type has bit 0x02 set gets GetDelta added (GetCaseType/GetDelta come from
 * slnudata.c, not shown here). In MODE_ASCII (mode 0) characters with the
 * high bit set are left alone. The mode is read from upvalue 1.
 */
static int unic_lower (lua_State *L) {
	size_t len;
	luaL_Buffer out;
	const char *cur = luaL_checklstring(L, 1, &len);
	const char *const stop = cur + len;
	int mode = lua_tointeger(L, lua_upvalueindex(1));
	int mb = MODE_MBYTE(mode);

	luaL_buffinit(L, &out);
	while (cur < stop) {
		unsigned c;
		int info;
		if (mb)
			c = utf8_deco(&cur, stop);
		else
			c = uchar(*cur++);
		info = charinfo(c);
		if ((GetCaseType(info) & 0x02) && (mode || !(0x80 & c)))
			c += GetDelta(info);
		if (mb)
			utf8_enco(&out, c);
		else
			luaL_addchar(&out, c);
	}
	luaL_pushresult(&out);
	return 1;
}
348 
349 
/*
 * Lua: upper(s). Pushes a copy of s in which every character whose case
 * type has bit 0x04 set has GetDelta subtracted (GetCaseType/GetDelta come
 * from slnudata.c, not shown here). In MODE_ASCII (mode 0) characters with
 * the high bit set are left alone. The mode is read from upvalue 1.
 *
 * In the single-byte modes the result is stored as one byte, so a mapping
 * that leaves the 0..0xFF range (e.g. U+00FF -> U+0178, U+00B5 -> U+039C)
 * is not applied; the character is copied unchanged instead of being
 * truncated to an unrelated byte.
 */
static int unic_upper (lua_State *L) {
	size_t l;
	luaL_Buffer b;
	const char *s = luaL_checklstring(L, 1, &l), * const e=s+l;
	int mode = lua_tointeger(L, lua_upvalueindex(1)), mb = MODE_MBYTE(mode);
	luaL_buffinit(L, &b);
	while (s < e) {
		unsigned c = mb ? utf8_deco(&s, e) : uchar(*s++);
		int info = charinfo(c);
		if ((GetCaseType(info)&0x04) && (mode || !(0x80&c))) {
			unsigned up = c - GetDelta(info);
			/* only keep a mapping the output encoding can represent */
			if (mb || up < 0x100) c = up;
		}
		if (mb) utf8_enco(&b, c); else luaL_addchar(&b, c);
	}
	luaL_pushresult(&b);
	return 1;
}
365 
366 
/*
 * Lua: rep(s, n). Pushes n concatenated copies of s; a count of zero or
 * less gives the empty string.
 */
static int str_rep (lua_State *L) {
	size_t len;
	luaL_Buffer out;
	const char *unit = luaL_checklstring(L, 1, &len);
	int remaining = luaL_checkint(L, 2);
	luaL_buffinit(L, &out);
	for (; remaining > 0; remaining--)
		luaL_addlstring(&out, unit, len);
	luaL_pushresult(&out);
	return 1;
}
378 
379 
/*
 * Lua: byte(s [, i [, j]]). Pushes the numeric codes of units i..j of s
 * (i defaults to 1, j to i): byte values in the single-byte modes, code
 * points in the multi-byte modes. Positions are in bytes, code points or
 * grapheme clusters depending on the mode read from upvalue 1; negative
 * positions are relative to the end. Returns the number of values pushed,
 * which is 0 for an empty interval.
 *
 * The end position is clamped with a signed comparison, so an end that is
 * still negative after posrelat() gives an empty interval instead of being
 * mistaken for a huge value and clamped to the string length.
 */
static int unic_byte (lua_State *L) {
	size_t l;
	ptrdiff_t posi, pose;
	const char *s = luaL_checklstring(L, 1, &l), *p, *e=s+l;
	int n, mode = lua_tointeger(L, lua_upvalueindex(1)), mb = MODE_MBYTE(mode);

	/* positions are in units, so measure the string in units first */
	if (mb) { p=s; l = (size_t)utf8_count(&p, l, mode-2, -1); }
	posi = posrelat(luaL_optinteger(L, 2, 1), l);
	pose = posrelat(luaL_optinteger(L, 3, posi), l);
	if (posi <= 0) posi = 1;
	if (pose > (ptrdiff_t)l) pose = (ptrdiff_t)l;
	if (0 >= (n = pose - --posi)) return 0;	/* empty interval */
	if (!mb)
		e = (s += posi) + n;
	else {
		if (posi) utf8_count(&s, e-s, mode-2, posi); /* skip */
		p=s;
		utf8_count(&p, e-s, mode-2, n);
		e=p;
	}
	/* byte count is upper bound on #elements */
	luaL_checkstack(L, e-s, "string slice too long");
	for (n=0; s<e; n++)
		lua_pushinteger(L, mb ? utf8_deco(&s, e) : uchar(*s++));
	return n;
}
406 
407 
/*
 * Lua: char(...). Pushes the string made of the given numeric codes.
 * Each argument must be below 0x100 in the single-byte modes and below
 * 0x110100 in the multi-byte modes, where it is UTF-8 encoded; anything
 * else (negative values included) raises "invalid value".
 * The mode is read from upvalue 1.
 */
static int unic_char (lua_State *L) {
	int nargs = lua_gettop(L);	/* number of arguments */
	int mode = lua_tointeger(L, lua_upvalueindex(1));
	int mb = MODE_MBYTE(mode);
	/* TH: add the 256 out-of-range glyphs in 'plane 18' */
	unsigned limit = mb ? 0x110100 : 0x100;
	luaL_Buffer out;
	int arg;

	luaL_buffinit(L, &out);
	for (arg = 1; arg <= nargs; arg++) {
		unsigned c = luaL_checkint(L, arg);
		luaL_argcheck(L, limit > c, arg, "invalid value");
		if (mb)
			utf8_enco(&out, c);
		else
			luaL_addchar(&out, c);
	}
	luaL_pushresult(&out);
	return 1;
}
424 
425 
/*
 * lua_dump callback: appends the size bytes at b to the luaL_Buffer passed
 * through B. Always returns 0.
 */
static int writer (lua_State *L, const void* b, size_t size, void* B) {
	luaL_Buffer *out = (luaL_Buffer *)B;
	(void)L;
	luaL_addlstring(out, (const char *)b, size);
	return 0;
}
431 
432 
/*
 * Lua: dump(f). Pushes the string that lua_dump produces for function f;
 * raises an error when argument 1 is not a function or dumping fails.
 */
static int str_dump (lua_State *L) {
	luaL_Buffer out;
	int status;
	luaL_checktype(L, 1, LUA_TFUNCTION);
	lua_settop(L, 1);	/* the function must be on top of the stack */
	luaL_buffinit(L,&out);
	status = lua_dump(L, writer, &out);
	if (status != 0)
		luaL_error(L, "unable to dump given function");
	luaL_pushresult(&out);
	return 1;
}
443 
444 
445 
446 /*
447 ** {======================================================
448 ** PATTERN MATCHING
449 ** =======================================================
450 * find/gfind(_aux) -> match, push_captures
451 * gsub -> match, add_s (-> push_captures)
452 * push_captures, add_s -> push_onecapture
453 * match ->
454 * 	start/end_capture -> match,
455 * 	match_capture, matchbalance, classend -> -,
456 * 	min/max_expand -> match, singlematch
457 * 	singlematch -> matchbracketclass, match_class,
458 * 	matchbracketclass -> match_class -> -,
459 */
460 
461 
462 #define CAP_UNFINISHED	(-1)
463 #define CAP_POSITION	(-2)
464 
/*
 * State shared by all matcher functions during one pattern match attempt.
 */
typedef struct MatchState {
	const char *src_init;	/* init of source string */
	const char *src_end;	/* end (`\0') of source string */
	lua_State *L;	/* used for raising errors and pushing results */
	int level;	/* total number of captures (finished or unfinished) */
	int mode;	/* one of the MODE_* operation modes */
	int mb;	/* MODE_MBYTE(mode): non-zero when the subject is decoded as UTF-8 */
	struct {
		const char *init;	/* where the capture starts in the subject */
		ptrdiff_t len;	/* length in bytes, or CAP_UNFINISHED / CAP_POSITION */
	} capture[LUA_MAXCAPTURES];
} MatchState;
477 
478 
479 #define L_ESC		'%'
480 #define SPECIALS	"^$*+?.([%-"
481 
482 
/*
 * Convert the digit character l of a back reference ('1' is the first
 * capture) into a capture index. Raises "invalid capture index" when the
 * index is out of range or the capture is still unfinished.
 */
static int check_capture (MatchState *ms, int l) {
	int idx = l - '1';
	if (idx >= 0 && idx < ms->level && ms->capture[idx].len != CAP_UNFINISHED)
		return idx;
	return luaL_error(ms->L, "invalid capture index");
}
489 
490 
/*
 * Return the index of the most recently opened capture that is still
 * unfinished; raises "invalid pattern capture" when there is none.
 */
static int capture_to_close (MatchState *ms)
{
	int idx = ms->level - 1;
	while (idx >= 0) {
		if (ms->capture[idx].len == CAP_UNFINISHED) return idx;
		idx--;
	}
	return luaL_error(ms->L, "invalid pattern capture");
}
498 
499 
/*
 * Return the position just past the single pattern item that starts at p:
 *   - L_ESC plus the following byte,
 *   - a bracket set up to and including its closing ']',
 *   - otherwise one character (one UTF-8 sequence in the multi-byte modes,
 *     one byte in the single-byte modes).
 * Raises an error for a pattern ending in L_ESC or an unterminated set.
 */
static const char *classend (MatchState *ms, const char *p)
{
	if (L_ESC == *p) {
		p++;
		if ('\0' == *p)
			luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
		return p+1;
	}
	if ('[' == *p) {
		/* the first round consumes the '[' itself */
		do {	/* look for a `]' */
			if ('\0' == *p)
				luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
			if (L_ESC == *p++ && '\0' != *p) p++;	/* skip escapes (e.g. `%]') */
		} while (']' != *p);
		return p+1;
	}
	if (ms->mb) {
		/* a sequence is at most 4 bytes; the pattern's '\0' stops it earlier */
		utf8_deco(&p, p+4);
		return p;
	}
	return p+1;
}	/* classend */
520 
521 
522 /*
523  * The following macros are used for fast character category tests.  The
524  * x_BITS values are shifted right by the category value to determine whether
525  * the given category is included in the set.
526  */
527 
#define LETTER_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1 << OTHER_LETTER))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

/* wrapped in parentheses like its siblings, so that the expansion stays one
   operand next to operators that bind tighter than `|' (e.g. `&', `>>') */
#define NUMBER_BITS ((1 << DECIMAL_DIGIT_NUMBER) \
	| (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER))

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    | (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	    (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	    (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	    (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
545 
546 
/*
 * Test whether character c matches class letter cl; undefined for cl not
 * ascii.
 *
 * c     the subject character (a code point, or a byte in single-byte modes)
 * cl    the byte following L_ESC in the pattern
 * mode  operation mode; in mode 0 (MODE_ASCII) characters with bit 0x80 set
 *       never match a category-based class
 *
 * A lower-case class letter returns the test result, an upper-case one its
 * negation. Any cl that is not a known class letter is compared literally
 * against c.
 */
static int match_class (int c, int cl, int mode)
{
	int msk, res;
	switch (0x20|cl /*tolower*/) {
	case 'a' : msk = LETTER_BITS; break;
	case 'c' : msk = 1<<CONTROL; break;
	case 'x' : /* hexdigits */
		/* letters a-f / A-F by bit trick; digits are handled by falling into 'd' */
		if (0x40==(~0x3f&c)/*64-127*/ && 1&(0x7e/*a-f*/>>(0x1f&c))) goto matched;
	case 'd' : msk = 1<<DECIMAL_DIGIT_NUMBER; mode=0;/* ASCII only */ break;
	case 'l' : msk = 1<<LOWERCASE_LETTER; break;
	case 'n' : msk = NUMBER_BITS; break; /* new */
	case 'p' : msk = PUNCT_BITS; break;
	case 's' :
#define STDSPACE /* standard "space" controls 9-13 */ \
		(1<<9/*TAB*/|1<<10/*LF*/|1<<11/*VT*/|1<<12/*FF*/|1<<13/*CR*/)
		/* c below 32 and one of the five controls listed in STDSPACE */
		if (!(~0x1f & c) && 1&(STDSPACE >> c)) goto matched;
		msk = SPACE_BITS;
		break;
	case 'u' : msk = 1<<UPPERCASE_LETTER; break;
	/*
		this is not compatible to lua 5.1, where %w is just a letter or a digit
	case 'w' : msk = LETTER_BITS|NUMBER_BITS|CONNECTOR_BITS; break;
	*/
	case 'w' : msk = LETTER_BITS|NUMBER_BITS; break;
	case 'z' : if (!c) goto matched; msk = 0; break;
	default: return cl == c; /* not a class letter: literal comparison */
	}
	/* category test: is the category bit of c contained in the mask? */
	res = 1 & (msk >> charcat(c));
	if (!mode && 0x80&c) res = 0;
	/* the block below is only reachable through the `matched' label */
	if (0) {
matched:
		res = 1;
	}
	return 0x20&cl /*islower*/ ? res : !res;
}	/* match_class */
583 
584 
585 /* decode single byte or UTF-8 seq; advance *s */
deco(const MatchState * ms,const char ** s,const char * e)586 static unsigned deco (const MatchState *ms, const char **s, const char *e)
587 {
588 	return ms->mb ? utf8_deco(s, e) : *(unsigned char*)(*s)++;
589 }
590 
/*
 * Match one subject unit at s against the single pattern item [p, ep).
 * s must be < ms->src_end, p < ep.
 *
 * ms  match state (mode, mb flag and subject end are read)
 * s   current subject position
 * p   start of the pattern item: L_ESC class, '.', bracket set or literal
 * ep  end of the pattern item
 *
 * Returns the subject position after the matched unit, or 0 when the item
 * does not match. In MODE_GRAPH a successful class, '.' or range match
 * also consumes the grapheme extenders that follow the matched character.
 */
static const char *singlematch (const MatchState *ms,
	const char *s, const char *p, const char *ep)
{
	int neg = 0;
	unsigned c1, c2;
	unsigned c;
	/* fetch the subject character c and step s over it */
#ifdef OPTIMIZE_SIZE
	c = deco(ms, &s, ms->src_end);
#else
	if (!ms->mb || !(0x80&*s))
		c = *(unsigned char*)s++;
	else
		c = utf8_deco(&s, ms->src_end);
#endif

	switch (*p) {
	case L_ESC:
		if (match_class(c, uchar(p[1]), ms->mode)) {
	/* '.' jumps into the success branch of the class test above */
	case '.': /* the all class */
#ifndef OPTIMIZE_SIZE
			if (MODE_GRAPH != ms->mode) return s; /* common fast path */
#endif
			goto matched_class;
		}
		s = 0;
		break;
	default:
		/* literal: decode the pattern character and compare */
#ifdef OPTIMIZE_SIZE
		c1 = deco(ms, &p, ep);
#else
		if (!ms->mb || !(0x80&*p))
			c1 = *(unsigned char*)p++;
		else
			c1 = utf8_deco(&p, ep);
#endif
		if (c != c1) s = 0;
		break;
	case '[': /* matchbracketclass */
		ep--; /* now on the ']' */
		if ((neg = '^' == *++p)) p++;	/* skip the `^' */
		while (p < ep) {
			if (*p == L_ESC) {
				if (match_class(c, uchar(*++p), ms->mode)) goto matched_class_in_brack;
				p++;
				continue;
			}
			c1 = deco(ms, &p, ep);
			/* in lua-5.1 and 5.1.1 a trailing '-' is allowed
				dynasm.lua relies on this
			*/
			/* not a range: no '-' follows, or the '-' is the last set member */
			if ( ep <= p + 1 || '-' != *p ) {
				const char *op = p, *es;
				if (MODE_GRAPH == ms->mode) utf8_graphext(&p, ep);
				if (c != c1) continue;
				if (MODE_GRAPH != ms->mode) goto matched;
				/* comp grapheme extension */
				es = s;
				utf8_graphext(&es, ms->src_end);
				/* the extenders in subject and pattern must be byte-identical */
				if (es-s == p-op && (es==s || !memcmp(s, op, es-s))) goto matched;
				continue;

			}
			++p;
			/* range c1-c2 -- no extend support in range bounds... */
			/* if (ep == ++p) break; see above */ /* bugger - trailing dash */
			c2 = deco(ms, &p, ep);
			if (c2 < c1) { unsigned swap=c1; c1=c2; c2=swap; }
			if (c1 <= c && c <= c2) goto matched_class_in_brack; /* ...but extend match */
		}
		/* not matched */
		/* flipping neg lets the test below handle both outcomes */
		neg = !neg;
	matched:
		if (neg) s = 0;
		/* matchbracketclass */
	}
	return s;
matched_class_in_brack: /* matched %something or range in [] */
	if (neg)
		s = 0;
	else {
matched_class: /* matched %something or . */
		if (MODE_GRAPH == ms->mode) utf8_graphext(&s, ms->src_end);
	}
	return s;
}
677 
678 
679 static const char *match (MatchState *ms, const char *s, const char *p);
680 
681 
/*
 * Match a balanced run for a `%b' item. p points at the two delimiter
 * bytes (opener, closer) that follow `%b'. Returns the position after the
 * closer that balances the opener at s, or NULL when s does not start with
 * the opener or the subject ends unbalanced. The scan is byte-wise.
 * Raises "unbalanced pattern" when a delimiter is missing from the pattern.
 */
static const char *matchbalance (MatchState *ms, const char *s,
		const char *p) {
	int opener, closer, depth;
	if (*p == 0 || *(p+1) == 0)
		luaL_error(ms->L, "unbalanced pattern");
	if (*s != *p) return NULL;
	opener = *p;
	closer = *(p+1);
	depth = 1;
	for (s++; s < ms->src_end; s++) {
		/* the closer is tested first, which matters when both are equal */
		if (*s == closer) {
			if (--depth == 0) return s+1;
		}
		else if (*s == opener) depth++;
	}
	return NULL;	/* string ends out of balance */
}
700 
701 
/*
 * Greedy repetition: consume as many units matching the item [p, ep) as
 * possible starting at s, then try to match the rest of the pattern
 * (ep+1), giving back one unit at a time until it succeeds or no
 * repetition is left. Returns the end of the overall match or NULL.
 */
static const char *max_expand (MatchState *ms,
	const char *s, const char *p, const char *ep)
{
	const char *hi = s;
	while (hi < ms->src_end) {
		const char *next = singlematch(ms, hi, p, ep);
		if (!next) break;
		hi = next;
	}
	/* keeps trying to match with the maximum repetitions */
	do {
		const char *res = match(ms, hi, ep+1);
		if (res || hi == s) return res;
		/* didn't match; reduce 1 repetition to try again */
		if (!ms->mb)
			hi--;
		else {
			unsigned code = utf8_oced(&hi, s);
			if (MODE_GRAPH == ms->mode) {
				/* give back a whole cluster, not just its extender */
				while (Grapheme_Extend(code) && hi > s)
					code = utf8_oced(&hi, s);
			}
		}
	} while (hi >= s);
	return NULL;
}
722 
723 
/*
 * Lazy repetition: try to match the rest of the pattern (ep+1) first and
 * consume one more unit matching the item [p, ep) only after each failure.
 * Returns the end of the overall match or NULL.
 */
static const char *min_expand (MatchState *ms,
	const char *s, const char *p, const char *ep)
{
	for (;;) {
		const char *res = match(ms, s, ep+1);
		if (res) return res;
		if (s >= ms->src_end) return NULL;
		s = singlematch(ms, s, p, ep);	/* try with one more repetition */
		if (!s) return NULL;
	}
}
734 
735 
/*
 * Open a new capture at s with initial length marker `what'
 * (CAP_UNFINISHED or CAP_POSITION) and continue matching at p. The capture
 * is dropped again when the rest of the pattern fails.
 * Raises "too many captures" when all LUA_MAXCAPTURES slots are in use.
 */
static const char *start_capture (MatchState *ms, const char *s,
		const char *p, int what) {
	const int slot = ms->level;
	const char *res;
	if (slot >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
	ms->capture[slot].init = s;
	ms->capture[slot].len = what;
	ms->level = slot + 1;
	res = match(ms, s, p);
	if (res == NULL)	/* match failed? */
		ms->level--;	/* undo capture */
	return res;
}
748 
749 
/*
 * Close the innermost unfinished capture at s and continue matching at p.
 * The capture is marked unfinished again when the rest of the pattern fails.
 */
static const char *end_capture (MatchState *ms, const char *s,
		const char *p) {
	const int slot = capture_to_close(ms);
	const char *res;
	ms->capture[slot].len = s - ms->capture[slot].init;	/* close capture */
	res = match(ms, s, p);
	if (res == NULL)	/* match failed? */
		ms->capture[slot].len = CAP_UNFINISHED;	/* undo capture */
	return res;
}
759 
760 
/*
 * Match a back reference: l is the digit character of the capture. Returns
 * the position after s when the subject at s repeats the captured bytes,
 * NULL otherwise.
 */
static const char *match_capture (MatchState *ms, const char *s, int l) {
	size_t caplen, avail;
	l = check_capture(ms, l);
	caplen = ms->capture[l].len;
	avail = (size_t)(ms->src_end - s);
	if (avail < caplen) return NULL;	/* not enough subject left */
	if (memcmp(ms->capture[l].init, s, caplen) != 0) return NULL;
	return s + caplen;
}
770 
771 
/*
 * Core matcher: try to match pattern p against the subject starting at s.
 *
 * ms  match state; captures are recorded in ms->capture / ms->level
 * s   subject position where the match must start
 * p   current pattern position ('\0' terminated)
 *
 * Returns the subject position just past the match, or NULL on failure.
 * Items that need backtracking recurse through the helper functions; plain
 * sequencing loops back to `init' instead of recursing.
 */
static const char *match (MatchState *ms, const char *s, const char *p) {
	init: /* using goto's to optimize tail recursion */
	switch (*p) {
		case '(': {	/* start capture */
			if (*(p+1) == ')')	/* position capture? */
				return start_capture(ms, s, p+2, CAP_POSITION);
			else
				return start_capture(ms, s, p+1, CAP_UNFINISHED);
		}
		case ')': {	/* end capture */
			return end_capture(ms, s, p+1);
		}
		case L_ESC: {
			switch (*(p+1)) {
				case 'b': {	/* balanced string? */
					s = matchbalance(ms, s, p+2);
					if (s == NULL) return NULL;
					/* skip L_ESC, 'b' and the two delimiter bytes */
					p+=4; goto init;	/* else return match(ms, s, p+4); */
				}
/* frontier patterns are compiled out; the code below is not built */
#if 0 /* TODO */
				case 'f': {	/* frontier? */
					const char *ep; char previous;
					p += 2;
					if (*p != '[')
						luaL_error(ms->L, "missing " LUA_QL("[") " after "
								LUA_QL("%%f") " in pattern" );
						luaL_error(ms->L, "missing `[' after `%%f' in pattern");
					ep = classend(ms, p);	/* points to what is next */
					/* with UTF-8, getting the previous is more complicated */
					previous = (s == ms->src_init) ? '\0' : *(s-1);
					/* use singlematch to apply all necessary magic */
					if (singlematch(uchar(previous), p, ep-1) ||
						 !singlematch(uchar(*s), p, ep-1)) return NULL;
					p=ep; goto init;	/* else return match(ms, s, ep); */
				}
#endif
				default: {
					if (isdigit(uchar(*(p+1)))) {	/* capture results (%0-%9)? */
						s = match_capture(ms, s, uchar(*(p+1)));
						if (s == NULL) return NULL;
						p+=2; goto init;	/* else return match(ms, s, p+2) */
					}
					goto dflt;	/* case default */
				}
			}
		}
		case '\0': {	/* end of pattern */
			return s;	/* match succeeded */
		}
		case '$': {
			if (*(p+1) == '\0')	/* is the `$' the last char in pattern? */
				return (s == ms->src_end) ? s : NULL;	/* check end of string */
			else goto dflt; /* ??? */
		}
		default: dflt: {	/* it is a pattern item */
			const char *ep = classend(ms, p);	/* points to what is next */
			const char *es = 0;
			/* es: subject position after one match of the item, 0 if none */
			if (s < ms->src_end) es = singlematch(ms, s, p, ep);
			switch (*ep) {
				case '?': {	/* optional */
					const char *res;
					if (es && (res=match(ms, es, ep+1))) return res;
					p=ep+1; goto init;	/* else return match(ms, s, ep+1); */
				}
				case '*': {	/* 0 or more repetitions */
					return max_expand(ms, s, p, ep);
				}
				case '+': {	/* 1 or more repetitions */
					return (es ? max_expand(ms, es, p, ep) : NULL);
				}
				case '-': {	/* 0 or more repetitions (minimum) */
					return min_expand(ms, s, p, ep);
				}
				default: {
					/* no repetition suffix: the item must match exactly once */
					if (!es) return NULL;
					s=es; p=ep; goto init;	/* else return match(ms, s+1, ep); */
				}
			}
		}
	}
}
853 
854 
855 
/*
 * Find the first occurrence of the l2 bytes at s2 inside the l1 bytes at
 * s1 (both may contain '\0'). Returns a pointer to the occurrence, s1
 * itself for an empty needle, or NULL when there is none.
 */
static const char *lmemfind (const char *s1, size_t l1,
		const char *s2, size_t l2) {
	size_t tail, span;
	if (l2 == 0) return s1;	/* empty strings are everywhere */
	if (l2 > l1) return NULL;	/* needle longer than haystack */
	tail = l2 - 1;	/* bytes left to compare once the 1st byte is found */
	span = l1 - tail;	/* number of possible start positions */
	while (span > 0) {
		const char *hit = (const char *)memchr(s1, *s2, span);
		if (hit == NULL) break;
		if (memcmp(hit + 1, s2 + 1, tail) == 0)
			return hit;
		/* resume the search right after the failed candidate */
		span -= (size_t)(hit + 1 - s1);
		s1 = hit + 1;
	}
	return NULL;	/* not found */
}
876 
877 
/*
 * Push capture number i onto the Lua stack. Without any capture, index 0
 * stands for the whole match [s, e). A position capture is pushed as a
 * 1-based byte offset into the subject, any other capture as a string.
 * Raises an error for an invalid index or an unfinished capture.
 */
static void push_onecapture (MatchState *ms, int i, const char *s,
		const char *e )
{
	ptrdiff_t caplen;
	if (i >= ms->level) {
		if (i != 0)
			luaL_error(ms->L, "invalid capture index");
		else  /* ms->level == 0, too */
			lua_pushlstring(ms->L, s, e - s);  /* add whole match */
		return;
	}
	caplen = ms->capture[i].len;
	if (caplen == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
	if (caplen == CAP_POSITION)
		lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
	else
		lua_pushlstring(ms->L, ms->capture[i].init, caplen);
}
896 
897 
/*
 * Push all captures of the current match. When the pattern has no capture
 * and s is not NULL, the whole match [s, e) is pushed instead.
 * Returns the number of values pushed.
 */
static int push_captures (MatchState *ms, const char *s, const char *e) {
	int nlevels = ms->level;
	int i = 0;
	if (nlevels == 0 && s) nlevels = 1;	/* whole match stands in */
	luaL_checkstack(ms->L, nlevels, "too many captures");
	while (i < nlevels) {
		push_onecapture( ms, i, s, e );
		i++;
	}
	return nlevels;  /* number of strings pushed */
}
906 
907 
/*
 * Shared implementation of find and match.
 *
 * Lua arguments: subject, pattern, optional init (byte position, may be
 * negative), and for find an optional `plain' flag.
 *
 * find  non-zero: push start and end of the match as 1-based byte offsets,
 *       followed by the captures; a plain substring search is used when
 *       requested or when the pattern contains none of SPECIALS.
 *       zero: push the captures only (the whole match if there are none).
 *
 * Pushes nil when nothing matches. Returns the number of pushed values.
 * The operation mode is read from upvalue 1.
 */
static int unic_find_aux (lua_State *L, int find) {
	size_t l1, l2;
	const char *s = luaL_checklstring(L, 1, &l1);
	const char *p = luaL_checklstring(L, 2, &l2);
	ptrdiff_t init = posrelat(luaL_optinteger(L, 3, 1), l1) - 1;
	if (init < 0) init = 0;
	else if ((size_t)(init) > l1) init = (ptrdiff_t)l1;
	if (find && (lua_toboolean(L, 4) ||	/* explicit request? */
			strpbrk(p, SPECIALS) == NULL)) {	/* or no special characters? */
		/* do a plain search */
		const char *s2 = lmemfind(s+init, l1-init, p, l2);
		if (s2) {
			lua_pushinteger(L, s2-s+1);
			lua_pushinteger(L, s2-s+l2);
			return 2;
		}
	}
	else {
		MatchState ms;
		int anchor = (*p == '^') ? (p++, 1) : 0;
		const char *s1=s+init;
		ms.L = L;
		ms.src_init = s;
		ms.src_end = s+l1;
		ms.mode = lua_tointeger(L, lua_upvalueindex(1));
		ms.mb = MODE_MBYTE(ms.mode);

		/* LS/HH : patch for tracker issue 869, concerning "%s" match of à; */
		/* stepping by one byte after a failure can end up in the middle of */
		/* a UTF-8 sequence, so the start position moves by whole characters. */
		/* The step is taken with utf8_deco, i.e. with the very decoder the */
		/* matcher uses: it always moves forward, never past the end of the */
		/* subject, and treats ill-formed bytes as single characters. */
		for (;;) {
			const char *res;
			ms.level = 0;
			if ((res=match(&ms, s1, p)) != NULL) {
				if (find) {
					lua_pushinteger(L, s1-s+1);  /* start */
					lua_pushinteger(L, res-s);   /* end */
					return push_captures(&ms, NULL, 0) + 2;
				} else
					return push_captures(&ms, s1, res);
			}
			/* the end of the subject is a start position too (empty */
			/* matches such as "$"), but nothing lies beyond it */
			if (anchor || s1 >= ms.src_end) break;
			if (ms.mb)
				utf8_deco(&s1, ms.src_end);
			else
				s1++;
		}
	}
	lua_pushnil(L);	/* not found */
	return 1;
}
958 
/* Lua entry point for `find': runs unic_find_aux with find enabled. */
static int unic_find (lua_State *L) {
	const int want_positions = 1;
	return unic_find_aux(L, want_positions);
}
962 
963 
/* Lua entry point for `match': runs unic_find_aux with find disabled. */
static int unic_match (lua_State *L) {
	const int want_positions = 0;
	return unic_find_aux(L, want_positions);
}
967 
968 
969 
/*
** Iterator body behind `gmatch'. Its upvalues are:
**   1: subject string, 2: pattern, 3: byte offset where the next
**   search starts, 4: matching mode.
** Returns the captures of the next match, or nothing when the subject
** is exhausted. Upvalue 3 is updated after every successful match.
**
** NOTE(review): the scan advances one byte at a time in every mode,
** unlike the sequence-aware stepping in unic_find_aux -- confirm that
** this is intended for the multi-byte modes.
*/
static int gmatch_aux (lua_State *L) {
	MatchState ms;
	size_t subject_len;
	const char *subject = lua_tolstring(L, lua_upvalueindex(1), &subject_len);
	const char *pattern = lua_tostring(L, lua_upvalueindex(2));
	const char *cur;
	ms.L = L;
	ms.src_init = subject;
	ms.src_end = subject + subject_len;
	ms.mode = lua_tointeger(L, lua_upvalueindex(4));
	ms.mb = MODE_MBYTE(ms.mode);
	cur = subject + (size_t)lua_tointeger(L, lua_upvalueindex(3));
	while (cur <= ms.src_end) {
		const char *stop;
		ms.level = 0;
		stop = match(&ms, cur, pattern);
		if (stop != NULL) {
			lua_Integer resume = stop - subject;
			if (stop == cur)
				resume++;  /* empty match: move on by at least one position */
			lua_pushinteger(L, resume);
			lua_replace(L, lua_upvalueindex(3));
			return push_captures(&ms, cur, stop);
		}
		cur++;
	}
	return 0;	/* not found */
}
997 
998 
999 
/*
** `gmatch': checks that arguments 1 (subject) and 2 (pattern) are
** strings and returns an iterator closure over gmatch_aux.
**
** The closure gets four upvalues: subject, pattern, the start offset
** (initially 0) and the matching mode. The mode is copied from this
** function's own upvalue 1 with lua_pushvalue; pushing the number
** lua_upvalueindex(1) itself would hand gmatch_aux a pseudo-index
** instead of a mode.
*/
static int gmatch (lua_State *L) {
	luaL_checkstring(L, 1);
	luaL_checkstring(L, 2);
	lua_settop(L, 2);
	lua_pushinteger(L, 0);	/* upvalue 3: start offset */
	lua_pushvalue(L, lua_upvalueindex(1));	/* upvalue 4: matching mode */
	lua_pushcclosure(L, gmatch_aux, 4);
	return 1;
}
1009 
/* Placeholder for the removed `gfind': always raises a "renamed" error. */
static int gfind_nodef (lua_State *L) {
	int status = luaL_error(L, LUA_QL("string.gfind") " was renamed to "
		LUA_QL("string.gmatch"));
	return status;
}
1014 
1015 
/*
** Append the replacement string (Lua stack index 3) to buffer `b',
** expanding escapes: L_ESC followed by '0' inserts the whole match
** [s, e), L_ESC followed by '1'..'9' inserts that capture, and L_ESC
** followed by any other character inserts that character literally.
*/
static void add_s (MatchState *ms, luaL_Buffer *b,
		const char *s, const char *e)
{
	size_t repl_len;
	size_t pos = 0;
	const char *repl = lua_tolstring(ms->L, 3, &repl_len);
	while (pos < repl_len) {
		char c = repl[pos++];
		if (c != L_ESC) {
			luaL_addchar(b, c);
			continue;
		}
		c = repl[pos++];  /* character following the escape */
		if (!isdigit(uchar(c)))
			luaL_addchar(b, c);
		else if (c == '0')
			luaL_addlstring(b, s, e - s);
		else {
			push_onecapture(ms, c - '1', s, e);
			luaL_addvalue(b);  /* add capture to accumulated result */
		}
	}
}
1037 
/*
** Append the replacement for the match [s, e) to buffer `b', according
** to the type of the value at Lua stack index 3:
**   number/string: expanded by add_s;
**   function     : called with the captures, one result is used;
**   table        : indexed with the first capture (or the whole match);
**   anything else: argument error.
** A false or nil result keeps the original matched text; a result that
** is neither false/nil nor a string/number raises an error.
*/
static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
	const char *e)
{
	lua_State *L = ms->L;
	const int repl_type = lua_type(L, 3);
	if (repl_type == LUA_TNUMBER || repl_type == LUA_TSTRING) {
		add_s(ms, b, s, e);
		return;
	}
	if (repl_type == LUA_TFUNCTION) {
		int nargs;
		lua_pushvalue(L, 3);
		nargs = push_captures(ms, s, e);
		lua_call(L, nargs, 1);
	}
	else if (repl_type == LUA_TTABLE) {
		push_onecapture(ms, 0, s, e);
		lua_gettable(L, 3);
	}
	else {
		luaL_argerror(L, 3, "string/function/table expected");
		return;
	}
	if (lua_toboolean(L, -1)) {
		if (!lua_isstring(L, -1))
			luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
	}
	else {  /* nil or false: keep original text */
		lua_pop(L, 1);
		lua_pushlstring(L, s, e - s);
	}
	luaL_addvalue(b);  /* add result to accumulator */
}
1073 
/*
** `gsub': replace matches of the pattern (argument 2) in the subject
** (argument 1) using the replacement value at argument 3, at most
** `max_s' times (argument 4, default: subject length + 1).
** Returns the resulting string and the number of substitutions made.
**
** NOTE(review): after a failed or empty match the subject is copied
** one byte at a time in every mode, unlike the sequence-aware stepping
** in unic_find_aux -- confirm that this is intended for the multi-byte
** modes.
*/
static int unic_gsub (lua_State *L) {
	size_t srcl;
	const char *src = luaL_checklstring(L, 1, &srcl);
	const char *p = luaL_checkstring(L, 2);
	int max_s = luaL_optint(L, 4, srcl+1);
	int anchor = 0;
	int n = 0;
	MatchState ms;
	luaL_Buffer b;
	if (*p == '^') {  /* anchored pattern: try only at the start */
		anchor = 1;
		p++;
	}
	luaL_buffinit(L, &b);
	ms.L = L;
	ms.src_init = src;
	ms.src_end = src+srcl;
	ms.mode = lua_tointeger(L, lua_upvalueindex(1));
	ms.mb = MODE_MBYTE(ms.mode);
	while (n < max_s) {
		const char *e;
		ms.level = 0;
		e = match(&ms, src, p);
		if (e != NULL) {
			n++;
			add_value(&ms, &b, src, e);
		}
		if (e != NULL && e > src) {
			src = e;	/* non-empty match: skip it */
		}
		else {
			if (src >= ms.src_end)
				break;
			luaL_addchar(&b, *src++);
		}
		if (anchor)
			break;
	}
	luaL_addlstring(&b, src, ms.src_end-src);  /* copy the unprocessed rest */
	luaL_pushresult(&b);
	lua_pushinteger(L, n);	/* number of substitutions */
	return 2;
}
1109 
1110 /* }====================================================== */
1111 
1112 
/*
** maximum size of each formatted item (> len(format('%99.99f', -1e308)));
** str_format uses it as the size of its per-item output buffer
*/
#define MAX_ITEM	512
/* valid flags in a format specification */
#define FLAGS  "-+ #0"
/*
** maximum size of each format specification (such as '%-099.99d')
** (+10 accounts for %99.99x plus margin of error);
** str_format uses it as the size of the buffer holding the rebuilt
** specification
*/
#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
1122 
1123 
/*
** Append argument `arg' (must be a string) to buffer `b' enclosed in
** double quotes. Double quote, backslash and newline are preceded by a
** backslash, carriage return becomes \r and an embedded zero byte
** becomes \000; every other byte is copied unchanged.
*/
static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {
	size_t remaining;
	const char *cur = luaL_checklstring(L, arg, &remaining);
	luaL_addchar(b, '"');
	for (; remaining > 0; remaining--, cur++) {
		const char c = *cur;
		if (c == '"' || c == '\\' || c == '\n') {
			luaL_addchar(b, '\\');
			luaL_addchar(b, c);
		}
		else if (c == '\r')
			luaL_addlstring(b, "\\r", 2);
		else if (c == '\0')
			luaL_addlstring(b, "\\000", 4);
		else
			luaL_addchar(b, c);
	}
	luaL_addchar(b, '"');
}
1152 
1153 
/*
** Scan one format specification starting right after its '%'.
**
** strfrmt      points at the first character after the '%';
** form         receives the rebuilt specification: L_ESC, the flags,
**              width and precision, and the conversion character,
**              NUL-terminated (the caller provides MAX_FORMAT bytes);
** hasprecision is set to 1 when a '.' precision part is present.
**
** Returns a pointer to the conversion character. Raises a Lua error
** when flags are repeated or width/precision exceed two digits.
*/
static const char *scanformat (lua_State *L, const char *strfrmt, char *form,
		int *hasprecision)
{
	const char *p = strfrmt;
	/*
	** skip flags; the explicit NUL test matters because strchr() also
	** "finds" the terminating '\0' of FLAGS, which would otherwise walk
	** past the end of a format string that ends in '%'
	*/
	while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++;
	if ((size_t)(p - strfrmt) >= sizeof(FLAGS))
		luaL_error(L, "invalid format (repeated flags)");
	if (isdigit(uchar(*p))) p++;	/* skip width */
	if (isdigit(uchar(*p))) p++;	/* (2 digits at most) */
	if (*p == '.') {
		p++;
		*hasprecision = 1;
		if (isdigit(uchar(*p))) p++;	/* skip precision */
		if (isdigit(uchar(*p))) p++;	/* (2 digits at most) */
	}
	if (isdigit(uchar(*p)))
		luaL_error(L, "invalid format (width or precision too long)");
	form[0] = L_ESC;
	/* copy everything up to and including the conversion character */
	strncpy(form+1, strfrmt, p-strfrmt+1);
	form[p-strfrmt+2] = 0;
	return p;
}
1176 
addintlen(char * form)1177 static void addintlen (char *form) {
1178 	size_t l = strlen(form);
1179 	char spec = form[l - 1];
1180 	strcpy(form + l - 1, LUA_INTFRMLEN);
1181 	form[l + sizeof(LUA_INTFRMLEN) - 2] = spec;
1182 	form[l + sizeof(LUA_INTFRMLEN) - 1] = '\0';
1183 }
1184 
/*
** `format': build a string from the format string (argument 1) and the
** following arguments.
**
** Plain characters are copied to the result and "%%" yields a single
** '%'. Every other '%' starts a format item that consumes the next
** argument: scanformat rebuilds the specification into `form', and the
** conversion character selects how the argument is converted. Items
** are formatted into `buff' with snprintf when LUA_USE_SNPRINTF is
** defined, with sprintf otherwise, and then appended to the result.
** '%q' and long '%s' values bypass `buff' and are appended directly.
** An unknown conversion character raises an error.
*/
static int str_format (lua_State *L) {
	int arg = 1;
	size_t sfl;
	const char *strfrmt = luaL_checklstring(L, arg, &sfl);
	const char *strfrmt_end = strfrmt+sfl;
	luaL_Buffer b;
	luaL_buffinit(L, &b);
	while (strfrmt < strfrmt_end) {
		if (*strfrmt != L_ESC)
			luaL_addchar(&b, *strfrmt++);
		else if (*++strfrmt == L_ESC)
			luaL_addchar(&b, *strfrmt++);	/* %% */
		else { /* format item */
			char form[MAX_FORMAT];	/* to store the format (`%...') */
			char buff[MAX_ITEM];	/* to store the formatted item */
			int hasprecision = 0;
			arg++;	/* each format item consumes the next argument */
			strfrmt = scanformat(L, strfrmt, form, &hasprecision);
			switch (*strfrmt++) {
				case 'c': {
#ifdef LUA_USE_SNPRINTF
					snprintf( buff, MAX_ITEM, form,
							(int) luaL_checknumber( L, arg ) );
#else
					sprintf(buff, form, (int) luaL_checknumber( L, arg ) );
#endif
					break;
				}
				case 'd': case 'i': {
					/* signed integer: add the length modifier first */
					addintlen( form );
#ifdef LUA_USE_SNPRINTF
					snprintf( buff, MAX_ITEM, form,
							(LUA_INTFRM_T) luaL_checknumber(L, arg) );
#else
					sprintf(buff, form,
							(LUA_INTFRM_T) luaL_checknumber(L, arg) );
#endif
					break;
				}
				case 'o': case 'u': case 'x': case 'X': {
					/* unsigned integer: add the length modifier first */
					addintlen( form );
#ifdef LUA_USE_SNPRINTF
					snprintf(buff, MAX_ITEM, form,
							(unsigned LUA_INTFRM_T) luaL_checknumber(L, arg) );
#else
					sprintf(buff, form,
							(unsigned LUA_INTFRM_T) luaL_checknumber(L, arg) );
#endif
					break;
				}
				case 'e': case 'E': case 'f':
				case 'g': case 'G': {
					/* floating point: rejected unless LUA_NUMBER_DOUBLE is defined */
#ifndef LUA_NUMBER_DOUBLE
					luaL_argerror( L, 1, "double formatting not supported" );
#else
#	ifdef __dietlibc__
#		warning "double formatting is broken in dietlibc"
#	endif
#	ifdef LUA_USE_SNPRINTF
					snprintf(buff, MAX_ITEM, form,
							(double) luaL_checknumber(L, arg) );
#	else
					sprintf(buff, form, (double) luaL_checknumber(L, arg) );
#	endif
#endif
					break;
				}
				case 'q': {
					/* quoted string: written straight into the result buffer */
					addquoted(L, &b, arg);
					continue;	/* skip the `addsize' at the end */
				}
				case 's': {
					size_t l;
					const char *s = luaL_checklstring(L, arg, &l);
					if (!hasprecision && l >= 100) {
						/* no precision and string is too long to be formatted;
							 keep original string */
						lua_pushvalue(L, arg);
						luaL_addvalue(&b);
						continue;	/* skip the `addsize' at the end */
					}
					else {
#ifdef LUA_USE_SNPRINTF
						snprintf(buff, MAX_ITEM, form, s);
#else
						sprintf(buff, form, s);
#endif
						break;
					}
				}
				default: {	/* also treat cases `pnLlh' */
              return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
                                   LUA_QL("format"), *(strfrmt - 1));
				}
			}
			/* strlen() is used here, so the item ends at the first NUL in buff */
			luaL_addlstring(&b, buff, strlen(buff));
		}
	}
	luaL_pushresult(&b);
	return 1;
}
1286 
1287 #ifdef WANT_EXT_MATCH
/*
** Name/value pairs for the matching modes; luaopen_unicode stores each
** pair as a field of the SLN_UNICODENAME ".mode" table.
*/
static struct { const char *k; int v; } unicflags[] = {
	 { "ASCII", MODE_ASCII }
	,{ "LATIN", MODE_LATIN }
	,{ "UTF8",  MODE_UTF8 }
	,{ "GRAPH", MODE_GRAPH }
};
/* number of entries in unicflags */
#define unicflags_sz ( sizeof( unicflags ) / sizeof( unicflags[0] ) )
1295 
1296 /*
1297 	allow direkt match calls from c
1298 */
ext_uni_match(void * state,const char * s,size_t n,const char * p,int init,int mode)1299 int ext_uni_match ( void *state, const char *s, size_t n,
1300 	const char *p, int init, int mode )
1301 {
1302 	lua_State *L = state;
1303 	MatchState ms;
1304 	int anchor = (*p == '^') ? (p++, 1) : 0;
1305 	const char *s1;
1306 	int i = posrelat( init, n ) - 1;
1307 	if (i < 0) i = 0;
1308 	else if ((size_t)(i) > n) i = (ptrdiff_t)n;
1309 	s1 = s + i;
1310 	ms.L = L;
1311 	ms.src_init = s;
1312 	ms.src_end = s + n;
1313 	ms.mode = mode;
1314 	ms.mb = MODE_MBYTE(mode);
1315 	do {
1316 		const char *res;
1317 		ms.level = 0;
1318 		if ( ( res=match(&ms, s1, p)) != NULL )
1319 			return 1;
1320 	} while ( s1++ < ms.src_end && !anchor );
1321 	return 0;
1322 }
1323 #endif
1324 
/*
** Function table of the library. luaopen_unicode registers it with one
** upvalue (the matching mode) into each of its per-mode tables, and
** uses the terminating {NULL, NULL} entry on its own as an empty list.
*/
static const luaL_Reg uniclib[] = {
	{"byte", unic_byte}, /* no cluster ! */
	{"char", unic_char},
	{"dump", str_dump},
	{"find", unic_find}, /* cluster */
	{"format", str_format},
	{"gfind", gfind_nodef},
	{"gmatch", gmatch}, /* cluster */
	{"gsub", unic_gsub}, /* cluster */
	{"len", unic_len}, /* cluster/byte opt. */
	{"lower", unic_lower},
	{"match", unic_match}, /* cluster */
	{"rep", str_rep},
	{"reverse", str_reverse},
	{"sub", unic_sub}, /* cluster/byte opt. */
	{"upper", unic_upper},
	{NULL, NULL}
};
1343 
1344 #if defined( SLNUNICODE_AS_STRING ) && defined( STRING_WITH_METAT )
/*
** Create a metatable, install it as the metatable of a string value and
** set its __index field to the value that is on top of the stack when
** the function is entered (presumably the string library table --
** confirm against the caller). The stack is left as it was found.
*/
static void createmetatable (lua_State *L) {
	lua_newtable(L);  /* create metatable for strings */
	lua_pushliteral(L, "");  /* dummy string */
	lua_pushvalue(L, -2);  /* copy of the metatable */
	lua_setmetatable(L, -2);  /* set string metatable */
	lua_pop(L, 1);  /* pop dummy string */
	lua_pushvalue(L, -2);  /* string library... */
	lua_setfield(L, -2, "__index");  /* ...is the __index metamethod */
	lua_pop(L, 1);  /* pop metatable */
}
1355 #endif
1356 
1357 /*
1358 ** Open string library
1359 */
luaopen_unicode(lua_State * L)1360 LUALIB_API int luaopen_unicode (lua_State *L) {
1361 	/* register unicode itself so require("unicode") works */
1362 	luaL_register(L, SLN_UNICODENAME,
1363 		uniclib + (sizeof uniclib/sizeof uniclib[0] - 1)); /* empty func list */
1364 	lua_pop(L, 1);
1365 	lua_getglobal(L,SLN_UNICODENAME);
1366 	lua_newtable(L);
1367 	lua_pushinteger(L, MODE_ASCII);
1368 	luaL_setfuncs(L, uniclib, 1);
1369 	lua_setfield(L, -2, "ascii");
1370 
1371 	lua_newtable(L);
1372 	lua_pushinteger(L, MODE_LATIN);
1373 	luaL_setfuncs(L, uniclib, 1);
1374 	lua_setfield(L, -2, "latin1");
1375 
1376 	lua_newtable(L);
1377 	lua_pushinteger(L, MODE_GRAPH);
1378 	luaL_setfuncs(L, uniclib, 1);
1379 	lua_setfield(L, -2, "grapheme");
1380 
1381 	lua_newtable(L);
1382 	lua_pushinteger(L, MODE_UTF8);
1383 	luaL_setfuncs(L, uniclib, 1);
1384 	lua_setfield(L, -2, "utf8");
1385 
1386 #ifdef WANT_EXT_MATCH
1387 	{
1388 		unsigned i;
1389 		const char *ln = SLN_UNICODENAME ".mode";
1390 		luaL_findtable( L, LUA_REGISTRYINDEX, "_LOADED", 1 );
1391 		lua_getfield( L, -1, ln );
1392 		if ( !lua_istable(L, -1 ) ) {
1393 			lua_pop( L, 1 );
1394 			if ( luaL_findtable( L, LUA_GLOBALSINDEX, ln, unicflags_sz ) )
1395 				luaL_error( L, "name conflict for module " LUA_QS, ln );
1396 			lua_pushvalue( L, -1 );
1397 			lua_setfield( L, -3, ln );
1398 		}
1399 		lua_remove( L, -2 );
1400 		for( i = 0; unicflags_sz > i; ++i ) {
1401 			lua_pushnumber( L, unicflags[i].v );
1402 			lua_setfield( L, -2, unicflags[i].k );
1403 		}
1404 	}
1405 #endif
1406 	return 1;
1407 }
1408 
1409