1 /*
2 ** $Id: lutf8lib.c,v 1.13 2014/11/02 19:19:04 roberto Exp $
3 ** Standard library for UTF-8 manipulation
4 ** Modified by the LOVE Development Team to work with Lua 5.1's API
5 */
6 
7 /******************************************************************************
8  * Copyright (C) 1994-2015 Lua.org, PUC-Rio, 2015 LOVE Development Team.
9  *
10  * Permission is hereby granted, free of charge, to any person obtaining
11  * a copy of this software and associated documentation files (the
12  * "Software"), to deal in the Software without restriction, including
13  * without limitation the rights to use, copy, modify, merge, publish,
14  * distribute, sublicense, and/or sell copies of the Software, and to
15  * permit persons to whom the Software is furnished to do so, subject to
16  * the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be
19  * included in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
24  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
25  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
26  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
27  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28  ******************************************************************************/
29 
30 #define lutf8lib_c
31 
32 #include "lprefix.h"
33 
34 
35 #include <assert.h>
36 #include <stdlib.h>
37 #include <string.h>
38 
39 #include "lutf8lib.h"
40 
41 #include "lauxlib.h"
42 #include "lualib.h"
43 
/* upper bound on code points accepted by utf8_decode and pushutfchar */
#define MAXUNICODE	0x10FFFF

/* size of buffer for 'utf8esc' function (taken from lobject.h) */
#define UTF8BUFFSZ	8

/* true when the byte at 'p' has the bit pattern 10xxxxxx (continuation byte) */
#define iscont(p)	((*(p) & 0xC0) == 0x80)
50 
51 
52 /* from strlib */
53 /* translate a relative string position: negative means back from end */
u_posrelat(lua_Integer pos,size_t len)54 static lua_Integer u_posrelat (lua_Integer pos, size_t len) {
55   if (pos >= 0) return pos;
56   else if (0u - (size_t)pos > len) return 0;
57   else return (lua_Integer)len + pos + 1;
58 }
59 
60 
61 /*
62 ** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
63 */
utf8_decode(const char * o,int * val)64 static const char *utf8_decode (const char *o, int *val) {
65   static unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
66   const unsigned char *s = (const unsigned char *)o;
67   unsigned int c = s[0];
68   unsigned int res = 0;  /* final result */
69   if (c < 0x80)  /* ascii? */
70     res = c;
71   else {
72     int count = 0;  /* to count number of continuation bytes */
73     while (c & 0x40) {  /* still have continuation bytes? */
74       int cc = s[++count];  /* read next byte */
75       if ((cc & 0xC0) != 0x80)  /* not a continuation byte? */
76         return NULL;  /* invalid byte sequence */
77       res = (res << 6) | (cc & 0x3F);  /* add lower 6 bits from cont. byte */
78       c <<= 1;  /* to test next bit */
79     }
80     res |= ((c & 0x7F) << (count * 5));  /* add first byte */
81     if (count > 3 || res > MAXUNICODE || res <= limits[count])
82       return NULL;  /* invalid byte sequence */
83     s += count;  /* skip continuation bytes read */
84   }
85   if (val) *val = res;
86   return (const char *)s + 1;  /* +1 to include first byte */
87 }
88 
89 
90 /*
91 ** utf8len(s [, i [, j]]) --> number of characters that start in the
92 ** range [i,j], or nil + current position if 's' is not well formed in
93 ** that interval
94 */
utflen(lua_State * L)95 static int utflen (lua_State *L) {
96   int n = 0;
97   size_t len;
98   const char *s = luaL_checklstring(L, 1, &len);
99   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
100   lua_Integer posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
101   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 2,
102                    "initial position out of string");
103   luaL_argcheck(L, --posj < (lua_Integer)len, 3,
104                    "final position out of string");
105   while (posi <= posj) {
106     const char *s1 = utf8_decode(s + posi, NULL);
107     if (s1 == NULL) {  /* conversion error? */
108       lua_pushnil(L);  /* return nil ... */
109       lua_pushinteger(L, posi + 1);  /* ... and current position */
110       return 2;
111     }
112     posi = s1 - s;
113     n++;
114   }
115   lua_pushinteger(L, n);
116   return 1;
117 }
118 
119 
120 /*
121 ** codepoint(s, [i, [j]])  -> returns codepoints for all characters
122 ** that start in the range [i,j]
123 */
codepoint(lua_State * L)124 static int codepoint (lua_State *L) {
125   size_t len;
126   const char *s = luaL_checklstring(L, 1, &len);
127   lua_Integer posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
128   lua_Integer pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
129   int n;
130   const char *se;
131   luaL_argcheck(L, posi >= 1, 2, "out of range");
132   luaL_argcheck(L, pose <= (lua_Integer)len, 3, "out of range");
133   if (posi > pose) return 0;  /* empty interval; return no values */
134   n = (int)(pose -  posi + 1);
135   if (posi + n <= pose)  /* (lua_Integer -> int) overflow? */
136     return luaL_error(L, "string slice too long");
137   luaL_checkstack(L, n, "string slice too long");
138   n = 0;
139   se = s + pose;
140   for (s += posi - 1; s < se;) {
141     int code;
142     s = utf8_decode(s, &code);
143     if (s == NULL)
144       return luaL_error(L, "invalid UTF-8 code");
145     lua_pushinteger(L, code);
146     n++;
147   }
148   return n;
149 }
150 
151 
152 /* taken from lobject.c */
utf8esc(char * buff,unsigned long x)153 static int utf8esc (char *buff, unsigned long x) {
154 	int n = 1;  /* number of bytes put in buffer (backwards) */
155 	lua_assert(x <= 0x10FFFF);
156 	if (x < 0x80)  /* ascii? */
157 		buff[UTF8BUFFSZ - 1] = (char) x;
158 	else {  /* need continuation bytes */
159 		unsigned int mfb = 0x3f;  /* maximum that fits in first byte */
160 		do {  /* add continuation bytes */
161 			buff[UTF8BUFFSZ - (n++)] = (char) (0x80 | (x & 0x3f));
162 			x >>= 6;  /* remove added bits */
163 			mfb >>= 1;  /* now there is one less bit available in first byte */
164 		} while (x > mfb);  /* still needs continuation byte? */
165 		buff[UTF8BUFFSZ - n] = (char) ((~mfb << 1) | x);  /* add first byte */
166 	}
167 	return n;
168 }
169 
/*
** Push onto the Lua stack the UTF-8 encoding of the integer found at
** argument index 'arg'. Raises an argument error if the value is not in
** the range [0, MAXUNICODE].
*/
static void pushutfchar (lua_State *L, int arg) {
  char buff[UTF8BUFFSZ];
  int nbytes;
  lua_Integer code = luaL_checkinteger(L, arg);
  luaL_argcheck(L, code >= 0 && code <= MAXUNICODE, arg, "value out of range");

  /* the %U string format does not exist in lua 5.1 or 5.2, so we emulate it */
  /* (code from luaO_pushvfstring in lobject.c) */
  nbytes = utf8esc(buff, (long) code);
  /* utf8esc fills the buffer from its end, so the bytes start here */
  lua_pushlstring(L, buff + UTF8BUFFSZ - nbytes, nbytes);
}
180 
181 
182 /*
183 ** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
184 */
utfchar(lua_State * L)185 static int utfchar (lua_State *L) {
186   int n = lua_gettop(L);  /* number of arguments */
187   if (n == 1)  /* optimize common case of single char */
188     pushutfchar(L, 1);
189   else {
190     int i;
191     luaL_Buffer b;
192     luaL_buffinit(L, &b);
193     for (i = 1; i <= n; i++) {
194       pushutfchar(L, i);
195       luaL_addvalue(&b);
196     }
197     luaL_pushresult(&b);
198   }
199   return 1;
200 }
201 
202 
203 /*
204 ** offset(s, n, [i])  -> index where n-th character counting from
205 **   position 'i' starts; 0 means character at 'i'.
206 */
byteoffset(lua_State * L)207 static int byteoffset (lua_State *L) {
208   size_t len;
209   const char *s = luaL_checklstring(L, 1, &len);
210   lua_Integer n  = luaL_checkinteger(L, 2);
211   lua_Integer posi = (n >= 0) ? 1 : len + 1;
212   posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
213   luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3,
214                    "position out of range");
215   if (n == 0) {
216     /* find beginning of current byte sequence */
217     while (posi > 0 && iscont(s + posi)) posi--;
218   }
219   else {
220     if (iscont(s + posi))
221       luaL_error(L, "initial position is a continuation byte");
222     if (n < 0) {
223        while (n < 0 && posi > 0) {  /* move back */
224          do {  /* find beginning of previous character */
225            posi--;
226          } while (posi > 0 && iscont(s + posi));
227          n++;
228        }
229      }
230      else {
231        n--;  /* do not move for 1st character */
232        while (n > 0 && posi < (lua_Integer)len) {
233          do {  /* find beginning of next character */
234            posi++;
235          } while (iscont(s + posi));  /* (cannot pass final '\0') */
236          n--;
237        }
238      }
239   }
240   if (n == 0)  /* did it find given character? */
241     lua_pushinteger(L, posi + 1);
242   else  /* no such character */
243     lua_pushnil(L);
244   return 1;
245 }
246 
247 
/*
** Stepping function for the iterator built by 'iter_codes'.
** Argument 1 is the string and argument 2 the 1-based position of the
** previously returned character (0 before the first step). Returns the
** position and code point of the next character, or nothing when the
** string is exhausted. Raises an error on invalid UTF-8.
*/
static int iter_aux (lua_State *L) {
  size_t len;
  const char *s = luaL_checklstring(L, 1, &len);
  lua_Integer off = lua_tointeger(L, 2) - 1;  /* 0-based offset */
  const char *after;
  int code;
  if (off < 0)  /* first iteration? */
    off = 0;  /* start from here */
  else if (off < (lua_Integer)len) {
    /* skip current byte and its continuations */
    do {
      off++;
    } while (iscont(s + off));
  }
  if (off >= (lua_Integer)len)
    return 0;  /* no more codepoints */
  after = utf8_decode(s + off, &code);
  /* a continuation byte right after a complete sequence is also invalid */
  if (after == NULL || iscont(after))
    return luaL_error(L, "invalid UTF-8 code");
  lua_pushinteger(L, off + 1);
  lua_pushinteger(L, code);
  return 2;
}
270 
271 
/*
** codes(s) -> the three values driving a generic 'for' over the string:
** the stepping function, the string itself, and 0 as initial position.
*/
static int iter_codes (lua_State *L) {
  (void)luaL_checkstring(L, 1);    /* only validates the argument */
  lua_pushcfunction(L, iter_aux);  /* iterator function */
  lua_pushvalue(L, 1);             /* state: the subject string */
  lua_pushinteger(L, 0);           /* control: iter_aux starts from the first byte */
  return 3;
}
279 
280 
/*
** pattern to match a single UTF-8 character: one byte in the ranges
** 0x00-0x7F or 0xC2-0xF4, followed by any number of bytes in 0x80-0xBF
*/
#if LUA_VERSION_NUM >= 502
#define UTF8PATT	"[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
#else
/* lua 5.1 doesn't support literal null bytes in patterns, */
/* so the zero byte is written as the '%z' class instead */
#define UTF8PATT	"[%z\x01-\x7F\xC2-\xF4][\x80-\xBF]*"
#endif
288 
289 
/*
** Entries of the library table, terminated by a {NULL, NULL} sentinel.
** An entry with a NULL function is not registered by the loop in
** luaopen_luautf8; it only counts toward the table size computed there.
*/
static struct luaL_Reg funcs[] = {
  {"offset", byteoffset},
  {"codepoint", codepoint},
  {"char", utfchar},
  {"len", utflen},
  {"codes", iter_codes},
  /* placeholders */
  {"charpattern", NULL},  /* set to the UTF8PATT string in luaopen_luautf8 */
  {NULL, NULL}
};
300 
301 
302 /* modified version of luaopen_utf8, designed to work with lua 5.1-5.3 */
luaopen_luautf8(lua_State * L)303 int luaopen_luautf8 (lua_State *L) {
304   luaL_Reg *l;
305   lua_createtable(L, 0, (int) (sizeof(funcs) / sizeof(luaL_Reg)) - 1);
306   for (l = funcs; l->name != NULL; l++) {
307     if (l->func != NULL) {
308       lua_pushcfunction(L, l->func);
309       lua_setfield(L, -2, l->name);
310     }
311   }
312   lua_pushliteral(L, UTF8PATT);
313   lua_setfield(L, -2, "charpattern");
314   return 1;
315 }
316 
317