1 /*
2  * common/clib/utf8.c - Basic UTF8 character counting (wrapper for glib)
3  *
4  * Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
5  *
6  * This program is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
18  *
19  */
20 
21 #include "common/clib/utf8.h"
22 #include "luah.h"
23 
24 #include <glib.h>
25 
/* Translate a 1-based byte position into a 0-based byte offset.
 *
 * offset: 1-based position; a negative value counts from the end of the
 *         string (-1 is the last byte), 0 is never valid
 * length: string length in bytes
 *
 * The offset one past the last byte (== length) is accepted.
 * Returns the 0-based offset, or (size_t) -1 if the position is out of
 * range. */
static size_t
abspos(ssize_t offset, size_t length)
{
    const size_t out_of_range = (size_t) -1;
    ssize_t zero_based;

    if (offset > 0)
        zero_based = offset - 1;                /* counted from the front */
    else if (offset < 0)
        zero_based = offset + (ssize_t) length; /* counted from the back */
    else
        return out_of_range;

    if (zero_based < 0)
        return out_of_range;
    if ((size_t) zero_based > length)
        return out_of_range;
    return (size_t) zero_based;
}
38 
39 /* UTF8 aware string length computing.
40  * Returns the number of elements pushed on the stack. */
41 static gint
luaH_utf8_len(lua_State * L)42 luaH_utf8_len(lua_State *L)
43 {
44     size_t blen;
45     const gchar *str = luaL_checklstring(L, 1, &blen);
46 
47     /* parse optional begin/end parameters
48      * raise an error if out of bounds */
49     size_t bbeg = abspos(luaL_optinteger(L, 2, 1), blen);
50     luaL_argcheck(L, bbeg != (size_t) -1, 2, "initial position out of string");
51     /* setting end position requires extra work to imitate Lua 5.3 */
52     size_t bend = bbeg;
53     ssize_t sbend = luaL_optinteger(L, 3, blen); /* may be negative */
54     sbend = (sbend >= 0) ? sbend - 1 : sbend + (ssize_t) blen;
55     luaL_argcheck(L, sbend < (ssize_t) blen, 3, "final position out of string");
56     if (sbend >= (ssize_t) bbeg && (size_t) sbend < blen)
57         bend = g_utf8_find_next_char(str + (size_t) sbend, NULL) - str;
58 
59     /* is the string valid UTF8? */
60     gchar *valend;
61     if (!g_utf8_validate(str + bbeg, bend - bbeg, (const gchar **) &valend)) {
62         lua_pushnil(L);
63         lua_pushinteger(L, (ssize_t) (valend - str) + 1);
64         return 2;
65     }
66 
67     lua_pushinteger(L, (ssize_t) g_utf8_strlen(str + bbeg, bend - bbeg));
68     return 1;
69 }
70 
71 /* UTF8 aware string offset conversion.
72  * Converts (1-based) UTF8 offset to (1-based) byte offset.
73  * Returns the number of elements pushed on the stack. */
74 static gint
luaH_utf8_offset(lua_State * L)75 luaH_utf8_offset(lua_State *L)
76 {
77     size_t blen;
78     const gchar *str = luaL_checklstring(L, 1, &blen);
79     ssize_t widx = luaL_checkinteger(L, 2);
80     if (widx > 0) widx--; /* adjust to 0-based */
81 
82     /* parse optional parameter (base index)
83      * raise an error if out of bounds
84      * or if initial position points inside a UTF8 encoding */
85     size_t bbase;
86     bbase = luaL_optinteger(L, 3, (widx>=0) ? 1 : blen + 1);
87     bbase = abspos(bbase, blen);
88     luaL_argcheck(L, bbase != (size_t) -1, 3, "position out of range");
89     if (g_utf8_get_char_validated(str + bbase, -1) == (gunichar) -1)
90         luaL_error(L, "initial position is a continuation byte");
91 
92     /* convert negative index parameter to positive */
93     size_t wseglen;
94     size_t bbeg = 0;
95     if (widx < 0) {
96         wseglen = g_utf8_strlen(str, bbase);
97         widx += wseglen;
98     } else {
99         wseglen = g_utf8_strlen(str + bbase, blen - bbase);
100         bbeg = bbase;
101     }
102 
103     /* convert positive UTF8 offset to byte offset */
104     ssize_t ret = 0;
105     if (widx >= 0 && (size_t) widx <= wseglen) {
106         gchar *pos = g_utf8_offset_to_pointer(str + bbeg, widx);
107         if (pos != NULL)
108             ret = (ssize_t) (pos - str) + 1;
109     }
110 
111     /* if conversion was successful, output result (else output nil) */
112     if (ret > 0)
113         lua_pushinteger(L, ret);
114     else
115         lua_pushnil(L);
116     return 1;
117 }
118 
119 void
utf8_lib_setup(lua_State * L)120 utf8_lib_setup(lua_State *L)
121 {
122     static const struct luaL_Reg utf8_lib[] =
123     {
124         { "len", luaH_utf8_len },
125         { "offset", luaH_utf8_offset },
126         { NULL, NULL }
127     };
128 
129     luaH_openlib(L, "utf8", utf8_lib, utf8_lib);
130 
131     lua_getglobal(L, "utf8");
132     lua_pushstring(L, "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*");
133     lua_setfield(L, -2, "charpattern");
134     lua_pop(L, 1);
135 }
136 
137 // vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
138