1 /*
2 * common/clib/utf8.c - Basic UTF8 character counting (wrapper for glib)
3 *
4 * Copyright © 2017 Dennis Hofheinz <github@kjdf.de>
5 *
6 * This program is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program. If not, see <http://www.gnu.org/licenses/>.
18 *
19 */
20
21 #include "common/clib/utf8.h"
22 #include "luah.h"
23
24 #include <glib.h>
25
/* Translate a 1-based byte position into a 0-based byte offset.
 * A negative position is counted from the end of the string
 * (-1 designates the last byte).
 * The resulting offset may be equal to length (one past the last byte).
 * Position 0 and every position outside that range yield (size_t) -1. */
static size_t
abspos(ssize_t offset, size_t length)
{
    ssize_t zero_based;

    if (offset > 0)
        zero_based = offset - 1;
    else if (offset < 0)
        zero_based = offset + (ssize_t) length;
    else
        return (size_t) -1; /* there is no position 0 */

    if (zero_based >= 0 && (size_t) zero_based <= length)
        return (size_t) zero_based;

    return (size_t) -1;
}
38
39 /* UTF8 aware string length computing.
40 * Returns the number of elements pushed on the stack. */
41 static gint
luaH_utf8_len(lua_State * L)42 luaH_utf8_len(lua_State *L)
43 {
44 size_t blen;
45 const gchar *str = luaL_checklstring(L, 1, &blen);
46
47 /* parse optional begin/end parameters
48 * raise an error if out of bounds */
49 size_t bbeg = abspos(luaL_optinteger(L, 2, 1), blen);
50 luaL_argcheck(L, bbeg != (size_t) -1, 2, "initial position out of string");
51 /* setting end position requires extra work to imitate Lua 5.3 */
52 size_t bend = bbeg;
53 ssize_t sbend = luaL_optinteger(L, 3, blen); /* may be negative */
54 sbend = (sbend >= 0) ? sbend - 1 : sbend + (ssize_t) blen;
55 luaL_argcheck(L, sbend < (ssize_t) blen, 3, "final position out of string");
56 if (sbend >= (ssize_t) bbeg && (size_t) sbend < blen)
57 bend = g_utf8_find_next_char(str + (size_t) sbend, NULL) - str;
58
59 /* is the string valid UTF8? */
60 gchar *valend;
61 if (!g_utf8_validate(str + bbeg, bend - bbeg, (const gchar **) &valend)) {
62 lua_pushnil(L);
63 lua_pushinteger(L, (ssize_t) (valend - str) + 1);
64 return 2;
65 }
66
67 lua_pushinteger(L, (ssize_t) g_utf8_strlen(str + bbeg, bend - bbeg));
68 return 1;
69 }
70
71 /* UTF8 aware string offset conversion.
72 * Converts (1-based) UTF8 offset to (1-based) byte offset.
73 * Returns the number of elements pushed on the stack. */
74 static gint
luaH_utf8_offset(lua_State * L)75 luaH_utf8_offset(lua_State *L)
76 {
77 size_t blen;
78 const gchar *str = luaL_checklstring(L, 1, &blen);
79 ssize_t widx = luaL_checkinteger(L, 2);
80 if (widx > 0) widx--; /* adjust to 0-based */
81
82 /* parse optional parameter (base index)
83 * raise an error if out of bounds
84 * or if initial position points inside a UTF8 encoding */
85 size_t bbase;
86 bbase = luaL_optinteger(L, 3, (widx>=0) ? 1 : blen + 1);
87 bbase = abspos(bbase, blen);
88 luaL_argcheck(L, bbase != (size_t) -1, 3, "position out of range");
89 if (g_utf8_get_char_validated(str + bbase, -1) == (gunichar) -1)
90 luaL_error(L, "initial position is a continuation byte");
91
92 /* convert negative index parameter to positive */
93 size_t wseglen;
94 size_t bbeg = 0;
95 if (widx < 0) {
96 wseglen = g_utf8_strlen(str, bbase);
97 widx += wseglen;
98 } else {
99 wseglen = g_utf8_strlen(str + bbase, blen - bbase);
100 bbeg = bbase;
101 }
102
103 /* convert positive UTF8 offset to byte offset */
104 ssize_t ret = 0;
105 if (widx >= 0 && (size_t) widx <= wseglen) {
106 gchar *pos = g_utf8_offset_to_pointer(str + bbeg, widx);
107 if (pos != NULL)
108 ret = (ssize_t) (pos - str) + 1;
109 }
110
111 /* if conversion was successful, output result (else output nil) */
112 if (ret > 0)
113 lua_pushinteger(L, ret);
114 else
115 lua_pushnil(L);
116 return 1;
117 }
118
119 void
utf8_lib_setup(lua_State * L)120 utf8_lib_setup(lua_State *L)
121 {
122 static const struct luaL_Reg utf8_lib[] =
123 {
124 { "len", luaH_utf8_len },
125 { "offset", luaH_utf8_offset },
126 { NULL, NULL }
127 };
128
129 luaH_openlib(L, "utf8", utf8_lib, utf8_lib);
130
131 lua_getglobal(L, "utf8");
132 lua_pushstring(L, "[%z\1-\x7F\xC2-\xF4][\x80-\xBF]*");
133 lua_setfield(L, -2, "charpattern");
134 lua_pop(L, 1);
135 }
136
137 // vim: ft=c:et:sw=4:ts=8:sts=4:tw=80
138