1 /* nl-utf8.c --- functions for UTF-8 unicode support
2 
3     Copyright (C) 2016 Lutz Mueller
4 
5     This program is free software: you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation, either version 3 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 
18 //
19 // portions are copied from pcre.c by: Philip Hazel <ph10@cam.ac.uk>
20 // and Copyright (c) 1997-2003 University of Cambridge
21 //
22 */
23 
24 
25 #include "newlisp.h"
26 #include <wchar.h>
27 #include <wctype.h>
28 #include "protos.h"
29 
30 /* from win-path.c */
31 CELL * utf8_from_mbcs(void * mbcs_str);
32 
33 /*************************************************
34 *    Macros and tables for character handling    *
35 *        by Philip Hazel <ph10@cam.ac.uk>        *
36 *************************************************/
37 
/* These are the breakpoints for different numbers of bytes in a UTF-8
character: entry i is the largest value that wchar_utf8() encodes with
i additional bytes (i + 1 bytes in total). */

static const int utf8_table1[] =
  { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
43 
/* These are the indicator bits (utf8_table2) and the mask for the data bits
(utf8_table3) of the first byte of a character, both indexed by the number
of additional bytes. utf8_table2 is OR-ed into the first byte when encoding;
utf8_table3 is AND-ed with the first byte when decoding. */

static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
49 
/* Table of the number of additional bytes, indexed by the first byte of a
character masked with 0x3f. It is only consulted for bytes whose two top bits
are set ((c & 0xc0) == 0xc0). The highest index for a valid UTF-8 first byte
is in fact 0x3d. */

static const char utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
59 
/* Get the next UTF-8 character, advancing the pointer. This is called when we
know we are in UTF-8 mode.

  c      lvalue that receives the decoded character value
  eptr   pointer lvalue into the UTF-8 text; it is left pointing just past
         the bytes that were consumed

Caveats:
  - The expansion is a plain statement sequence (an assignment followed by an
    if block), not a do { } while (0) wrapper, so it must not be used as the
    unbraced body of an if/else or a loop.
  - Both arguments are evaluated more than once.
  - The additional bytes are consumed without checking for a zero terminator,
    so a truncated sequence at the end of a string is read past its end. */

#define GETCHARINC(c, eptr) \
  c = (unsigned char)*eptr++; \
  if ((c & 0xc0) == 0xc0) \
    { \
    int gcaa = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int gcss = 6*gcaa; \
    c = (c & utf8_table3[gcaa]) << gcss; \
    while (gcaa-- > 0) \
      { \
      gcss -= 6; \
      c |= (*eptr++ & 0x3f) << gcss; \
      } \
    }
76 
77 /* This function takes an integer value in the range 0 - 0x7fffffff
78 and encodes it as a UTF-8 character in 0 to 6 bytes.
79 
80 Arguments:
81   cvalue     the character value
82   buffer     pointer to buffer for result - at least 6 bytes long
83 
84 Returns:     number of characters placed in the buffer
85 */
86 
wchar_utf8(int cvalue,char * buffer)87 int wchar_utf8(int cvalue, char *buffer)
88 {
89 register int i, j;
90 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
91   if (cvalue <= utf8_table1[i]) break;
92 buffer += i;
93 for (j = i; j > 0; j--)
94  {
95  *buffer-- = 0x80 | (cvalue & 0x3f);
96  cvalue >>= 6;
97  }
98 *buffer = utf8_table2[i] | cvalue;
99 return i + 1;
100 }
101 
102 
/* ---------------------- UTF-8 utility functions -------------------------- */
104 
/* get utf8 string from unicode wide character
 *
 * int wchar_utf8(int wchar, char * utf8str)
 *
 * the string is not null-terminated, to allow contiguous filling
 * of longer strings
 * returns number of bytes placed in utf8str
*/
113 
114 
/* decode the first character of a utf8 string
 *
 * utf8str   the UTF-8 text to read from
 * chr       receives the decoded wide character
 *
 * returns the utf8 string pointer advanced past the bytes consumed
*/

char * utf8_wchar(char * utf8str, int * chr)
{
int wc;

GETCHARINC(wc, utf8str);
*chr = wc;

return(utf8str);
}
125 
126 /* return the number of characters encoded in utf8 string
127  * without counting the zero terminator
128  * new limit param in 10.4.5 to avoid overrun on invalid utf8 strings
129 */
130 
utf8_wlen(char * utf8str,char * limit)131 size_t utf8_wlen(char * utf8str, char * limit)
132 {
133 int gcaa;
134 int c;
135 size_t count = 0;
136 
137 while((c = *utf8str++) != 0 && utf8str < limit)
138     {
139     count++;
140     if ((c & 0xc0) == 0xc0)
141         {
142         gcaa = utf8_table4[c & 0x3f];
143         utf8str += gcaa;
144         }
145     }
146 
147 /* now handled by while loop 10.6.1
148 if(utf8str > limit)
149     errorProc(ERR_INVALID_UTF8);
150 */
151 
152 return(count);
153 }
154 
155 /* return ptr to character at index, added by LM 2012-06-10
156 */
157 
utf8_index(char * utf8str,int idx)158 char * utf8_index(char * utf8str, int idx)
159 {
160 int c;
161 
162 while((c = *utf8str) != 0 && idx-- != 0)
163     {
164     if ((c & 0xc0) == 0xc0)
165         utf8str += utf8_table4[c & 0x3f] + 1;
166     else
167         utf8str++;
168     }
169 
170 return(utf8str);
171 }
172 
173 
174 /* return the length of the first utf8 character
175 */
176 
utf8_1st_len(char * utf8str)177 int utf8_1st_len(char * utf8str)
178 {
179 int c;
180 
181 if((c = *utf8str) != 0)
182     {
183     if((c & 0xc0) == 0xc0)
184         return(utf8_table4[c & 0x3f] + 1);
185     else return(1);
186     }
187 
188 return(0);
189 }
190 
191 
192 /* convert utf8 string to vector of maxwc wide characters
193  * unicode vector is zero terminated
194  * return number of unicode characters (excluding zero int)
195 */
196 
utf8_wstr(int * unicode,char * utf8str,int maxwc)197 int utf8_wstr(int * unicode, char * utf8str, int maxwc)
198 {
199 int wchar;
200 int count = 0;
201 
202 while(maxwc-- && *utf8str != 0)
203     {
204     count++;
205     GETCHARINC(wchar, utf8str);
206 /*  utf8str = utf8_wchar(utf8str, &wchar); */
207     *(unicode++) = wchar;
208     }
209 *unicode = 0;
210 
211 return(count);
212 }
213 
/* convert zero terminated unicode vector into utf8 string
 * return number of bytes stored in utf8 string excluding terminator
 * don't use more than maxstr bytes (excluding zero terminator)
 *
 * utf8str   receives the result; needs room for maxstr + 1 bytes
 * unicode   zero terminated vector of wide characters
 * maxstr    maximum number of bytes to store, not counting the terminator
 *
 * Each character is encoded into a scratch buffer first and only copied
 * when it fits completely, so the output never exceeds maxstr bytes and
 * never ends in a partial character. Conversion stops at the first
 * character that does not fit.
*/

int wstr_utf8(char * utf8str, int * unicode, int maxstr)
{
char encoded[6];   /* wchar_utf8() places at most 6 bytes */
int len, i, size = 0;

while(*unicode != 0)
    {
    len = wchar_utf8(*unicode, encoded);
    if(len > maxstr - size)   /* would exceed maxstr */
        break;

    for(i = 0; i < len; i++)
        *utf8str++ = encoded[i];

    size += len;
    unicode++;
    }

*utf8str = 0;

return(size);
}
235 
236 /* -------------------------------------- newLISP API -----------------------------------*/
237 
/* convert the UTF-8 string in params into a string cell holding a
 * vector of int wide characters, including the terminating zero int
 *
 * The work buffer is sized for one int per input byte plus the zero int,
 * then shrunk to the number of characters actually decoded.
 * NOTE(review): the one extra byte requested from reallocMemory() is not
 * written here - confirm whether makeStringCell() relies on its value.
*/

CELL * p_unicode(CELL * params)
{
char * source;
size_t length;
size_t bytes;
int * wide;

getStringSize(params, &source, &length, TRUE);
wide = allocMemory((length + 1) * sizeof(int));

/* length becomes the number of wide characters decoded */
length = utf8_wstr(wide, source, length);
bytes = (length + 1) * sizeof(int);
wide = reallocMemory(wide, bytes + 1);

return(makeStringCell((char *)wide, bytes));
}
258 
259 
/* convert the string in params, taken as a zero terminated vector of
 * int wide characters, into a UTF-8 string cell
 *
 * On WINDOWS, when the flag following the string is set, the string is
 * handed to utf8_from_mbcs() instead.
 * NOTE(review): the vector is read until a zero int is found or the output
 * limit is reached; this presumably expects input produced by p_unicode() -
 * confirm behavior for strings without a zero int.
*/

CELL * p_utf8(CELL * params)
{
int * wide;
size_t length;
char * result;

params = getStringSize(params, (void *)&wide, &length, TRUE);
#ifdef WINDOWS
if(getFlag(params)) /* its a MBCS string */
    return(utf8_from_mbcs((void *)wide));
#endif

result = callocMemory(length * UTF8_MAX_BYTES + 1);

/* length becomes the number of UTF-8 bytes stored */
length = wstr_utf8(result, wide, length);
result = reallocMemory(result, length + 1);
result[length] = 0;

return(makeStringCell(result, length));
}
281 
282 
/* return the number of UTF-8 characters in the string in params
 *
 * the scan limit handed to utf8_wlen() is one past the byte that
 * follows the string contents
*/

CELL * p_utf8len(CELL * params)
{
char * text;
char * limit;
size_t length;

getStringSize(params, &text, &length, TRUE);
limit = text + length + 1;

return(stuffInteger(utf8_wlen(text, limit)));
}
292 
293 /* reads a UTF-8 character */
p_readUTF8(CELL * params)294 CELL * p_readUTF8(CELL * params)
295 {
296 UINT handle;
297 int utf8C, gcaa, gcss;
298 char chr;
299 
300 getInteger(params, &handle);
301 if(read((int)handle, &chr, 1) <= 0)
302     return(nilCell);
303 
304 utf8C = chr;
305 
306 if((chr & 0xc0) == 0xc0)
307     {
308     gcaa = utf8_table4[chr & 0x3f];  /* Number of additional bytes */
309     gcss = 6*gcaa;
310     utf8C = (chr & utf8_table3[gcaa]) << gcss;
311     while (gcaa-- > 0) \
312         {
313         gcss -= 6;
314 
315         if(read((int)handle, &chr, 1) <= 0)
316             return(nilCell);
317 
318         utf8C |= (chr & 0x3f) << gcss;
319         }
320     }
321 
322 return(stuffInteger(utf8C));
323 }
324 
325 /* eof */
326