1 /*
2 * Encodings (character sets and conversions) for CLISP
3 * Bruno Haible 1998-2008, 2017-2018
4 * Sam Steingold 1998-2009, 2011, 2017
5 */
6
7 #include "lispbibl.c"
8
9 #ifdef ENABLE_UNICODE
10 #include <localcharset.h> /* from gnulib */
11 #endif
12
13 /* =========================================================================
14 * Individual encodings */
15
16 #ifdef ENABLE_UNICODE
17
18 /* NOTE 1! The mblen function has to be consistent with the mbstowcs function
19 (when called with stream = nullobj).
20 The wcslen function has to be consistent with the wcstombs function (when
21 called with stream = nullobj). */
22
23 /* NOTE 2! The conversion from bytes to characters (mbstowcs function) is
24 subject to the following restriction: At most one byte lookahead is needed.
25 This means, when someone calls mbstowcs for converting one character, and
26 he tries it with 1 byte, then with one more byte, then with one more byte,
27 and so on: when the conversion succeeds for the first time, it will leave at
28 most one byte in the buffer. stream.d (rd_ch_buffered, rd_ch_array_buffered)
29 heavily depend on this. */
30
31 /* --------------------------------------------------------------------------
32 * base64 http://rfc.net/rfc2045.html */
33
34 global uintL base64_mblen (object encoding, const uintB* src,
35 const uintB* srcend);
36 global void base64_mbstowcs (object encoding, object stream,
37 const uintB* *srcp, const uintB* srcend,
38 chart* *destp, chart* destend);
39 global uintL base64_wcslen (object encoding, const chart* src,
40 const chart* srcend);
41 global void base64_wcstombs (object encoding, object stream,
42 const chart* *srcp, const chart* srcend,
43 uintB* *destp, uintB* destend);
44 global object base64_range (object encoding, uintL start, uintL end,
45 uintL maxintervals);
46
47 static const char base64_table[64] = {
48 'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P',
49 'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f',
50 'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v',
51 'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/'
52 };
53 static const signed char table_base64[128] = {
54 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* -1- 9 */
55 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 10- 19 */
56 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 20- 29 */
57 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 30- 39 */
58 -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, /* 40- 49 */
59 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, /* 50- 59 */
60 -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, /* 60- 69 */
61 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, /* 70- 79 */
62 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, /* 80- 89 */
63 25, -1, -1, -1, -1, -1, -1, 26, 27, 28, /* 90- 99 */
64 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, /* 100-109 */
65 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, /* 110-119 */
66 49, 50, 51, -1, -1, -1, -1, -1 /* 120-127 */
67 };
68 /* alternatively:
69 var int pos = 0;
70 while (pos < sizeof(table_base64)) table_base64[pos++] = -1;
71 for (pos = 0; pos < sizeof(base64_table); pos++)
72 table_base64[base64_table[pos]] = pos;
73 */
74
75 #define MIME_LINE_LENGTH 76
76
77 typedef enum { le_unix, le_mac, le_dos } line_end_t;
enc_eol_to_le(object enc_eol)78 local line_end_t enc_eol_to_le (object enc_eol) {
79 if (eq(enc_eol,S(Kunix))) return le_unix;
80 if (eq(enc_eol,S(Kdos))) return le_dos;
81 if (eq(enc_eol,S(Kmac))) return le_mac;
82 NOTREACHED;
83 }
84
base64_to_chars(line_end_t le,const uintB * src,const uintB * srcend,chart * dest)85 local uintL base64_to_chars (line_end_t le, const uintB* src,
86 const uintB* srcend, chart *dest) {
87 var uintL pos = 0;
88 var uintL counter = 0;
89 var uintL num_chars = 0;
90 while (src < srcend) {
91 int c = *src++;
92 if (counter < MIME_LINE_LENGTH/4) counter++;
93 else { /* Wrap line every 76 characters. */
94 counter = 1;
95 switch (le) {
96 case le_unix:
97 if (dest) *dest++ = ascii(LF); num_chars++;
98 break;
99 case le_dos:
100 if (dest) { *dest++ = ascii(CR); *dest++ = ascii(LF); }
101 num_chars += 2;
102 break;
103 case le_mac:
104 if (dest) *dest++ = ascii(CR); num_chars++;
105 break;
106 }
107 }
108 /* Process first byte of a triplet. */
109 if (dest) *dest++ = ascii(base64_table[0x3f & c >> 2]); num_chars++;
110 var int value = (0x03 & c) << 4;
111 /* Process second byte of a triplet. */
112 if (src == srcend) {
113 if (dest) {
114 *dest++ = ascii(base64_table[value]);
115 *dest++ = ascii('=');
116 *dest++ = ascii('=');
117 }
118 num_chars += 3;
119 break;
120 }
121 c = *src++;
122 if (dest) *dest++ = ascii(base64_table[value | (0x0f & c >> 4)]);
123 num_chars++;
124 value = (0x0f & c) << 2;
125 /* Process third byte of a triplet. */
126 if (src == srcend) {
127 if (dest) {
128 *dest++ = ascii(base64_table[value]);
129 *dest++ = ascii('=');
130 }
131 num_chars += 2;
132 break;
133 }
134 c = *src++;
135 if (dest) {
136 *dest++ = ascii(base64_table[value | (0x03 & c >> 6)]);
137 *dest++ = ascii(base64_table[0x3f & c]);
138 }
139 num_chars += 2;
140 }
141 return num_chars;
142 }
143
base64_mblen(object encoding,const uintB * src,const uintB * srcend)144 global uintL base64_mblen (object encoding, const uintB* src,
145 const uintB* srcend) {
146 return base64_to_chars(enc_eol_to_le(TheEncoding(encoding)->enc_eol),
147 src,srcend,NULL);
148 }
149
150 /* see emacs/src/fns.c */
base64_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)151 global void base64_mbstowcs (object encoding, object stream,
152 const uintB* *srcp, const uintB* srcend,
153 chart* *destp, chart* destend) {
154 unused(stream); unused(destend);
155 *destp += base64_to_chars(enc_eol_to_le(TheEncoding(encoding)->enc_eol),
156 *srcp,srcend,*destp);
157 *srcp = srcend;
158 }
159
160 #define BASE64_P(c) (c<sizeof(table_base64) && table_base64[c]!=-1)
161 #define BASE64_IGNORABLE_P(ch) \
162 (chareq(ch,ascii(' ')) || chareq(ch,ascii('\t')) || chareq(ch,ascii('\n')) \
163 || chareq(ch,ascii('\f')) || chareq(ch,ascii('\r')))
164
165 /* see emacs/src/fns.c */
166 #define READ_QUADRUPLET_BYTE(endform) do { \
167 if (src == srcend) { endform; } \
168 ch = *src++; \
169 } while (BASE64_IGNORABLE_P(ch)); \
170 c = as_cint(ch)
171
172 /* convert ascii src to bytes dest - when destp is given
173 return the number of bytes
174 the final bad characted position is retutned in error_p */
base64_to_bytes(const chart * src,const chart * srcend,uintB * destp,const chart ** error_p)175 local uintL base64_to_bytes (const chart *src, const chart* srcend,
176 uintB* destp, const chart* *error_p) {
177 var unsigned char c;
178 var chart ch;
179 var unsigned long value;
180 var uintB *dest = destp;
181 var uintL num_bytes = 0;
182
183 while (1) {
184 /* Process first byte of a quadruplet. */
185 READ_QUADRUPLET_BYTE(return num_bytes);
186 if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
187 value = table_base64[c] << 18;
188
189 /* Process second byte of a quadruplet. */
190 READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
191 if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
192 value |= table_base64[c] << 12;
193
194 if (dest) *dest++ = (unsigned char) (value >> 16);
195 num_bytes++;
196
197 /* Process third byte of a quadruplet. */
198 READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
199 if (c == '=') {
200 READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
201 if (c != '=') { *error_p = src-1; return num_bytes; }
202 continue;
203 }
204
205 if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
206 value |= table_base64[c] << 6;
207
208 if (dest) *dest++ = (unsigned char) (0xff & value >> 8);
209 num_bytes++;
210
211 /* Process fourth byte of a quadruplet. */
212 READ_QUADRUPLET_BYTE(*error_p = src-1; return num_bytes);
213 if (c == '=')
214 continue;
215 if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
216 value |= table_base64[c];
217
218 if (dest) *dest++ = (unsigned char) (0xff & value);
219 num_bytes++;
220 }
221 }
222
base64_wcslen(object encoding,const chart * src,const chart * srcend)223 global uintL base64_wcslen (object encoding, const chart* src,
224 const chart* srcend) {
225 unused(encoding);
226 var const chart *error_p = NULL;
227 return base64_to_bytes(src,srcend,NULL,&error_p)
228 + (error_p ? 1 : 0); /* space for errors */
229 }
230
base64_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)231 global void base64_wcstombs (object encoding, object stream,
232 const chart* *srcp, const chart* srcend,
233 uintB* *destp, uintB* destend) {
234 unused(encoding); unused(stream); unused(destend);
235 var const chart *error_p = NULL;
236 *destp += base64_to_bytes(*srcp,srcend,*destp,&error_p);
237 if (error_p) {
238 if (error_p == srcend) {
239 pushSTACK(NIL); /* <end-of-file?> slot DATUM */
240 pushSTACK(S(base64)); /* slot EXPECTED-TYPE */
241 pushSTACK(fixnum(error_p-*srcp));
242 pushSTACK(TheSubr(subr_self)->name);
243 error(charset_type_error,GETTEXT("~S: Invalid base64 encoding termination at position ~S"));
244 } else {
245 pushSTACK(code_char(*error_p)); /* slot DATUM */
246 pushSTACK(S(base64)); /* slot EXPECTED-TYPE */
247 pushSTACK(fixnum(srcend-*srcp));
248 pushSTACK(fixnum((error_p-*srcp)+1));
249 pushSTACK(code_char(*error_p));
250 pushSTACK(TheSubr(subr_self)->name);
251 error(charset_type_error,GETTEXT("~S: Invalid base64 encoding at ~S (character ~S of ~S)"));
252 }
253 }
254 *srcp = srcend;
255 }
256
base64_range(object encoding,uintL start,uintL end,uintL maxintervals)257 global object base64_range (object encoding, uintL start, uintL end,
258 uintL maxintervals) {
259 unused(encoding);
260 var uintL count = 0; /* number of intervals already on the STACK */
261 if (end >= sizeof(table_base64)) end = sizeof(table_base64) - 1;
262 for (;start <= end && count < maxintervals; count++) {
263 while ((start <= end) && (table_base64[start] == -1)) start++;
264 if (start > end) break;
265 pushSTACK(code_char(as_chart(start)));
266 while ((start <= end) && (table_base64[start] != -1)) start++;
267 pushSTACK(code_char(as_chart(start-1)));
268 }
269 return stringof(count << 1);
270 }
271
272 local char const hex_table[] = "0123456789ABCDEF";
273
274 /* Error, when a character cannot be converted to an encoding.
275 error_unencodable(encoding,ch); */
error_unencodable(object encoding,chart ch)276 global _Noreturn void error_unencodable (object encoding, chart ch) {
277 pushSTACK(code_char(ch)); /* CHARSET-TYPE-ERROR slot DATUM */
278 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
279 pushSTACK(TheEncoding(encoding)->enc_charset);
280 pushSTACK(ascii_char(hex_table[as_cint(ch)&0x0F]));
281 pushSTACK(ascii_char(hex_table[(as_cint(ch)>>4)&0x0F]));
282 pushSTACK(ascii_char(hex_table[(as_cint(ch)>>8)&0x0F]));
283 pushSTACK(ascii_char(hex_table[(as_cint(ch)>>12)&0x0F]));
284 if (as_cint(ch) < 0x10000) {
285 pushSTACK(TheSubr(subr_self)->name);
286 error(charset_type_error,GETTEXT("~S: Character #\\u~C~C~C~C cannot be represented in the character set ~S"));
287 } else {
288 pushSTACK(ascii_char(hex_table[(as_cint(ch)>>16)&0x0F]));
289 pushSTACK(ascii_char(hex_table[(as_cint(ch)>>20)&0x0F]));
290 pushSTACK(TheSubr(subr_self)->name);
291 error(charset_type_error,GETTEXT("~S: Character #\\u00~C~C~C~C~C~C cannot be represented in the character set ~S"));
292 }
293 }
294
295 #endif /* ENABLE_UNICODE */
296
297 /* used in CONVERT-STRING-FROM-BYTES, so must not depend on ENABLE_UNICODE */
298 /* missing bytes at the end */
error_incomplete(object encoding)299 local _Noreturn void error_incomplete (object encoding) {
300 pushSTACK(NIL); /* <end-of-file?> CHARSET-TYPE-ERROR slot DATUM */
301 #ifdef ENABLE_UNICODE
302 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
303 pushSTACK(TheEncoding(encoding)->enc_charset);
304 #else
305 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
306 pushSTACK(encoding); /* no enc_charset slot without ENABLE_UNICODE */
307 #endif
308 pushSTACK(TheSubr(subr_self)->name);
309 error(charset_type_error,GETTEXT("~S: Incomplete byte sequence at end of buffer for ~S"));
310 }
311
312 #ifdef ENABLE_UNICODE
handle_incomplete(object encoding,chart ** destp,chart * destend)313 local void handle_incomplete (object encoding, chart* *destp, chart* destend) {
314 var object action = TheEncoding(encoding)->enc_towcs_error;
315 if (eq(action,S(Kignore))) {
316 } else if (eq(action,S(Kerror))) {
317 error_incomplete(encoding);
318 } else {
319 if (*destp < destend)
320 *(*destp)++ = char_code(action);
321 }
322 }
323
324 /* The range function for an encoding covering all of Unicode. */
all_range(object encoding,uintL start,uintL end,uintL maxintervals)325 global object all_range (object encoding, uintL start, uintL end,
326 uintL maxintervals) {
327 unused(encoding);
328 var uintL count = 0;
329 if (maxintervals > 0) {
330 pushSTACK(code_char(as_chart(start))); pushSTACK(code_char(as_chart(end)));
331 count = 2;
332 }
333 return stringof(count);
334 }
335
336 /* The range function for an encoding covering the BMP of Unicode. */
bmp_range(object encoding,uintL start,uintL end,uintL maxintervals)337 global object bmp_range (object encoding, uintL start, uintL end,
338 uintL maxintervals) {
339 unused(encoding);
340 var uintL count = 0;
341 if (maxintervals > 0 && start < 0x10000) {
342 if (end >= 0x10000)
343 end = 0xFFFF;
344 pushSTACK(code_char(as_chart(start))); pushSTACK(code_char(as_chart(end)));
345 count = 2;
346 }
347 return stringof(count);
348 }
349
350 /* --------------------------------------------------------------------------
351 * Unicode-16 encoding */
352
353 /* Unicode-16 encoding in two flavours:
354 The big-endian format (files starting with 0xFE 0xFF),
355 the little-endian format (files starting with 0xFF 0xFE). */
356
357 /* min. bytes per character = 2
358 max. bytes per character = 2 */
359
360 global uintL uni16_mblen (object encoding, const uintB* src,
361 const uintB* srcend);
362 global void uni16be_mbstowcs (object encoding, object stream,
363 const uintB* *srcp, const uintB* srcend,
364 chart* *destp, chart* destend);
365 global void uni16le_mbstowcs (object encoding, object stream,
366 const uintB* *srcp, const uintB* srcend,
367 chart* *destp, chart* destend);
368 global uintL uni16_wcslen (object encoding, const chart* src,
369 const chart* srcend);
370 global void uni16be_wcstombs (object encoding, object stream,
371 const chart* *srcp, const chart* srcend,
372 uintB* *destp, uintB* destend);
373 global void uni16le_wcstombs (object encoding, object stream,
374 const chart* *srcp, const chart* srcend,
375 uintB* *destp, uintB* destend);
376
377 /* Bytes to characters. */
378
uni16_mblen(object encoding,const uintB * src,const uintB * srcend)379 global uintL uni16_mblen (object encoding, const uintB* src,
380 const uintB* srcend) {
381 var uintL len = srcend-src;
382 var bool incomplete_p = len & 1; /* odd-p */
383 var uintL count = len >> 1; /* div 2 */
384 if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
385 return count;
386 else
387 return (count > 0 ? count + incomplete_p : 0);
388 }
389
uni16be_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)390 global void uni16be_mbstowcs (object encoding, object stream,
391 const uintB* *srcp, const uintB* srcend,
392 chart* *destp, chart* destend) {
393 unused(stream);
394 var const uintB* src = *srcp;
395 var chart* dest = *destp;
396 var uintL len = srcend-src;
397 var bool incomplete_p = len & 1; /* odd-p */
398 var uintL count = len >> 1; /* div 2 */
399 if (count > destend-dest)
400 count = destend-dest;
401 if (count > 0) {
402 do {
403 *dest++ = as_chart(((cint)src[0] << 8) | (cint)src[1]);
404 src += 2;
405 } while (--count);
406 *srcp = src;
407 *destp = dest;
408 if (incomplete_p) handle_incomplete(encoding,destp,destend);
409 }
410 }
411
uni16le_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)412 global void uni16le_mbstowcs (object encoding, object stream,
413 const uintB* *srcp, const uintB* srcend,
414 chart* *destp, chart* destend) {
415 unused(stream);
416 var const uintB* src = *srcp;
417 var chart* dest = *destp;
418 var uintL len = srcend-src;
419 var bool incomplete_p = len & 1; /* odd-p */
420 var uintL count = len >> 1; /* div 2 */
421 if (count > destend-dest)
422 count = destend-dest;
423 if (count > 0) {
424 do {
425 *dest++ = as_chart((cint)src[0] | ((cint)src[1] << 8));
426 src += 2;
427 } while (--count);
428 *srcp = src;
429 *destp = dest;
430 if (incomplete_p) handle_incomplete(encoding,destp,destend);
431 }
432 }
433
434 /* Characters to bytes. */
435
uni16_wcslen(object encoding,const chart * src,const chart * srcend)436 global uintL uni16_wcslen (object encoding, const chart* src,
437 const chart* srcend) {
438 var uintL count = srcend-src;
439 var uintL result = 0;
440 while (count--) {
441 var chart ch = *src++;
442 if (as_cint(ch) < 0x10000)
443 result += 2;
444 else {
445 var object action = TheEncoding(encoding)->enc_tombs_error;
446 if (eq(action,S(Kignore))) {
447 } else if (uint8_p(action)) {
448 result++;
449 } else if (!eq(action,S(Kerror))) {
450 var chart c = char_code(action);
451 if (as_cint(c) < 0x10000)
452 result += 2;
453 } else
454 error_unencodable(encoding,ch);
455 }
456 }
457 return result;
458 }
459
uni16be_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)460 global void uni16be_wcstombs (object encoding, object stream,
461 const chart* *srcp, const chart* srcend,
462 uintB* *destp, uintB* destend) {
463 unused(stream);
464 var const chart* src = *srcp;
465 var uintB* dest = *destp;
466 var uintL scount = srcend-src;
467 var uintL dcount = destend-dest;
468 if (scount > 0 && dcount > 0) {
469 do {
470 var cint ch = as_cint(*src++); scount--;
471 if (ch < 0x10000) {
472 if (dcount < 2) break;
473 dest[0] = (uintB)(ch>>8); dest[1] = (uintB)ch;
474 dest += 2; dcount -= 2;
475 } else {
476 var object action = TheEncoding(encoding)->enc_tombs_error;
477 if (eq(action,S(Kignore))) {
478 } else if (uint8_p(action)) {
479 *dest++ = I_to_uint8(action); dcount--;
480 } else if (!eq(action,S(Kerror))) {
481 var cint c = char_int(action);
482 if (c < 0x10000) {
483 if (dcount < 2) break;
484 dest[0] = (uintB)(c>>8); dest[1] = (uintB)c;
485 dest += 2; dcount -= 2;
486 } else
487 error_unencodable(encoding,as_chart(ch));
488 } else
489 error_unencodable(encoding,as_chart(ch));
490 }
491 } while (scount > 0 && dcount > 0);
492 *srcp = src;
493 *destp = dest;
494 }
495 }
496
uni16le_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)497 global void uni16le_wcstombs (object encoding, object stream,
498 const chart* *srcp, const chart* srcend,
499 uintB* *destp, uintB* destend) {
500 unused(stream);
501 var const chart* src = *srcp;
502 var uintB* dest = *destp;
503 var uintL scount = srcend-src;
504 var uintL dcount = destend-dest;
505 if (scount > 0 && dcount > 0) {
506 do {
507 var cint ch = as_cint(*src++); scount--;
508 if (ch < 0x10000) {
509 if (dcount < 2) break;
510 dest[0] = (uintB)ch; dest[1] = (uintB)(ch>>8);
511 dest += 2; dcount -= 2;
512 } else {
513 var object action = TheEncoding(encoding)->enc_tombs_error;
514 if (eq(action,S(Kignore))) {
515 } else if (uint8_p(action)) {
516 *dest++ = I_to_uint8(action); dcount--;
517 } else if (!eq(action,S(Kerror))) {
518 var cint c = char_int(action);
519 if (c < 0x10000) {
520 if (dcount < 2) break;
521 dest[0] = (uintB)c; dest[1] = (uintB)(c>>8);
522 dest += 2; dcount -= 2;
523 } else
524 error_unencodable(encoding,as_chart(ch));
525 } else
526 error_unencodable(encoding,as_chart(ch));
527 }
528 } while (scount > 0 && dcount > 0);
529 *srcp = src;
530 *destp = dest;
531 }
532 }
533
534 /* -------------------------------------------------------------------------
535 * Unicode-32 encoding */
536
537 /* Unicode-32 encoding in two flavours:
538 The big-endian format,
539 the little-endian format. */
540
541 /* min. bytes per character = 4
542 max. bytes per character = 4 */
543
544 global uintL uni32be_mblen (object encoding, const uintB* src,
545 const uintB* srcend);
546 global uintL uni32le_mblen (object encoding, const uintB* src,
547 const uintB* srcend);
548 global void uni32be_mbstowcs (object encoding, object stream,
549 const uintB* *srcp, const uintB* srcend,
550 chart* *destp, chart* destend);
551 global void uni32le_mbstowcs (object encoding, object stream,
552 const uintB* *srcp, const uintB* srcend,
553 chart* *destp, chart* destend);
554 global uintL uni32_wcslen (object encoding, const chart* src,
555 const chart* srcend);
556 global void uni32be_wcstombs (object encoding, object stream,
557 const chart* *srcp, const chart* srcend,
558 uintB* *destp, uintB* destend);
559 global void uni32le_wcstombs (object encoding, object stream,
560 const chart* *srcp, const chart* srcend,
561 uintB* *destp, uintB* destend);
562
563 /* Bytes to characters. */
564
565 /* Error when an invalid character was encountered.
566 error_uni32_invalid(encoding,code); */
error_uni32_invalid(object encoding,uint32 code)567 local _Noreturn void error_uni32_invalid (object encoding, uint32 code) {
568 pushSTACK(NIL); /* CHARSET-TYPE-ERROR slot datum - filled below */
569 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
570 pushSTACK(TheEncoding(encoding)->enc_charset);
571 STACK_2 = uint32_to_I(code); /* fill it here since it maygc */
572 var uintC count;
573 dotimespC(count,8, {
574 pushSTACK(ascii_char(hex_table[code&0x0F]));
575 code = code>>4;
576 });
577 pushSTACK(TheSubr(subr_self)->name);
578 error(charset_type_error,GETTEXT("~S: Character #x~C~C~C~C~C~C~C~C in ~S conversion, not an UTF-32 character"));
579 }
580
uni32be_mblen(object encoding,const uintB * src,const uintB * srcend)581 global uintL uni32be_mblen (object encoding, const uintB* src,
582 const uintB* srcend) {
583 var uintL len = srcend-src;
584 var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
585 var uintL count = len >> 2; /* div 4 */
586 if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore))) {
587 var uintL result = 0;
588 dotimesL(count,count, {
589 var uint32 ch =
590 ((uint32)src[0] << 24) | ((uint32)src[1] << 16)
591 | ((uint32)src[2] << 8) | (uint32)src[3];
592 if (ch < char_code_limit)
593 result++;
594 src += 4;
595 });
596 return result;
597 } else {
598 return (count > 0 ? count + incomplete_p : 0);
599 }
600 }
601
uni32le_mblen(object encoding,const uintB * src,const uintB * srcend)602 global uintL uni32le_mblen (object encoding, const uintB* src,
603 const uintB* srcend) {
604 var uintL len = srcend-src;
605 var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
606 var uintL count = len >> 2; /* div 4 */
607 if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore))) {
608 var uintL result = 0;
609 dotimesL(count,count, {
610 var uint32 ch =
611 (uint32)src[0] | ((uint32)src[1] << 8)
612 | ((uint32)src[2] << 16) | ((uint32)src[3] << 24);
613 if (ch < char_code_limit)
614 result++;
615 src += 4;
616 });
617 return result;
618 } else {
619 return (count > 0 ? count + incomplete_p : 0);
620 }
621 }
622
uni32be_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)623 global void uni32be_mbstowcs (object encoding, object stream,
624 const uintB* *srcp, const uintB* srcend,
625 chart* *destp, chart* destend) {
626 unused(stream);
627 var const uintB* src = *srcp;
628 var chart* dest = *destp;
629 var uintL len = srcend-src;
630 var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
631 var uintL scount = len >> 2; /* div 4 */
632 var uintL dcount = destend-dest;
633 if (scount > 0 && dcount > 0) {
634 do {
635 var uint32 ch =
636 ((uint32)src[0] << 24) | ((uint32)src[1] << 16)
637 | ((uint32)src[2] << 8) | (uint32)src[3];
638 if (ch < char_code_limit) {
639 *dest++ = as_chart(ch); dcount--;
640 } else {
641 var object action = TheEncoding(encoding)->enc_towcs_error;
642 if (eq(action,S(Kignore))) {
643 } else if (eq(action,S(Kerror))) {
644 error_uni32_invalid(encoding,ch);
645 } else {
646 *dest++ = char_code(action); dcount--;
647 }
648 }
649 src += 4; scount--;
650 } while (scount > 0 && dcount > 0);
651 *srcp = src;
652 *destp = dest;
653 if (incomplete_p) handle_incomplete(encoding,destp,destend);
654 }
655 }
656
uni32le_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)657 global void uni32le_mbstowcs (object encoding, object stream,
658 const uintB* *srcp, const uintB* srcend,
659 chart* *destp, chart* destend) {
660 unused(stream);
661 var const uintB* src = *srcp;
662 var chart* dest = *destp;
663 var uintL len = srcend-src;
664 var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
665 var uintL scount = len >> 2; /* div 4 */
666 var uintL dcount = destend-dest;
667 if (scount > 0 && dcount > 0) {
668 do {
669 var uint32 ch =
670 (uint32)src[0] | ((uint32)src[1] << 8)
671 | ((uint32)src[2] << 16) | ((uint32)src[3] << 24);
672 if (ch < char_code_limit) {
673 *dest++ = as_chart(ch); dcount--;
674 } else {
675 var object action = TheEncoding(encoding)->enc_towcs_error;
676 if (eq(action,S(Kignore))) {
677 } else if (eq(action,S(Kerror))) {
678 error_uni32_invalid(encoding,ch);
679 } else {
680 *dest++ = char_code(action); dcount--;
681 }
682 }
683 src += 4; scount--;
684 } while (scount > 0 && dcount > 0);
685 *srcp = src;
686 *destp = dest;
687 if (incomplete_p) handle_incomplete(encoding,destp,destend);
688 }
689 }
690
691 /* Characters to bytes. */
692
uni32_wcslen(object encoding,const chart * src,const chart * srcend)693 global uintL uni32_wcslen (object encoding, const chart* src,
694 const chart* srcend) {
695 unused(encoding);
696 return (srcend-src)*4;
697 }
698
uni32be_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)699 global void uni32be_wcstombs (object encoding, object stream,
700 const chart* *srcp, const chart* srcend,
701 uintB* *destp, uintB* destend) {
702 unused(encoding); unused(stream);
703 var const chart* src = *srcp;
704 var uintB* dest = *destp;
705 var uintL count = floor(destend-dest,4);
706 if (count > srcend-src)
707 count = srcend-src;
708 if (count > 0) {
709 dotimespL(count,count, {
710 var cint ch = as_cint(*src++);
711 dest[0] = 0; dest[1] = (uintB)(ch>>16);
712 dest[2] = (uintB)(ch>>8); dest[3] = (uintB)ch;
713 dest += 4;
714 });
715 *srcp = src;
716 *destp = dest;
717 }
718 }
719
uni32le_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)720 global void uni32le_wcstombs (object encoding, object stream,
721 const chart* *srcp, const chart* srcend,
722 uintB* *destp, uintB* destend) {
723 unused(encoding); unused(stream);
724 var const chart* src = *srcp;
725 var uintB* dest = *destp;
726 var uintL count = floor(destend-dest,4);
727 if (count > srcend-src)
728 count = srcend-src;
729 if (count > 0) {
730 dotimespL(count,count, {
731 var cint ch = as_cint(*src++);
732 dest[0] = (uintB)ch; dest[1] = (uintB)(ch>>8);
733 dest[2] = (uintB)(ch>>16); dest[3] = 0;
734 dest += 4;
735 });
736 *srcp = src;
737 *destp = dest;
738 }
739 }
740
741 /* -------------------------------------------------------------------------
742 * UTF-8 encoding */
743
744 /* See http://www.stonehand.com/unicode/standard/fss-utf.html
745 or Linux 2.0.x, file linux/fs/nls.c
746 cmask cval shift maxval bits
747 1 byte sequence 0x80 0x00 0*6 0x7F 0XXXXXXX
748 2 byte sequence 0xE0 0xC0 1*6 0x7FF 110XXXXX 10XXXXXX
749 3 byte sequence 0xF0 0xE0 2*6 0xFFFF 1110XXXX 10XXXXXX 10XXXXXX
750 4 byte sequence 0xF8 0xF0 3*6 0x1FFFFF 11110XXX 10XXXXXX 10XXXXXX 10XXXXXX
751 5 byte sequence 0xFC 0xF8 4*6 0x3FFFFFF 111110XX 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX
752 6 byte sequence 0xFE 0xFC 5*6 0x7FFFFFFF 1111110X 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX
753
754 We support only 21-bit Unicode characters, i.e. those which can be encoded
755 with at most 4 bytes. Characters outside this range give an error.
756 Spurious bytes of the form 10XXXXXX are ignored.
757 (This resync feature is one of the benefits of the UTF encoding.) */
758
759 /* min. bytes per character = 1
760 max. bytes per character = 4 */
761
762 global uintL utf8_mblen (object encoding, const uintB* src,
763 const uintB* srcend);
764 global void utf8_mbstowcs (object encoding, object stream, const uintB* *srcp,
765 const uintB* srcend, chart* *destp, chart* destend);
766 global uintL utf8_wcslen (object encoding, const chart* src,
767 const chart* srcend);
768 global void utf8_wcstombs (object encoding, object stream, const chart* *srcp,
769 const chart* srcend, uintB* *destp, uintB* destend);
770
771 /* Bytes to characters. */
772
773 /* Error when an invalid 1-byte sequence was encountered.
774 error_utf8_invalid1(encoding,b1); */
error_utf8_invalid1(object encoding,uintB b1)775 local _Noreturn void error_utf8_invalid1 (object encoding, uintB b1) {
776 pushSTACK(fixnum(b1)); /* CHARSET-TYPE-ERROR slot DATUM */
777 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
778 pushSTACK(TheEncoding(encoding)->enc_charset);
779 pushSTACK(ascii_char(hex_table[b1&0x0F]));
780 pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
781 pushSTACK(TheSubr(subr_self)->name);
782 error(charset_type_error,GETTEXT("~S: Invalid byte #x~C~C in ~S conversion, not a Unicode-16"));
783 }
784
785 /* Error when an invalid 2-byte sequence was encountered.
786 error_utf8_invalid2(encoding,b1,b2); */
error_utf8_invalid2(object encoding,uintB b1,uintB b2)787 local _Noreturn void error_utf8_invalid2 (object encoding, uintB b1, uintB b2) {
788 pushSTACK(NIL); /* CHARSET-TYPE-ERROR slot DATUM */
789 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
790 STACK_1 = allocate_bit_vector(Atype_8Bit,2);
791 TheSbvector(STACK_1)->data[0] = b1;
792 TheSbvector(STACK_1)->data[1] = b2;
793 pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
794 pushSTACK(ascii_char(hex_table[b2&0x0F]));
795 pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
796 pushSTACK(ascii_char(hex_table[b1&0x0F]));
797 pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
798 pushSTACK(TheSubr(subr_self)->name);
799 error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C in ~S conversion"));
800 }
801
802 /* Error when an invalid 3-byte sequence was encountered.
803 error_utf8_invalid3(encoding,b1,b2,b3); */
error_utf8_invalid3(object encoding,uintB b1,uintB b2,uintB b3)804 local _Noreturn void error_utf8_invalid3 (object encoding, uintB b1, uintB b2, uintB b3) {
805 pushSTACK(NIL); /* CHARSET-TYPE-ERROR slot DATUM */
806 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
807 STACK_1 = allocate_bit_vector(Atype_8Bit,3);
808 TheSbvector(STACK_1)->data[0] = b1;
809 TheSbvector(STACK_1)->data[1] = b2;
810 TheSbvector(STACK_1)->data[2] = b3;
811 pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
812 pushSTACK(ascii_char(hex_table[b3&0x0F]));
813 pushSTACK(ascii_char(hex_table[(b3>>4)&0x0F]));
814 pushSTACK(ascii_char(hex_table[b2&0x0F]));
815 pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
816 pushSTACK(ascii_char(hex_table[b1&0x0F]));
817 pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
818 pushSTACK(TheSubr(subr_self)->name);
819 error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C #x~C~C in ~S conversion"));
820 }
821
822 /* Error when an invalid 4-byte sequence was encountered.
823 error_utf8_invalid4(encoding,b1,b2,b3,b4); */
error_utf8_invalid4(object encoding,uintB b1,uintB b2,uintB b3,uintB b4)824 local _Noreturn void error_utf8_invalid4 (object encoding, uintB b1, uintB b2, uintB b3, uintB b4)
825 {
826 pushSTACK(NIL); /* CHARSET-TYPE-ERROR slot DATUM */
827 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
828 STACK_1 = allocate_bit_vector(Atype_8Bit,4);
829 TheSbvector(STACK_1)->data[0] = b1;
830 TheSbvector(STACK_1)->data[1] = b2;
831 TheSbvector(STACK_1)->data[2] = b3;
832 TheSbvector(STACK_1)->data[3] = b4;
833 pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
834 pushSTACK(ascii_char(hex_table[b4&0x0F]));
835 pushSTACK(ascii_char(hex_table[(b4>>4)&0x0F]));
836 pushSTACK(ascii_char(hex_table[b3&0x0F]));
837 pushSTACK(ascii_char(hex_table[(b3>>4)&0x0F]));
838 pushSTACK(ascii_char(hex_table[b2&0x0F]));
839 pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
840 pushSTACK(ascii_char(hex_table[b1&0x0F]));
841 pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
842 pushSTACK(TheSubr(subr_self)->name);
843 error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C #x~C~C #x~C~C in ~S conversion"));
844 }
845
utf8_mblen(object encoding,const uintB * src,const uintB * srcend)846 global uintL utf8_mblen (object encoding, const uintB* src,
847 const uintB* srcend) {
848 var uintL count = 0;
849 while (src < srcend) {
850 var uintB c = src[0];
851 if (c < 0x80) { /* 1 byte sequence */
852 src += 1;
853 count++;
854 continue;
855 }
856 if (c < 0xC0) {
857 src++; continue; /* skip spurious 10XXXXXX byte */
858 }
859 if (c < 0xE0) { /* 2 byte sequence */
860 if (src+2 > srcend) break;
861 if (((src[1] ^ 0x80) < 0x40)
862 && (c >= 0xC2)) {
863 src += 2;
864 count++;
865 continue;
866 }
867 {
868 var object action = TheEncoding(encoding)->enc_towcs_error;
869 if (eq(action,S(Kignore))) {
870 src += 2; continue;
871 } else if (eq(action,S(Kerror))) {
872 error_utf8_invalid2(encoding,c,src[1]);
873 } else {
874 src += 2; count++; continue;
875 }
876 }
877 }
878 if (c < 0xF0) { /* 3 byte sequence */
879 if (src+3 > srcend) break;
880 if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
881 && (c >= 0xE1 || src[1] >= 0xA0)
882 && (c != 0xED || src[1] < 0xA0)) {
883 src += 3;
884 count++;
885 continue;
886 }
887 {
888 var object action = TheEncoding(encoding)->enc_towcs_error;
889 if (eq(action,S(Kignore))) {
890 src += 3; continue;
891 } else if (eq(action,S(Kerror))) {
892 error_utf8_invalid3(encoding,c,src[1],src[2]);
893 } else {
894 src += 3; count++; continue;
895 }
896 }
897 }
898 if (c < 0xF8) { /* 4 byte sequence */
899 if (src+4 > srcend) break;
900 if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
901 && ((src[3] ^ 0x80) < 0x40)
902 && (c >= 0xF1 || src[1] >= 0x90)) {
903 src += 4;
904 count++;
905 continue;
906 }
907 {
908 var object action = TheEncoding(encoding)->enc_towcs_error;
909 if (eq(action,S(Kignore))) {
910 src += 4; continue;
911 } else if (eq(action,S(Kerror))) {
912 error_utf8_invalid4(encoding,c,src[1],src[2],src[3]);
913 } else {
914 src += 4; count++; continue;
915 }
916 }
917 }
918 {
919 var object action = TheEncoding(encoding)->enc_towcs_error;
920 if (eq(action,S(Kignore))) {
921 src += 1; continue;
922 } else if (eq(action,S(Kerror))) {
923 error_utf8_invalid1(encoding,c);
924 } else {
925 src += 1; count++; continue;
926 }
927 }
928 }
929 return count;
930 }
931
utf8_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)932 global void utf8_mbstowcs (object encoding, object stream, const uintB* *srcp,
933 const uintB* srcend, chart* *destp,
934 chart* destend) {
935 unused(stream);
936 var const uintB* src = *srcp;
937 var chart* dest = *destp;
938 while (src < srcend) {
939 var uintB c = src[0];
940 if (c < 0x80) { /* 1 byte sequence */
941 if (dest == destend) break;
942 *dest++ = as_chart((cint)c);
943 src += 1;
944 continue;
945 }
946 if (c < 0xC0) {
947 src++; continue; /* skip spurious 10XXXXXX byte */
948 }
949 if (dest == destend) break;
950 if (c < 0xE0) { /* 2 byte sequence */
951 if (src+2 > srcend) break;
952 if (((src[1] ^ 0x80) < 0x40)
953 && (c >= 0xC2)) {
954 *dest++ = as_chart(((cint)(c & 0x1F) << 6) | (cint)(src[1] ^ 0x80));
955 src += 2;
956 continue;
957 }
958 {
959 var object action = TheEncoding(encoding)->enc_towcs_error;
960 if (eq(action,S(Kignore))) {
961 src += 2; continue;
962 } else if (eq(action,S(Kerror))) {
963 error_utf8_invalid2(encoding,c,src[1]);
964 } else {
965 src += 2; *dest++ = char_code(action); continue;
966 }
967 }
968 }
969 if (c < 0xF0) { /* 3 byte sequence */
970 if (src+3 > srcend) break;
971 if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
972 && (c >= 0xE1 || src[1] >= 0xA0)
973 && (c != 0xED || src[1] < 0xA0)) {
974 *dest++ = as_chart(((cint)(c & 0x0F) << 12)
975 | ((cint)(src[1] ^ 0x80) << 6)
976 | (cint)(src[2] ^ 0x80));
977 src += 3;
978 continue;
979 }
980 {
981 var object action = TheEncoding(encoding)->enc_towcs_error;
982 if (eq(action,S(Kignore))) {
983 src += 3; continue;
984 } else if (eq(action,S(Kerror))) {
985 error_utf8_invalid3(encoding,c,src[1],src[2]);
986 } else {
987 src += 3; *dest++ = char_code(action); continue;
988 }
989 }
990 }
991 if (c < 0xF8) { /* 4 byte sequence */
992 if (src+4 > srcend) break;
993 if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
994 && ((src[3] ^ 0x80) < 0x40)
995 && (c >= 0xF1 || src[1] >= 0x90)) {
996 var cint ch = ((cint)(c & 0x07) << 18)
997 | ((cint)(src[1] ^ 0x80) << 12)
998 | ((cint)(src[2] ^ 0x80) << 6)
999 | (cint)(src[3] ^ 0x80);
1000 if (ch < char_code_limit) {
1001 *dest++ = as_chart(ch);
1002 src += 4;
1003 continue;
1004 }
1005 }
1006 {
1007 var object action = TheEncoding(encoding)->enc_towcs_error;
1008 if (eq(action,S(Kignore))) {
1009 src += 4; continue;
1010 } else if (eq(action,S(Kerror))) {
1011 error_utf8_invalid4(encoding,c,src[1],src[2],src[3]);
1012 } else {
1013 src += 4; *dest++ = char_code(action); continue;
1014 }
1015 }
1016 }
1017 {
1018 var object action = TheEncoding(encoding)->enc_towcs_error;
1019 if (eq(action,S(Kignore))) {
1020 src += 1; continue;
1021 } else if (eq(action,S(Kerror))) {
1022 error_utf8_invalid1(encoding,c);
1023 } else {
1024 src += 1; *dest++ = char_code(action); continue;
1025 }
1026 }
1027 }
1028 *srcp = src;
1029 *destp = dest;
1030 }
1031
1032 /* Characters to bytes. */
1033
utf8_wcslen(object encoding,const chart * src,const chart * srcend)1034 global uintL utf8_wcslen (object encoding, const chart* src,
1035 const chart* srcend) {
1036 unused(encoding);
1037 var uintL destlen = 0;
1038 while (src < srcend) {
1039 var cint ch = as_cint(*src++);
1040 destlen += (ch < 0x80 ? 1 : ch < 0x800 ? 2 : ch < 0x10000 ? 3 : 4);
1041 }
1042 return destlen;
1043 }
1044
utf8_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1045 global void utf8_wcstombs (object encoding, object stream, const chart* *srcp,
1046 const chart* srcend, uintB* *destp,
1047 uintB* destend) {
1048 unused(encoding); unused(stream);
1049 var const chart* src = *srcp;
1050 var uintB* dest = *destp;
1051 while (src < srcend) {
1052 var cint ch = as_cint(*src);
1053 var uintL count = (ch < 0x80 ? 1 : ch < 0x800 ? 2 : 3);
1054 if (dest+count > destend) break;
1055 src++;
1056 if (ch < 0x80) { /* 1 byte sequence */
1057 *dest++ = ch;
1058 } else if (ch < 0x800) { /* 2 byte sequence */
1059 *dest++ = 0xC0 | (ch >> 6);
1060 *dest++ = 0x80 | (ch & 0x3F);
1061 } else if (ch < 0x10000) { /* 3 byte sequence */
1062 *dest++ = 0xE0 | (ch >> 12);
1063 *dest++ = 0x80 | ((ch >> 6) & 0x3F);
1064 *dest++ = 0x80 | (ch & 0x3F);
1065 } else { /* ch < 0x110000, 4 byte sequence */
1066 *dest++ = 0xF0 | (ch >> 18);
1067 *dest++ = 0x80 | ((ch >> 12) & 0x3F);
1068 *dest++ = 0x80 | ((ch >> 6) & 0x3F);
1069 *dest++ = 0x80 | (ch & 0x3F);
1070 }
1071 }
1072 *srcp = src;
1073 *destp = dest;
1074 }
1075
1076 /* -------------------------------------------------------------------------
1077 * Java encoding */
1078
1079 /* This is ISO 8859-1 with \uXXXX escape sequences,
1080 denoting Unicode characters.
1081 See the Java Language Specification.
1082 Characters outside the BMP are represented by two consecutive
1083 \uXXXX escape sequences, like UTF-16. Example:
1084 $ printf '\U00102345\n' | native2ascii -encoding UTF-8
1085 \udbc8\udf45
1086
1087 This is quick&dirty: The text is supposed not to contain \u except as part
1088 of \uXXXX escape sequences. */
1089
1090 /* min. bytes per character = 1
1091 max. bytes per character = 12 */
1092
1093 global uintL java_mblen (object encoding, const uintB* src,
1094 const uintB* srcend);
1095 global void java_mbstowcs (object encoding, object stream, const uintB* *srcp,
1096 const uintB* srcend, chart* *destp, chart* destend);
1097 global uintL java_wcslen (object encoding, const chart* src,
1098 const chart* srcend);
1099 global void java_wcstombs (object encoding, object stream, const chart* *srcp,
1100 const chart* srcend, uintB* *destp, uintB* destend);
1101
1102 /* Bytes to characters. */
1103
java_mblen(object encoding,const uintB * src,const uintB * srcend)1104 global uintL java_mblen (object encoding, const uintB* src,
1105 const uintB* srcend) {
1106 unused(encoding);
1107 var uintL count = 0;
1108 while (src < srcend) {
1109 var uintB c;
1110 var cint ch;
1111 if (src[0] != '\\') {
1112 src += 1;
1113 count++;
1114 continue;
1115 }
1116 if (src+2 > srcend) break;
1117 if (src[1] != 'u') {
1118 src += 1;
1119 count++;
1120 continue;
1121 }
1122 if (src+3 > srcend) break;
1123 c = src[2];
1124 if (c >= '0' && c <= '9') { c -= '0'; }
1125 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1126 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1127 else {
1128 src += 2; /* skip incomplete \u sequence */
1129 continue;
1130 }
1131 ch = (cint)c << 12;
1132 if (src+4 > srcend) break;
1133 c = src[3];
1134 if (c >= '0' && c <= '9') { c -= '0'; }
1135 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1136 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1137 else {
1138 src += 3; /* skip incomplete \u sequence */
1139 continue;
1140 }
1141 ch |= (cint)c << 8;
1142 if (src+5 > srcend) break;
1143 c = src[4];
1144 if (c >= '0' && c <= '9') { c -= '0'; }
1145 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1146 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1147 else {
1148 src += 4; /* skip incomplete \u sequence */
1149 continue;
1150 }
1151 ch |= (cint)c << 4;
1152 if (src+6 > srcend) break;
1153 c = src[5];
1154 if (c >= '0' && c <= '9') { c -= '0'; }
1155 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1156 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1157 else {
1158 src += 5; /* skip incomplete \u sequence */
1159 continue;
1160 }
1161 ch |= (cint)c;
1162 if (ch < char_code_limit && !(ch >= 0xd800 && ch < 0xe000)) {
1163 src += 6; /* complete \u sequence */
1164 count++;
1165 continue;
1166 }
1167 if (!(ch >= 0xd800 && ch < 0xdc00)) {
1168 src += 6; /* skip invalid \u sequence */
1169 continue;
1170 }
1171 var cint ch1 = ch;
1172 if (src+7 > srcend) break;
1173 if (src[6] != '\\') {
1174 src += 6; /* skip incomplete \u sequence */
1175 continue;
1176 }
1177 if (src+8 > srcend) break;
1178 if (src[7] != 'u') {
1179 src += 6; /* skip incomplete \u sequence */
1180 continue;
1181 }
1182 if (src+9 > srcend) break;
1183 c = src[8];
1184 if (c >= '0' && c <= '9') { c -= '0'; }
1185 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1186 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1187 else {
1188 src += 8; /* skip incomplete \u sequence */
1189 continue;
1190 }
1191 ch = (cint)c << 12;
1192 if (src+10 > srcend) break;
1193 c = src[9];
1194 if (c >= '0' && c <= '9') { c -= '0'; }
1195 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1196 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1197 else {
1198 src += 9; /* skip incomplete \u sequence */
1199 continue;
1200 }
1201 ch |= (cint)c << 8;
1202 if (src+11 > srcend) break;
1203 c = src[10];
1204 if (c >= '0' && c <= '9') { c -= '0'; }
1205 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1206 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1207 else {
1208 src += 10; /* skip incomplete \u sequence */
1209 continue;
1210 }
1211 ch |= (cint)c << 4;
1212 if (src+12 > srcend) break;
1213 c = src[11];
1214 if (c >= '0' && c <= '9') { c -= '0'; }
1215 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1216 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1217 else {
1218 src += 11; /* skip incomplete \u sequence */
1219 continue;
1220 }
1221 ch |= (cint)c;
1222 if (ch >= 0xdc00 && ch < 0xe000) {
1223 ch = 0x10000 + ((ch1 - 0xd800) << 10) + (ch - 0xdc00);
1224 if (ch < char_code_limit) {
1225 src += 12; /* complete \u sequence */
1226 count++;
1227 continue;
1228 }
1229 }
1230 src += 6; /* skip invalid \u sequence */
1231 continue;
1232 }
1233 return count;
1234 }
1235
java_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1236 global void java_mbstowcs (object encoding, object stream, const uintB* *srcp,
1237 const uintB* srcend, chart* *destp,
1238 chart* destend) {
1239 unused(encoding); unused(stream);
1240 var const uintB* src = *srcp;
1241 var chart* dest = *destp;
1242 while (src < srcend) {
1243 var uintB c;
1244 var cint ch;
1245 c = src[0];
1246 if (c != '\\') {
1247 if (dest==destend) break;
1248 *dest++ = as_chart((cint)c);
1249 src += 1;
1250 continue;
1251 }
1252 if (src+2 > srcend) break;
1253 if (src[1] != 'u') {
1254 if (dest==destend) break;
1255 *dest++ = as_chart((cint)c);
1256 src += 1;
1257 continue;
1258 }
1259 if (src+3 > srcend) break;
1260 c = src[2];
1261 if (c >= '0' && c <= '9') { c -= '0'; }
1262 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1263 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1264 else {
1265 src += 2; /* skip incomplete \u sequence */
1266 continue;
1267 }
1268 ch = (cint)c << 12;
1269 if (src+4 > srcend) break;
1270 c = src[3];
1271 if (c >= '0' && c <= '9') { c -= '0'; }
1272 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1273 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1274 else {
1275 src += 3; /* skip incomplete \u sequence */
1276 continue;
1277 }
1278 ch |= (cint)c << 8;
1279 if (src+5 > srcend) break;
1280 c = src[4];
1281 if (c >= '0' && c <= '9') { c -= '0'; }
1282 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1283 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1284 else {
1285 src += 4; /* skip incomplete \u sequence */
1286 continue;
1287 }
1288 ch |= (cint)c << 4;
1289 if (src+6 > srcend) break;
1290 c = src[5];
1291 if (c >= '0' && c <= '9') { c -= '0'; }
1292 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1293 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1294 else {
1295 src += 5; /* skip incomplete \u sequence */
1296 continue;
1297 }
1298 ch |= (cint)c;
1299 if (ch < char_code_limit && !(ch >= 0xd800 && ch < 0xe000)) {
1300 if (dest==destend) break;
1301 *dest++ = as_chart(ch);
1302 src += 6; /* complete \u sequence */
1303 continue;
1304 }
1305 if (!(ch >= 0xd800 && ch < 0xdc00)) {
1306 src += 6; /* skip invalid \u sequence */
1307 continue;
1308 }
1309 var cint ch1 = ch;
1310 if (src+7 > srcend) break;
1311 if (src[6] != '\\') {
1312 src += 6; /* skip incomplete \u sequence */
1313 continue;
1314 }
1315 if (src+8 > srcend) break;
1316 if (src[7] != 'u') {
1317 src += 6; /* skip incomplete \u sequence */
1318 continue;
1319 }
1320 if (src+9 > srcend) break;
1321 c = src[8];
1322 if (c >= '0' && c <= '9') { c -= '0'; }
1323 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1324 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1325 else {
1326 src += 8; /* skip incomplete \u sequence */
1327 continue;
1328 }
1329 ch = (cint)c << 12;
1330 if (src+10 > srcend) break;
1331 c = src[9];
1332 if (c >= '0' && c <= '9') { c -= '0'; }
1333 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1334 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1335 else {
1336 src += 9; /* skip incomplete \u sequence */
1337 continue;
1338 }
1339 ch |= (cint)c << 8;
1340 if (src+11 > srcend) break;
1341 c = src[10];
1342 if (c >= '0' && c <= '9') { c -= '0'; }
1343 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1344 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1345 else {
1346 src += 10; /* skip incomplete \u sequence */
1347 continue;
1348 }
1349 ch |= (cint)c << 4;
1350 if (src+12 > srcend) break;
1351 c = src[11];
1352 if (c >= '0' && c <= '9') { c -= '0'; }
1353 else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1354 else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1355 else {
1356 src += 11; /* skip incomplete \u sequence */
1357 continue;
1358 }
1359 ch |= (cint)c;
1360 if (ch >= 0xdc00 && ch < 0xe000) {
1361 ch = 0x10000 + ((ch1 - 0xd800) << 10) + (ch - 0xdc00);
1362 if (ch < char_code_limit) {
1363 if (dest==destend) break;
1364 *dest++ = as_chart(ch);
1365 src += 12; /* complete \u sequence */
1366 continue;
1367 }
1368 }
1369 src += 6; /* skip invalid \u sequence */
1370 continue;
1371 }
1372 *srcp = src;
1373 *destp = dest;
1374 }
1375
1376 /* Characters to bytes. */
1377
java_wcslen(object encoding,const chart * src,const chart * srcend)1378 global uintL java_wcslen (object encoding, const chart* src,
1379 const chart* srcend) {
1380 unused(encoding);
1381 var uintL destlen = 0;
1382 while (src < srcend) {
1383 var cint ch = as_cint(*src++);
1384 destlen += (ch < 0x80 ? 1 : ch < 0x10000 ? 6 : 12);
1385 }
1386 return destlen;
1387 }
1388
java_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1389 global void java_wcstombs (object encoding, object stream, const chart* *srcp,
1390 const chart* srcend, uintB* *destp,
1391 uintB* destend) {
1392 unused(encoding); unused(stream);
1393 var const chart* src = *srcp;
1394 var uintB* dest = *destp;
1395 while (src < srcend) {
1396 local char const hex_table_lc[] = "0123456789abcdef"; /* lowercase! */
1397 var cint ch = as_cint(*src);
1398 var uintL count = (ch < 0x80 ? 1 : ch < 0x10000 ? 6 : 12);
1399 if (dest+count > destend) break;
1400 src++;
1401 if (ch < 0x80) { /* 1 byte sequence */
1402 *dest++ = ch;
1403 } else if (ch < 0x10000) { /* 6 byte sequence */
1404 *dest++ = '\\';
1405 *dest++ = 'u';
1406 *dest++ = hex_table_lc[(ch>>12)&0x0F];
1407 *dest++ = hex_table_lc[(ch>>8)&0x0F];
1408 *dest++ = hex_table_lc[(ch>>4)&0x0F];
1409 *dest++ = hex_table_lc[ch&0x0F];
1410 } else { /* 12 byte sequence */
1411 var cint ch1 = 0xD800 + ((ch - 0x10000) >> 10);
1412 var cint ch2 = 0xDC00 + ((ch - 0x10000) & 0x3FF);
1413 *dest++ = '\\';
1414 *dest++ = 'u';
1415 *dest++ = hex_table_lc[(ch1>>12)&0x0F];
1416 *dest++ = hex_table_lc[(ch1>>8)&0x0F];
1417 *dest++ = hex_table_lc[(ch1>>4)&0x0F];
1418 *dest++ = hex_table_lc[ch1&0x0F];
1419 *dest++ = '\\';
1420 *dest++ = 'u';
1421 *dest++ = hex_table_lc[(ch2>>12)&0x0F];
1422 *dest++ = hex_table_lc[(ch2>>8)&0x0F];
1423 *dest++ = hex_table_lc[(ch2>>4)&0x0F];
1424 *dest++ = hex_table_lc[ch2&0x0F];
1425 }
1426 }
1427 *srcp = src;
1428 *destp = dest;
1429 }
1430
1431 /* -------------------------------------------------------------------------
1432 * 8-bit NLS characters sets */
1433
1434 /* min. bytes per character = 1
1435 max. bytes per character = 1 */
1436
1437 typedef struct nls_table_t {
1438 const char* charset;
1439 const unsigned char* const* page_uni2charset; /* UCS-2 to 8-bit table */
1440 const unsigned short* charset2uni; /* 8-bit to UCS-2 table */
1441 int is_ascii_extension;
1442 }
1443 #if (alignment_long < PSEUDODATA_ALIGNMENT) && defined(GNU)
1444 /* Force all XPSEUDODATAs to be allocated with a PSEUDODATA_ALIGNMENT.
1445 GC needs this. */
1446 __attribute__ ((aligned (PSEUDODATA_ALIGNMENT)))
1447 #endif
1448 nls_table_t;
1449
1450 static const unsigned char nopage[256] = {
1451 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
1452 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
1453 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
1454 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
1455 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
1456 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
1457 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
1458 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
1459 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
1460 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
1461 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
1462 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
1463 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
1464 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
1465 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
1466 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
1467 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
1468 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
1469 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
1470 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
1471 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
1472 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
1473 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
1474 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
1475 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
1476 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
1477 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
1478 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
1479 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
1480 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
1481 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
1482 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 /* 0xf8-0xff */
1483 };
1484
1485 #include "nls_ascii.c"
1486 #include "nls_iso8859_1.c"
1487 #include "nls_iso8859_2.c"
1488 #include "nls_iso8859_3.c"
1489 #include "nls_iso8859_4.c"
1490 #include "nls_iso8859_5.c"
1491 #include "nls_iso8859_6.c"
1492 #include "nls_iso8859_7.c"
1493 #include "nls_iso8859_8.c"
1494 #include "nls_iso8859_9.c"
1495 #include "nls_iso8859_10.c"
1496 #include "nls_iso8859_13.c"
1497 #include "nls_iso8859_14.c"
1498 #include "nls_iso8859_15.c"
1499 #include "nls_iso8859_16.c"
1500 #include "nls_koi8_r.c"
1501 #include "nls_koi8_u.c"
1502 #include "nls_mac_arabic.c"
1503 #include "nls_mac_centraleurope.c"
1504 #include "nls_mac_croatian.c"
1505 #include "nls_mac_cyrillic.c"
1506 #include "nls_mac_dingbat.c"
1507 #include "nls_mac_greek.c"
1508 #include "nls_mac_hebrew.c"
1509 #include "nls_mac_iceland.c"
1510 #include "nls_mac_roman.c"
1511 #include "nls_mac_romania.c"
1512 #include "nls_mac_symbol.c"
1513 #include "nls_mac_thai.c"
1514 #include "nls_mac_turkish.c"
1515 #include "nls_mac_ukraine.c"
1516 #include "nls_cp437_ms.c"
1517 #include "nls_cp437_ibm.c"
1518 #include "nls_cp737.c"
1519 #include "nls_cp775.c"
1520 #include "nls_cp850.c"
1521 #include "nls_cp852_ms.c"
1522 #include "nls_cp852_ibm.c"
1523 #include "nls_cp855.c"
1524 #include "nls_cp857.c"
1525 #include "nls_cp860_ms.c"
1526 #include "nls_cp860_ibm.c"
1527 #include "nls_cp861_ms.c"
1528 #include "nls_cp861_ibm.c"
1529 #include "nls_cp862_ms.c"
1530 #include "nls_cp862_ibm.c"
1531 #include "nls_cp863_ms.c"
1532 #include "nls_cp863_ibm.c"
1533 #include "nls_cp864_ms.c"
1534 #include "nls_cp864_ibm.c"
1535 #include "nls_cp865_ms.c"
1536 #include "nls_cp865_ibm.c"
1537 #include "nls_cp866.c"
1538 #include "nls_cp869_ms.c"
1539 #include "nls_cp869_ibm.c"
1540 #include "nls_cp874_ms.c"
1541 #include "nls_cp874_ibm.c"
1542 #include "nls_cp1250.c"
1543 #include "nls_cp1251.c"
1544 #include "nls_cp1252.c"
1545 #include "nls_cp1253.c"
1546 #include "nls_cp1254.c"
1547 #include "nls_cp1256.c"
1548 #include "nls_cp1257.c"
1549 #include "nls_hp_roman8.c"
1550 #include "nls_jisx0201.c"
1551
1552 #define nls_first_sym S(ascii)
1553 #define nls_last_sym S(jisx0201)
1554 #define nls_num_encodings (&symbol_tab_data.S_jisx0201 - &symbol_tab_data.S_ascii + 1)
1555
1556 static const nls_table_t * const nls_tables[] = {
1557 &nls_ascii_table,
1558 &nls_iso8859_1_table,
1559 &nls_iso8859_2_table,
1560 &nls_iso8859_3_table,
1561 &nls_iso8859_4_table,
1562 &nls_iso8859_5_table,
1563 &nls_iso8859_6_table,
1564 &nls_iso8859_7_table,
1565 &nls_iso8859_8_table,
1566 &nls_iso8859_9_table,
1567 &nls_iso8859_10_table,
1568 &nls_iso8859_13_table,
1569 &nls_iso8859_14_table,
1570 &nls_iso8859_15_table,
1571 &nls_iso8859_16_table,
1572 &nls_koi8_r_table,
1573 &nls_koi8_u_table,
1574 &nls_mac_arabic_table,
1575 &nls_mac_centraleurope_table,
1576 &nls_mac_croatian_table,
1577 &nls_mac_cyrillic_table,
1578 &nls_mac_dingbat_table,
1579 &nls_mac_greek_table,
1580 &nls_mac_hebrew_table,
1581 &nls_mac_iceland_table,
1582 &nls_mac_roman_table,
1583 &nls_mac_romania_table,
1584 &nls_mac_symbol_table,
1585 &nls_mac_thai_table,
1586 &nls_mac_turkish_table,
1587 &nls_mac_ukraine_table,
1588 &nls_cp437_ms_table,
1589 &nls_cp437_ibm_table,
1590 &nls_cp737_table,
1591 &nls_cp775_table,
1592 &nls_cp850_table,
1593 &nls_cp852_ms_table,
1594 &nls_cp852_ibm_table,
1595 &nls_cp855_table,
1596 &nls_cp857_table,
1597 &nls_cp860_ms_table,
1598 &nls_cp860_ibm_table,
1599 &nls_cp861_ms_table,
1600 &nls_cp861_ibm_table,
1601 &nls_cp862_ms_table,
1602 &nls_cp862_ibm_table,
1603 &nls_cp863_ms_table,
1604 &nls_cp863_ibm_table,
1605 &nls_cp864_ms_table,
1606 &nls_cp864_ibm_table,
1607 &nls_cp865_ms_table,
1608 &nls_cp865_ibm_table,
1609 &nls_cp866_table,
1610 &nls_cp869_ms_table,
1611 &nls_cp869_ibm_table,
1612 &nls_cp874_ms_table,
1613 &nls_cp874_ibm_table,
1614 &nls_cp1250_table,
1615 &nls_cp1251_table,
1616 &nls_cp1252_table,
1617 &nls_cp1253_table,
1618 &nls_cp1254_table,
1619 &nls_cp1256_table,
1620 &nls_cp1257_table,
1621 &nls_hp_roman8_table,
1622 &nls_jisx0201_table,
1623 };
1624
1625 global uintL nls_mblen (object encoding, const uintB* src,
1626 const uintB* srcend);
1627 global void nls_mbstowcs (object encoding, object stream, const uintB* *srcp,
1628 const uintB* srcend, chart* *destp, chart* destend);
1629 global uintL nls_asciiext_mblen (object encoding, const uintB* src,
1630 const uintB* srcend);
1631 global void nls_asciiext_mbstowcs (object encoding, object stream,
1632 const uintB* *srcp, const uintB* srcend,
1633 chart* *destp, chart* destend);
1634 global uintL nls_wcslen (object encoding, const chart* src,
1635 const chart* srcend);
1636 global void nls_wcstombs (object encoding, object stream, const chart* *srcp,
1637 const chart* srcend, uintB* *destp, uintB* destend);
1638 global uintL nls_asciiext_wcslen (object encoding, const chart* src,
1639 const chart* srcend);
1640 global void nls_asciiext_wcstombs (object encoding, object stream,
1641 const chart* *srcp, const chart* srcend,
1642 uintB* *destp, uintB* destend);
1643 global object nls_range (object encoding, uintL start, uintL end, uintL maxintervals);
1644
1645 /* Bytes to characters. */
1646
1647 /* Error when an invalid byte was encountered.
1648 error_nls_invalid(encoding,b); */
error_nls_invalid(object encoding,uintB b)1649 local _Noreturn void error_nls_invalid (object encoding, uintB b) {
1650 pushSTACK(fixnum(b)); /* CHARSET-TYPE-ERROR slot DATUM */
1651 pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
1652 pushSTACK(TheEncoding(encoding)->enc_charset);
1653 pushSTACK(ascii_char(hex_table[b&0x0F]));
1654 pushSTACK(ascii_char(hex_table[(b>>4)&0x0F]));
1655 pushSTACK(TheSubr(subr_self)->name);
1656 error(charset_type_error,GETTEXT("~S: Invalid byte #x~C~C in ~S conversion"));
1657 }
1658
nls_mblen(object encoding,const uintB * src,const uintB * srcend)1659 global uintL nls_mblen (object encoding, const uintB* src,
1660 const uintB* srcend) {
1661 if (!eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
1662 return (srcend-src);
1663 else {
1664 var uintL count = srcend-src;
1665 var uintL result = 0;
1666 if (count > 0) {
1667 var const nls_table_t* table =
1668 (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1669 var const unsigned short* cvtable = table->charset2uni;
1670 dotimespL(count,count, {
1671 if (!(cvtable[*src++] == 0xFFFD))
1672 result++;
1673 });
1674 }
1675 return result;
1676 }
1677 }
1678
nls_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1679 global void nls_mbstowcs (object encoding, object stream, const uintB* *srcp,
1680 const uintB* srcend, chart* *destp, chart* destend) {
1681 unused(stream);
1682 var const uintB* src = *srcp;
1683 var chart* dest = *destp;
1684 var uintL count = destend-dest;
1685 if (count > srcend-src)
1686 count = srcend-src;
1687 if (count > 0) {
1688 var const nls_table_t* table =
1689 (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1690 var const unsigned short* cvtable = table->charset2uni;
1691 dotimespL(count,count, {
1692 var uintB b = *src++;
1693 var cint ch = cvtable[b];
1694 if (!(ch == 0xFFFD)) {
1695 *dest++ = as_chart(ch);
1696 } else {
1697 var object action = TheEncoding(encoding)->enc_towcs_error;
1698 if (eq(action,S(Kignore))) {
1699 } else if (eq(action,S(Kerror))) {
1700 error_nls_invalid(encoding,b);
1701 } else {
1702 *dest++ = char_code(action);
1703 }
1704 }
1705 });
1706 *srcp = src;
1707 *destp = dest;
1708 }
1709 }
1710
1711 /* Same thing, specially optimized for ASCII extensions. */
1712
nls_asciiext_mblen(object encoding,const uintB * src,const uintB * srcend)1713 global uintL nls_asciiext_mblen (object encoding, const uintB* src,
1714 const uintB* srcend) {
1715 if (!eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
1716 return (srcend-src);
1717 else {
1718 var uintL count = srcend-src;
1719 var uintL result = 0;
1720 if (count > 0) {
1721 var const nls_table_t* table =
1722 (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1723 var const unsigned short* cvtable = table->charset2uni;
1724 dotimespL(count,count, {
1725 var uintB b = *src++;
1726 if ((b < 0x80) || !(cvtable[b] == 0xFFFD))
1727 result++;
1728 });
1729 }
1730 return result;
1731 }
1732 }
1733
nls_asciiext_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1734 global void nls_asciiext_mbstowcs (object encoding, object stream,
1735 const uintB* *srcp, const uintB* srcend,
1736 chart* *destp, chart* destend) {
1737 unused(stream);
1738 var const uintB* src = *srcp;
1739 var chart* dest = *destp;
1740 var uintL count = destend-dest;
1741 if (count > srcend-src)
1742 count = srcend-src;
1743 if (count > 0) {
1744 var const nls_table_t* table =
1745 (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1746 var const unsigned short* cvtable = table->charset2uni;
1747 dotimespL(count,count, {
1748 var uintB b = *src++;
1749 if (b < 0x80) { /* avoid memory reference (big speedup!) */
1750 *dest++ = as_chart((cint)b);
1751 } else {
1752 var cint ch = cvtable[b];
1753 if (!(ch == 0xFFFD)) {
1754 *dest++ = as_chart(ch);
1755 } else {
1756 var object action = TheEncoding(encoding)->enc_towcs_error;
1757 if (eq(action,S(Kignore))) {
1758 } else if (eq(action,S(Kerror))) {
1759 error_nls_invalid(encoding,b);
1760 } else {
1761 *dest++ = char_code(action);
1762 }
1763 }
1764 }
1765 });
1766 *srcp = src;
1767 *destp = dest;
1768 }
1769 }
1770
1771 /* Characters to bytes. */
1772
nls_wcslen(object encoding,const chart * src,const chart * srcend)1773 global uintL nls_wcslen (object encoding, const chart* src,
1774 const chart* srcend) {
1775 var uintL count = srcend-src;
1776 var uintL result = 0;
1777 if (count > 0) {
1778 var const nls_table_t* table = (const nls_table_t*)
1779 TheMachine(TheEncoding(encoding)->enc_table);
1780 var const unsigned char* const* cvtable = table->page_uni2charset;
1781 dotimespL(count,count, {
1782 var chart ch = *src++;
1783 if (as_cint(ch) < 0x10000
1784 && (cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0
1785 || chareq(ch,ascii(0))))
1786 result++;
1787 else {
1788 var object action = TheEncoding(encoding)->enc_tombs_error;
1789 if (eq(action,S(Kignore))) {
1790 } else if (uint8_p(action)) {
1791 result++;
1792 } else if (!eq(action,S(Kerror))) {
1793 var chart c = char_code(action);
1794 if (as_cint(c) < 0x10000
1795 && (cvtable[as_cint(c)>>8][as_cint(c)&0xFF] != 0
1796 || chareq(c,ascii(0))))
1797 result++;
1798 } else
1799 error_unencodable(encoding,ch);
1800 }
1801 });
1802 }
1803 return result;
1804 }
1805
nls_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1806 global void nls_wcstombs (object encoding, object stream,
1807 const chart* *srcp, const chart* srcend,
1808 uintB* *destp, uintB* destend) {
1809 unused(stream);
1810 var const chart* src = *srcp;
1811 var uintB* dest = *destp;
1812 var uintL scount = srcend-src;
1813 var uintL dcount = destend-dest;
1814 if (scount > 0 && dcount > 0) {
1815 var const nls_table_t* table = (const nls_table_t*)
1816 TheMachine(TheEncoding(encoding)->enc_table);
1817 var const unsigned char* const* cvtable = table->page_uni2charset;
1818 do {
1819 var chart ch = *src++; scount--;
1820 var uintB b;
1821 if (as_cint(ch) < 0x10000
1822 && (b = cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF],
1823 b != 0 || chareq(ch,ascii(0)))) {
1824 *dest++ = b; dcount--;
1825 } else {
1826 var object action = TheEncoding(encoding)->enc_tombs_error;
1827 if (eq(action,S(Kignore))) {
1828 } else if (uint8_p(action)) {
1829 *dest++ = I_to_uint8(action); dcount--;
1830 } else if (!eq(action,S(Kerror))) {
1831 var chart c = char_code(action);
1832 if (as_cint(c) < 0x10000
1833 && (b = cvtable[as_cint(c)>>8][as_cint(c)&0xFF],
1834 b != 0 || chareq(c,ascii(0)))) {
1835 *dest++ = b; dcount--;
1836 } else
1837 error_unencodable(encoding,ch);
1838 } else
1839 error_unencodable(encoding,ch);
1840 }
1841 } while (scount > 0 && dcount > 0);
1842 *srcp = src;
1843 *destp = dest;
1844 }
1845 }
1846
1847 /* Same thing, specially optimized for ASCII extensions. */
1848
nls_asciiext_wcslen(object encoding,const chart * src,const chart * srcend)1849 global uintL nls_asciiext_wcslen (object encoding, const chart* src,
1850 const chart* srcend) {
1851 var uintL count = srcend-src;
1852 var uintL result = 0;
1853 if (count > 0) {
1854 var const nls_table_t* table = (const nls_table_t*)
1855 TheMachine(TheEncoding(encoding)->enc_table);
1856 var const unsigned char* const* cvtable = table->page_uni2charset;
1857 dotimespL(count,count, {
1858 var chart ch = *src++;
1859 if (as_cint(ch) < 0x80
1860 || (as_cint(ch) < 0x10000
1861 && cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0))
1862 result++;
1863 else {
1864 var object action = TheEncoding(encoding)->enc_tombs_error;
1865 if (eq(action,S(Kignore))) {
1866 } else if (uint8_p(action)) {
1867 result++;
1868 } else if (!eq(action,S(Kerror))) {
1869 var chart c = char_code(action);
1870 if (as_cint(c) < 0x10000
1871 && (cvtable[as_cint(c)>>8][as_cint(c)&0xFF] != 0
1872 || chareq(c,ascii(0))))
1873 result++;
1874 } else
1875 error_unencodable(encoding,ch);
1876 }
1877 });
1878 }
1879 return result;
1880 }
1881
nls_asciiext_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1882 global void nls_asciiext_wcstombs (object encoding, object stream,
1883 const chart* *srcp, const chart* srcend,
1884 uintB* *destp, uintB* destend) {
1885 unused(stream);
1886 var const chart* src = *srcp;
1887 var uintB* dest = *destp;
1888 var uintL scount = srcend-src;
1889 var uintL dcount = destend-dest;
1890 if (scount > 0 && dcount > 0) {
1891 var const nls_table_t* table = (const nls_table_t*)
1892 TheMachine(TheEncoding(encoding)->enc_table);
1893 var const unsigned char* const* cvtable = table->page_uni2charset;
1894 do {
1895 var chart ch = *src++; scount--;
1896 if (as_cint(ch) < 0x80) { /* avoid memory reference (big speedup!) */
1897 *dest++ = (uintB)as_cint(ch);
1898 dcount--;
1899 } else {
1900 var uintB b;
1901 if (as_cint(ch) < 0x10000
1902 && (b = cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF],
1903 b != 0)) {
1904 *dest++ = b; dcount--;
1905 } else {
1906 var object action = TheEncoding(encoding)->enc_tombs_error;
1907 if (eq(action,S(Kignore))) {
1908 } else if (uint8_p(action)) {
1909 *dest++ = I_to_uint8(action); dcount--;
1910 } else if (!eq(action,S(Kerror))) {
1911 var chart c = char_code(action);
1912 if (as_cint(c) < 0x10000
1913 && (b = cvtable[as_cint(c)>>8][as_cint(c)&0xFF],
1914 b != 0 || chareq(c,ascii(0)))) {
1915 *dest++ = b; dcount--;
1916 } else
1917 error_unencodable(encoding,ch);
1918 } else
1919 error_unencodable(encoding,ch);
1920 }
1921 }
1922 } while (scount > 0 && dcount > 0);
1923 *srcp = src;
1924 *destp = dest;
1925 }
1926 }
1927
1928 /* Determining the range of encodable characters. */
nls_range(object encoding,uintL start,uintL end,uintL maxintervals)1929 global object nls_range (object encoding, uintL start, uintL end,
1930 uintL maxintervals) {
1931 var uintL count = 0; /* number of intervals already on the STACK */
1932 /* The range lies in the BMP; no need to look beyond U+FFFF. */
1933 if (maxintervals > 0 && start < 0x10000) {
1934 if (end >= 0x10000)
1935 end = 0xFFFF;
1936 var const nls_table_t* table =
1937 (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1938 var const unsigned char* const* cvtable = table->page_uni2charset;
1939 var uintL i1;
1940 var uintL i2;
1941 var bool have_i1_i2 = false; /* [i1,i2] = interval being built */
1942 var uintL i;
1943 for (i = start;;) {
1944 /* Here i < 0x10000 and count < maxintervals. */
1945 var chart ch = as_chart(i);
1946 if (cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0
1947 || chareq(ch,ascii(0))) {
1948 /* ch encodable -> extend the interval */
1949 if (!have_i1_i2) {
1950 have_i1_i2 = true; i1 = i;
1951 }
1952 i2 = i;
1953 } else {
1954 /* ch not encodable -> finish the interval */
1955 if (have_i1_i2) {
1956 pushSTACK(code_char(as_chart(i1)));
1957 pushSTACK(code_char(as_chart(i2)));
1958 check_STACK(); count++;
1959 have_i1_i2 = false;
1960 /* If we have already produced the maximum number of intervals
1961 requested by the caller, it's of no use to search further. */
1962 if (count == maxintervals)
1963 break;
1964 }
1965 }
1966 if (i == end)
1967 break;
1968 i++;
1969 }
1970 if (have_i1_i2) {
1971 pushSTACK(code_char(as_chart(i1))); pushSTACK(code_char(as_chart(i2)));
1972 check_STACK(); count++;
1973 }
1974 }
1975 return stringof(2*count);
1976 }
1977
1978 /* -------------------------------------------------------------------------
1979 * iconv-based encodings */
1980
1981 /* They are defined in stream.d because they need to access internals of
1982 the ChannelStream. */
1983
1984 #ifdef HAVE_GOOD_ICONV
1985
1986 extern uintL iconv_mblen (object encoding, const uintB* src,
1987 const uintB* srcend);
1988 extern void iconv_mbstowcs (object encoding, object stream, const uintB* *srcp,
1989 const uintB* srcend, chart* *destp,
1990 chart* destend);
1991 extern uintL iconv_wcslen (object encoding, const chart* src,
1992 const chart* srcend);
1993 extern void iconv_wcstombs (object encoding, object stream, const chart* *srcp,
1994 const chart* srcend, uintB* *destp,
1995 uintB* destend);
1996 extern object iconv_range (object encoding, uintL start, uintL end,
1997 uintL maxintervals);
1998
1999 #if defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2))
2000
2001 #ifdef GNU_LIBICONV
2002 #define iconv_first_sym S(koi8_ru)
2003 #define iconv_last_sym S(utf_7)
2004 #define iconv_num_encodings (&symbol_tab_data.S_utf_7 - &symbol_tab_data.S_koi8_ru + 1)
2005 #else
2006 #define iconv_first_sym S(cp1255)
2007 #define iconv_last_sym S(utf_7)
2008 #define iconv_num_encodings (&symbol_tab_data.S_utf_7 - &symbol_tab_data.S_cp1255 + 1)
2009 #endif
2010
2011 #endif
2012
2013 #endif /* HAVE_GOOD_ICONV */
2014
2015 /* ----------------------------------------------------------------------- */
2016
2017 #endif /* ENABLE_UNICODE */
2018
2019 /* =========================================================================
2020 * General functions */
2021
2022 /* (MAKE-ENCODING [:charset] [:line-terminator] [:input-error-action]
2023 [:output-error-action] [:if-does-not-exist])
2024 creates a new encoding. */
2025 LISPFUN(make_encoding,seclass_read,0,0,norest,key,5,
2026 (kw(charset),kw(line_terminator),
2027 kw(input_error_action),kw(output_error_action)
2028 kw(if_does_not_exist))) {
2029 var object arg = popSTACK(); /* :if-does-not-exist */
2030 var bool ignore_not_exist = nullp(arg); /* no error */
2031 /* Check the :CHARSET argument. */
2032 arg = STACK_3;
2033 /* string -> symbol in CHARSET */
2034 if (!boundp(arg) || eq(arg,S(Kdefault))) {
2035 arg = O(default_file_encoding);
2036 #ifndef ENABLE_UNICODE
2037 if (nullp(arg)) /* initialization */
2038 goto create_new_encoding;
2039 #endif
2040 } else if (encodingp(arg)) {
2041 }
2042 #ifdef ENABLE_UNICODE
2043 else if (symbolp(arg) && constant_var_p(TheSymbol(arg))
2044 && encodingp(Symbol_value(arg))) {
2045 arg = Symbol_value(arg);
2046 } else if (stringp(arg)) {
2047 var object arg_upcase = string_upcase(arg);
2048 var object sym;
2049 arg = STACK_3; /* refetch */
2050 if (find_external_symbol(arg_upcase,false,O(charset_package),&sym)
2051 && constant_var_p(TheSymbol(sym)) && encodingp(Symbol_value(sym)))
2052 arg = Symbol_value(sym);
2053 #ifdef HAVE_GOOD_ICONV
2054 else {
2055 var bool valid_encoding_p = true;
2056 with_string_0(arg,Symbol_value(S(ascii)),charset_ascii,{
2057 valid_encoding_p =
2058 check_charset(charset_ascii,ignore_not_exist ? nullobj : arg);
2059 });
2060 /* if :IF-DOES-NOT-EXIST was non-NIL and the encoding was invalid,
2061 check_charset() would have signalled an error */
2062 if (valid_encoding_p) {
2063 pushSTACK(coerce_ss(arg));
2064 var object encoding = allocate_encoding();
2065 TheEncoding(encoding)->enc_eol = S(Kunix);
2066 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2067 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2068 TheEncoding(encoding)->enc_charset = popSTACK();
2069 TheEncoding(encoding)->enc_mblen = P(iconv_mblen);
2070 TheEncoding(encoding)->enc_mbstowcs = P(iconv_mbstowcs);
2071 TheEncoding(encoding)->enc_wcslen = P(iconv_wcslen);
2072 TheEncoding(encoding)->enc_wcstombs = P(iconv_wcstombs);
2073 TheEncoding(encoding)->enc_range = P(iconv_range);
2074 TheEncoding(encoding)->min_bytes_per_char = 1;
2075 TheEncoding(encoding)->max_bytes_per_char = max_bytes_per_chart; /* wild assumption */
2076 arg = encoding;
2077 } else {
2078 ASSERT(ignore_not_exist);
2079 arg = NIL;
2080 }
2081 }
2082 #else
2083 else error_illegal_arg(arg,S(encoding),S(Kcharset));
2084 #endif
2085 }
2086 #endif
2087 else error_illegal_arg(arg,S(encoding),S(Kcharset));
2088 STACK_3 = arg;
2089 /* Check the :LINE-TERMINATOR argument. */
2090 arg = STACK_2;
2091 if (!(!boundp(arg) || eq(arg,S(Kunix)) || eq(arg,S(Kmac)) || eq(arg,S(Kdos))))
2092 error_illegal_arg(arg,O(type_line_terminator),S(Kline_terminator));
2093 /* Check the :INPUT-ERROR-ACTION argument. */
2094 arg = STACK_1;
2095 if (!(!boundp(arg) || eq(arg,S(Kerror)) || eq(arg,S(Kignore)) || charp(arg)))
2096 error_illegal_arg(arg,O(type_input_error_action),S(Kinput_error_action));
2097 /* Check the :OUTPUT-ERROR-ACTION argument. */
2098 arg = STACK_0;
2099 if (!(!boundp(arg)
2100 || eq(arg,S(Kerror)) || eq(arg,S(Kignore))
2101 || charp(arg) || uint8_p(arg)))
2102 error_illegal_arg(arg,O(type_output_error_action),S(Koutput_error_action));
2103 /* Create a new encoding. */
2104 if (nullp(STACK_3) /* illegal charset & :IF-DOES-NOT-EXIST NIL */
2105 || ((!boundp(STACK_2) || eq(STACK_2,TheEncoding(STACK_3)->enc_eol))
2106 && (!boundp(STACK_1)
2107 || eq(STACK_1,TheEncoding(STACK_3)->enc_towcs_error))
2108 && (!boundp(STACK_0)
2109 || eq(STACK_0,TheEncoding(STACK_3)->enc_tombs_error)))) {
2110 VALUES1(STACK_3);
2111 } else create_new_encoding: {
2112 var object encoding = allocate_encoding();
2113 var object old_encoding = STACK_3;
2114 if (encodingp(old_encoding)) {
2115 var const gcv_object_t* ptr1 = &TheRecord(old_encoding)->recdata[0];
2116 var gcv_object_t* ptr2 = &TheRecord(encoding)->recdata[0];
2117 var uintC count = encoding_length;
2118 do { *ptr2++ = *ptr1++; } while (--count);
2119 if (encoding_xlength > 0)
2120 copy_mem_b(ptr2,ptr1,encoding_xlength);
2121 }
2122 if (boundp(STACK_2))
2123 TheEncoding(encoding)->enc_eol = STACK_2;
2124 if (boundp(STACK_1))
2125 TheEncoding(encoding)->enc_towcs_error = STACK_1;
2126 if (boundp(STACK_0))
2127 TheEncoding(encoding)->enc_tombs_error = STACK_0;
2128 VALUES1(encoding);
2129 }
2130 skipSTACK(4);
2131 }
2132
2133 /* (SYSTEM::ENCODINGP object) */
2134 LISPFUNNF(encodingp,1) {
2135 var object arg = popSTACK();
2136 VALUES_IF(encodingp(arg));
2137 }
2138
2139 #ifdef ENABLE_UNICODE
2140 #define DEFAULT_ENC &O(misc_encoding)
2141 #else
2142 #define DEFAULT_ENC &O(default_file_encoding)
2143 #endif
2144
2145 /* (SYSTEM::CHARSET-TYPEP object encoding) tests whether the object
2146 is a character belonging to the given character set. */
2147 LISPFUNNR(charset_typep,2) {
2148 var object encoding = check_encoding(STACK_0,DEFAULT_ENC,false);
2149 var object obj = STACK_1;
2150 if (charp(obj)) {
2151 #ifdef ENABLE_UNICODE
2152 var uintL i = as_cint(char_code(obj));
2153 obj = Encoding_range(encoding)(encoding,i,i,1);
2154 VALUES_IF(Sstring_length(obj));
2155 #else
2156 VALUES1(T);
2157 #endif
2158 } else {
2159 VALUES1(NIL);
2160 }
2161 skipSTACK(2);
2162 }
2163
2164 /* (EXT:ENCODING-LINE-TERMINATOR encoding) --> :UNIX/:DOS/:MAC */
2165 LISPFUNNF(encoding_line_terminator,1) {
2166 var object encoding = check_encoding(popSTACK(),DEFAULT_ENC,false);
2167 VALUES1(TheEncoding(encoding)->enc_eol);
2168 }
2169
2170 #ifdef ENABLE_UNICODE
2171
2172 /* (EXT:ENCODING-CHARSET encoding) --> charset */
2173 LISPFUNNF(encoding_charset,1) {
2174 var object encoding = check_encoding(popSTACK(),DEFAULT_ENC,false);
2175 VALUES1(TheEncoding(encoding)->enc_charset);
2176 }
2177
2178 /* (SYSTEM::CHARSET-RANGE encoding char1 char2 [maxintervals])
2179 returns the range of characters in [char1,char2] encodable in the encoding. */
2180 LISPFUN(charset_range,seclass_read,3,1,norest,nokey,0,NIL) {
2181 var object encoding = check_encoding(STACK_3,DEFAULT_ENC,false);
2182 if (!charp(STACK_2)) STACK_2 = check_char(STACK_2);
2183 if (!charp(STACK_1)) STACK_1 = check_char(STACK_1);
2184 var uintL i1 = as_cint(char_code(STACK_2));
2185 var uintL i2 = as_cint(char_code(STACK_1));
2186 var uintL maxintervals;
2187 if (missingp(STACK_0))
2188 maxintervals = ~(uintL)0;
2189 else if (uint32_p(STACK_0))
2190 maxintervals = I_to_uint32(STACK_0);
2191 else
2192 error_uint32(STACK_0);
2193 VALUES1(i1 <= i2 ?
2194 Encoding_range(encoding)(encoding,i1,i2,maxintervals) :
2195 (object)O(empty_string));
2196 skipSTACK(4);
2197 }
2198
2199 #endif
2200
2201 /* -------------------------------------------------------------------------
2202 * Elementary string functions */
2203
2204 /* UP: return a LISP string the given contents.
2205 n_char_to_string(charptr,len,encoding)
2206 > char* charptr: the address of the character sequence
2207 > uintL len: its length
2208 > object encoding: Encoding
2209 < return: normal-simple-string with len characters as content
2210 can trigger GC */
2211 #ifdef ENABLE_UNICODE
n_char_to_string(const char * srcptr,uintL blen,object encoding)2212 modexp maygc object n_char_to_string
2213 (const char* srcptr, uintL blen, object encoding) {
2214 var const uintB* bptr = (const uintB*)srcptr;
2215 var const uintB* bendptr = bptr+blen;
2216 var uintL clen = Encoding_mblen(encoding)(encoding,bptr,bendptr);
2217 pushSTACK(encoding);
2218 check_stringsize(clen);
2219 var object obj = allocate_string(clen);
2220 encoding = popSTACK();
2221 {
2222 var chart* cptr = &TheSnstring(obj)->data[0];
2223 var chart* cendptr = cptr+clen;
2224 Encoding_mbstowcs(encoding)(encoding,nullobj,&bptr,bendptr,&cptr,cendptr);
2225 ASSERT(cptr == cendptr);
2226 }
2227 return obj;
2228 }
2229 #else
n_char_to_string_(const char * srcptr,uintL len)2230 modexp maygc object n_char_to_string_ (const char* srcptr, uintL len) {
2231 var const uintB* bptr = (const uintB*)srcptr;
2232 check_stringsize(len);
2233 var object obj = allocate_string(len);
2234 if (len > 0) {
2235 var chart* ptr = &TheSnstring(obj)->data[0];
2236 /* copy bptr to ptr as characters: */
2237 dotimespL(len,len, { *ptr++ = as_chart(*bptr++); } );
2238 }
2239 return obj;
2240 }
2241 #endif
2242
2243 /* UP: Convert an ASCIZ string to a LISP string
2244 asciz_to_string(asciz,encoding)
2245 ascii_to_string(asciz)
2246 > char* asciz: ASCIZ-String (NULL-terminated)
2247 > object encoding: Encoding
2248 < return: normal-simple-string the same string (without NULL)
2249 can trigger GC */
2250 #ifdef ENABLE_UNICODE
asciz_to_string(const char * asciz,object encoding)2251 modexp maygc object asciz_to_string (const char * asciz, object encoding) {
2252 return n_char_to_string(asciz,asciz_length(asciz),encoding);
2253 }
2254 #else
asciz_to_string_(const char * asciz)2255 modexp maygc object asciz_to_string_ (const char * asciz) {
2256 return n_char_to_string_(asciz,asciz_length(asciz));
2257 }
2258 #endif
ascii_to_string(const char * asciz)2259 modexp maygc object ascii_to_string (const char * asciz) {
2260 var const uintB* bptr = (const uintB*)asciz;
2261 var uintL len = asciz_length(asciz);
2262 check_stringsize(len);
2263 var object obj = allocate_s8string(len); /* allocate string */
2264 if (len > 0) {
2265 var cint8* ptr = &TheS8string(obj)->data[0];
2266 /* copy as characters bptr --> ptr: */
2267 dotimespL(len,len, {
2268 var uintB b = *bptr++;
2269 ASSERT(b < 0x80);
2270 *ptr++ = (cint8)b;
2271 });
2272 }
2273 DBGREALLOC(obj);
2274 return obj;
2275 }
2276
2277 /* UP: Convert a LISP string to an ASCIZ string.
2278 string_to_asciz(obj,encoding)
2279 > object obj: String
2280 > object encoding: Encoding
2281 < return: simple-bit-vector with the bytes<==characters and a NULL at the end
2282 < TheAsciz(result): address of the byte sequence contain therein
2283 can trigger GC */
2284 #ifdef ENABLE_UNICODE
string_to_asciz(object obj,object encoding)2285 modexp maygc object string_to_asciz (object obj, object encoding) {
2286 var uintL len;
2287 var uintL offset;
2288 var object string = unpack_string_ro(obj,&len,&offset);
2289 var const chart* srcptr;
2290 unpack_sstring_alloca(string,len,offset, srcptr=);
2291 var uintL bytelen = cslen(encoding,srcptr,len);
2292 pushSTACK(encoding);
2293 pushSTACK(string);
2294 var object newasciz = allocate_bit_vector(Atype_8Bit,bytelen+1);
2295 string = popSTACK();
2296 encoding = popSTACK();
2297 unpack_sstring_alloca(string,len,offset, srcptr=);
2298 cstombs(encoding,srcptr,len,&TheSbvector(newasciz)->data[0],bytelen);
2299 TheSbvector(newasciz)->data[bytelen] = '\0';
2300 return newasciz;
2301 }
2302 #else
string_to_asciz_(object obj)2303 modexp maygc object string_to_asciz_ (object obj) {
2304 pushSTACK(obj); /* save string */
2305 var object newasciz = allocate_bit_vector(Atype_8Bit,vector_length(obj)+1);
2306 obj = popSTACK(); /* restore string */
2307 {
2308 var uintL len;
2309 var uintL offset;
2310 var object string = unpack_string_ro(obj,&len,&offset);
2311 var const chart* sourceptr;
2312 unpack_sstring_alloca(string,len,offset, sourceptr=);
2313 /* source-string: length in len, bytes at sourceptr */
2314 var uintB* destptr = &TheSbvector(newasciz)->data[0];
2315 /* destination-string: bytes at destptr */
2316 { /* copy loop: */
2317 var uintL count;
2318 dotimesL(count,len, { *destptr++ = as_cint(*sourceptr++); } );
2319 *destptr++ = '\0'; /* append NULL byte */
2320 }
2321 }
2322 return newasciz;
2323 }
2324 #endif
2325
2326 /* =========================================================================
2327 * Initialization */
2328
2329 /* Initialize the encodings.
2330 init_encodings(); */
init_encodings_1(void)2331 global void init_encodings_1 (void) {
2332 /* Compile-time checks: */
2333 ASSERT(sizeof(chart) == sizeof(cint));
2334 #ifdef ENABLE_UNICODE
2335 {
2336 var object symbol = S(base64);
2337 var object encoding = allocate_encoding();
2338 TheEncoding(encoding)->enc_eol = S(Kunix);
2339 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2340 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2341 TheEncoding(encoding)->enc_charset = symbol;
2342 TheEncoding(encoding)->enc_mblen = P(base64_mblen);
2343 TheEncoding(encoding)->enc_mbstowcs = P(base64_mbstowcs);
2344 TheEncoding(encoding)->enc_wcslen = P(base64_wcslen);
2345 TheEncoding(encoding)->enc_wcstombs = P(base64_wcstombs);
2346 TheEncoding(encoding)->enc_range = P(base64_range);
2347 TheEncoding(encoding)->min_bytes_per_char = 2; /* ?? */
2348 TheEncoding(encoding)->max_bytes_per_char = 2; /* ?? */
2349 define_constant(symbol,encoding);
2350 }
2351 {
2352 var object symbol = S(unicode_16_big_endian);
2353 var object encoding = allocate_encoding();
2354 TheEncoding(encoding)->enc_eol = S(Kunix);
2355 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2356 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2357 TheEncoding(encoding)->enc_charset = symbol;
2358 TheEncoding(encoding)->enc_mblen = P(uni16_mblen);
2359 TheEncoding(encoding)->enc_mbstowcs = P(uni16be_mbstowcs);
2360 TheEncoding(encoding)->enc_wcslen = P(uni16_wcslen);
2361 TheEncoding(encoding)->enc_wcstombs = P(uni16be_wcstombs);
2362 TheEncoding(encoding)->enc_range = P(bmp_range);
2363 TheEncoding(encoding)->min_bytes_per_char = 2;
2364 TheEncoding(encoding)->max_bytes_per_char = 2;
2365 define_constant(symbol,encoding);
2366 }
2367 {
2368 var object symbol = S(unicode_16_little_endian);
2369 var object encoding = allocate_encoding();
2370 TheEncoding(encoding)->enc_eol = S(Kunix);
2371 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2372 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2373 TheEncoding(encoding)->enc_charset = symbol;
2374 TheEncoding(encoding)->enc_mblen = P(uni16_mblen);
2375 TheEncoding(encoding)->enc_mbstowcs = P(uni16le_mbstowcs);
2376 TheEncoding(encoding)->enc_wcslen = P(uni16_wcslen);
2377 TheEncoding(encoding)->enc_wcstombs = P(uni16le_wcstombs);
2378 TheEncoding(encoding)->enc_range = P(bmp_range);
2379 TheEncoding(encoding)->min_bytes_per_char = 2;
2380 TheEncoding(encoding)->max_bytes_per_char = 2;
2381 define_constant(symbol,encoding);
2382 }
2383 {
2384 var object symbol = S(unicode_32_big_endian);
2385 var object encoding = allocate_encoding();
2386 TheEncoding(encoding)->enc_eol = S(Kunix);
2387 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2388 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2389 TheEncoding(encoding)->enc_charset = symbol;
2390 TheEncoding(encoding)->enc_mblen = P(uni32be_mblen);
2391 TheEncoding(encoding)->enc_mbstowcs = P(uni32be_mbstowcs);
2392 TheEncoding(encoding)->enc_wcslen = P(uni32_wcslen);
2393 TheEncoding(encoding)->enc_wcstombs = P(uni32be_wcstombs);
2394 TheEncoding(encoding)->enc_range = P(all_range);
2395 TheEncoding(encoding)->min_bytes_per_char = 4;
2396 TheEncoding(encoding)->max_bytes_per_char = 4;
2397 define_constant(symbol,encoding);
2398 }
2399 {
2400 var object symbol = S(unicode_32_little_endian);
2401 var object encoding = allocate_encoding();
2402 TheEncoding(encoding)->enc_eol = S(Kunix);
2403 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2404 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2405 TheEncoding(encoding)->enc_charset = symbol;
2406 TheEncoding(encoding)->enc_mblen = P(uni32le_mblen);
2407 TheEncoding(encoding)->enc_mbstowcs = P(uni32le_mbstowcs);
2408 TheEncoding(encoding)->enc_wcslen = P(uni32_wcslen);
2409 TheEncoding(encoding)->enc_wcstombs = P(uni32le_wcstombs);
2410 TheEncoding(encoding)->enc_range = P(all_range);
2411 TheEncoding(encoding)->min_bytes_per_char = 4;
2412 TheEncoding(encoding)->max_bytes_per_char = 4;
2413 define_constant(symbol,encoding);
2414 }
2415 {
2416 var object symbol = S(utf_8);
2417 var object encoding = allocate_encoding();
2418 TheEncoding(encoding)->enc_eol = S(Kunix);
2419 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2420 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2421 TheEncoding(encoding)->enc_charset = symbol;
2422 TheEncoding(encoding)->enc_mblen = P(utf8_mblen);
2423 TheEncoding(encoding)->enc_mbstowcs = P(utf8_mbstowcs);
2424 TheEncoding(encoding)->enc_wcslen = P(utf8_wcslen);
2425 TheEncoding(encoding)->enc_wcstombs = P(utf8_wcstombs);
2426 TheEncoding(encoding)->enc_range = P(all_range);
2427 TheEncoding(encoding)->min_bytes_per_char = 1;
2428 TheEncoding(encoding)->max_bytes_per_char = 4;
2429 define_constant(symbol,encoding);
2430 }
2431 {
2432 var object symbol = S(java);
2433 var object encoding = allocate_encoding();
2434 TheEncoding(encoding)->enc_eol = S(Kunix);
2435 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2436 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2437 TheEncoding(encoding)->enc_charset = symbol;
2438 TheEncoding(encoding)->enc_mblen = P(java_mblen);
2439 TheEncoding(encoding)->enc_mbstowcs = P(java_mbstowcs);
2440 TheEncoding(encoding)->enc_wcslen = P(java_wcslen);
2441 TheEncoding(encoding)->enc_wcstombs = P(java_wcstombs);
2442 TheEncoding(encoding)->enc_range = P(all_range);
2443 TheEncoding(encoding)->min_bytes_per_char = 1;
2444 TheEncoding(encoding)->max_bytes_per_char = 12;
2445 define_constant(symbol,encoding);
2446 }
2447 #endif
2448 }
init_encodings_2(void)2449 global void init_encodings_2 (void) {
2450 #ifdef ENABLE_UNICODE
2451 {
2452 var object symbol = nls_first_sym;
2453 var const nls_table_t * const * ptr = &nls_tables[0];
2454 var uintC count = sizeof(nls_tables)/sizeof(nls_tables[0]);
2455 ASSERT(nls_num_encodings == count);
2456 for (; count > 0; count--) {
2457 var object encoding = allocate_encoding();
2458 TheEncoding(encoding)->enc_eol = S(Kunix);
2459 TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2460 TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2461 TheEncoding(encoding)->enc_charset = symbol;
2462 if ((*ptr)->is_ascii_extension) {
2463 TheEncoding(encoding)->enc_mblen = P(nls_asciiext_mblen);
2464 TheEncoding(encoding)->enc_mbstowcs = P(nls_asciiext_mbstowcs);
2465 TheEncoding(encoding)->enc_wcslen = P(nls_asciiext_wcslen);
2466 TheEncoding(encoding)->enc_wcstombs = P(nls_asciiext_wcstombs);
2467 } else {
2468 TheEncoding(encoding)->enc_mblen = P(nls_mblen);
2469 TheEncoding(encoding)->enc_mbstowcs = P(nls_mbstowcs);
2470 TheEncoding(encoding)->enc_wcslen = P(nls_wcslen);
2471 TheEncoding(encoding)->enc_wcstombs = P(nls_wcstombs);
2472 }
2473 TheEncoding(encoding)->enc_range = P(nls_range);
2474 TheEncoding(encoding)->enc_table = make_machine(*ptr);
2475 TheEncoding(encoding)->min_bytes_per_char = 1;
2476 TheEncoding(encoding)->max_bytes_per_char = 1;
2477 define_constant(symbol,encoding);
2478 symbol = objectplus(symbol,(soint)sizeof(*TheSymbol(symbol))<<(oint_addr_shift-addr_shift));
2479 ptr++;
2480 }
2481 }
2482 /* Now some aliases. */
2483 define_constant(S(unicode_16),Symbol_value(S(unicode_16_big_endian))); /* network byte order = big endian */
2484 define_constant(S(unicode_32),Symbol_value(S(unicode_32_big_endian))); /* network byte order = big endian */
2485 define_constant(S(ucs_2),Symbol_value(S(unicode_16)));
2486 define_constant(S(ucs_4),Symbol_value(S(unicode_32)));
2487 define_constant(S(macintosh),Symbol_value(S(mac_roman)));
2488 define_constant(S(windows_1250),Symbol_value(S(cp1250)));
2489 define_constant(S(windows_1251),Symbol_value(S(cp1251)));
2490 define_constant(S(windows_1252),Symbol_value(S(cp1252)));
2491 define_constant(S(windows_1253),Symbol_value(S(cp1253)));
2492 define_constant(S(windows_1254),Symbol_value(S(cp1254)));
2493 define_constant(S(windows_1256),Symbol_value(S(cp1256)));
2494 define_constant(S(windows_1257),Symbol_value(S(cp1257)));
2495 #if defined(HAVE_GOOD_ICONV) && (defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)))
2496 {
2497 var object symbol = iconv_first_sym;
2498 var uintC count = iconv_num_encodings;
2499 for (; count > 0; count--) {
2500 pushSTACK(Symbol_name(symbol)); pushSTACK(unbound);
2501 pushSTACK(unbound); pushSTACK(unbound); pushSTACK(NIL);
2502 C_make_encoding(); /* cannot use funcall yet */
2503 if (nullp(value1)) {
2504 pushSTACK(symbol); pushSTACK(O(charset_package)); C_unintern();
2505 } else define_constant(symbol,value1);
2506 symbol = objectplus(symbol,(soint)sizeof(*TheSymbol(symbol))<<(oint_addr_shift-addr_shift));
2507 }
2508 }
2509 /* Now some aliases, to be defined only if their targets still exit. */
2510 if (!boundp(Symbol_value(S(cp1255)))) {
2511 pushSTACK(S(windows_1255)); pushSTACK(O(charset_package)); C_unintern();
2512 } else define_constant(S(windows_1255),Symbol_value(S(cp1255)));
2513 if (!boundp(Symbol_value(S(cp1258)))) {
2514 pushSTACK(S(windows_1258)); pushSTACK(O(charset_package)); C_unintern();
2515 } else define_constant(S(windows_1258),Symbol_value(S(cp1258)));
2516 #endif
2517 /* Initialize O(internal_encoding): */
2518 pushSTACK(Symbol_value(S(utf_8))); /* :charset */
2519 pushSTACK(S(Kunix)); /* :line-terminator */
2520 pushSTACK(unbound); /* :input-error-action */
2521 pushSTACK(unbound); /* :output-error-action */
2522 pushSTACK(unbound); /* :if-does-not-exist */
2523 C_make_encoding();
2524 O(internal_encoding) = value1;
2525 #endif
2526 /* Initialize locale dependent encodings: */
2527 init_dependent_encodings();
2528 }
2529
2530 #ifdef ENABLE_UNICODE
2531 /* convert the encoding name to its canonical form
2532 at this time, just upper-case the encoding name
2533 in the future, might insert/delete `-' &c
2534 (do not use toupper() because we know that encoding name is ASCII) */
canonicalize_encoding(const char * const_encoding)2535 local char *canonicalize_encoding (const char *const_encoding) {
2536 var char* encoding = strdup(const_encoding); /* copy into writeable memory */
2537 var char* p;
2538 for (p = encoding; *p != '\0'; p++)
2539 if (*p >= 'a' && *p <= 'z')
2540 *p += 'A' - 'a';
2541 return encoding;
2542 }
2543 #endif
2544
2545 /* The reasonable 1:1 default.
2546 Rationale: this avoids random encoding errors,
2547 e.g., when DIRECTORY is called on startup and the user home dir
2548 contains files with non-ASCII names
2549 The right default encoding is the one defined by the CLISP
2550 CODE-CHAR/CHAR-CODE conversion on the first 255 bytes, i.e., ISO-8859-1 */
2551 #define DEFAULT_1_1_ENCODING Symbol_value(S(iso8859_1))
2552 #define DEFAULT_1_1_ENCODING_NAME "ISO-8859-1"
2553
2554 /* Returns an encoding specified by a name.
2555 The line-termination is OS dependent.
2556 encoding_from_name(name,context)
2557 > const char* name: Any of the canonical names returned by the locale_charset()
2558 function.
2559 > const char* context: for warnings
2560 > STACK_0 : if context /= locale and name does not make an encoding
2561 - the default locale encoding
2562 can trigger GC */
encoding_from_name(const char * const_name,const char * context)2563 local maygc object encoding_from_name (const char* const_name, const char* context) {
2564 #ifdef ENABLE_UNICODE
2565 /* Attempt to use the character set implicitly specified by the locale. */
2566 var char* name = canonicalize_encoding(const_name);
2567 if (asciz_equal(name,"US-ASCII") || asciz_equal(name,"ANSI_X3.4-1968"))
2568 pushSTACK(Symbol_value(S(ascii)));
2569 #if defined(HAVE_GOOD_ICONV) && (defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)))
2570 else if (asciz_equal(name,"GB2312"))
2571 pushSTACK(Symbol_value(S(euc_cn)));
2572 else if (asciz_equal(name,"SJIS"))
2573 pushSTACK(Symbol_value(S(shift_jis)));
2574 #endif
2575 else if (asciz_equal(name,"1:1") || asciz_equal(name,"8BIT"))
2576 pushSTACK(DEFAULT_1_1_ENCODING);
2577 else {
2578 pushSTACK(asciz_to_string(name,Symbol_value(S(ascii))));
2579 pushSTACK(O(charset_package));
2580 C_find_symbol();
2581 if (!nullp(value2) && encodingp(Symbol_value(value1)))
2582 pushSTACK(Symbol_value(value1));
2583 else if (asciz_equal(context,"locale")) { /* e.g., name=ISO8859-1 */
2584 fprintf(stderr,GETTEXT("WARNING: %s: no encoding %s, using %s"),
2585 context,name,DEFAULT_1_1_ENCODING_NAME);
2586 fprint(stderr,"\n");
2587 pushSTACK(DEFAULT_1_1_ENCODING);
2588 } else {
2589 fprintf(stderr,GETTEXT("WARNING: %s: no encoding %s, using %s"),
2590 context,name,"locale encoding");
2591 fprint(stderr,"\n");
2592 pushSTACK(STACK_0);
2593 }
2594 }
2595 #else
2596 unused const_name; unused context;
2597 pushSTACK(unbound); /* :charset */
2598 #endif /* ENABLE_UNICODE */
2599 #if defined(WIN32) || (defined(UNIX) && (O_BINARY != 0) && !defined(UNIX_CYGWIN))
2600 pushSTACK(S(Kdos)); /* :line-terminator */
2601 #else
2602 pushSTACK(S(Kunix)); /* :line-terminator */
2603 #endif
2604 pushSTACK(unbound); /* :input-error-action */
2605 pushSTACK(unbound); /* :output-error-action */
2606 pushSTACK(unbound); /* :if-does-not-exist */
2607 C_make_encoding();
2608 return value1;
2609 }
2610
2611 /* Initialize the encodings which depend on environment variables.
2612 init_dependent_encodings(); */
init_dependent_encodings(void)2613 global void init_dependent_encodings(void) {
2614 #ifdef ENABLE_UNICODE
2615 extern const char* locale_encoding; /* GNU locale encoding canonical name */
2616 extern const char* argv_encoding_file; /* override *DEFAULT-FILE-ENCODING* */
2617 #ifndef CONSTANT_PATHNAME_ENCODING
2618 extern const char* argv_encoding_pathname; /* override *PATHNAME-ENCODING* */
2619 #endif
2620 extern const char* argv_encoding_terminal; /* override *terminal-encoding* */
2621 extern const char* argv_encoding_foreign; /* override *foreign-encoding* */
2622 extern const char* argv_encoding_misc; /* override *misc-encoding* */
2623 begin_system_call();
2624 locale_encoding = locale_charset(); /* depends on environment variables */
2625 end_system_call();
2626 pushSTACK(encoding_from_name(locale_encoding,"locale"));
2627 /* Initialize each encoding as follows: If the corresponding -E....
2628 option was not given, use the locale dependent locale_charset().
2629 If it was given, use that, and if the specified encoding was invalid,
2630 use a default encoding that does not depend on the locale. */
2631 O(default_file_encoding) =
2632 (argv_encoding_file == NULL ? (object)STACK_0
2633 : encoding_from_name(argv_encoding_file,"*DEFAULT-FILE-ENCODING*"));
2634 O(pathname_encoding) =
2635 #ifdef CONSTANT_PATHNAME_ENCODING
2636 CONSTANT_PATHNAME_ENCODING;
2637 #else
2638 (argv_encoding_pathname == NULL ? (object)STACK_0
2639 : encoding_from_name(argv_encoding_pathname,"*PATHNAME-ENCODING*"));
2640 #endif
2641 #if defined(WIN32_NATIVE)
2642 /* cf src/glllib/localcharset.c locale_charset() */
2643 if (argv_encoding_terminal == NULL) {
2644 var char buf[2+10+1];
2645 sprintf(buf,"CP%u",GetOEMCP());
2646 O(terminal_encoding) = encoding_from_name(buf,"*TERMINAL-ENCODING*");
2647 } else
2648 O(terminal_encoding) =
2649 encoding_from_name(argv_encoding_terminal,"*TERMINAL-ENCODING*");
2650 #else
2651 O(terminal_encoding) =
2652 (argv_encoding_terminal == NULL ? (object)STACK_0
2653 : encoding_from_name(argv_encoding_terminal,"*TERMINAL-ENCODING*"));
2654 #endif
2655 #if defined(HAVE_FFI)
2656 O(foreign_encoding) =
2657 (argv_encoding_foreign == NULL ? (object)STACK_0
2658 : encoding_from_name(argv_encoding_foreign,"*FOREIGN-ENCODING*"));
2659 O(foreign_8bit_encoding) =
2660 (TheEncoding(O(foreign_encoding))->max_bytes_per_char == 1
2661 ? O(foreign_encoding)
2662 /* not DEFAULT_1_1_ENCODING because foreign_8bit_encoding must agree
2663 with foreign_encoding and only ASCII agrees with _all_ encodings */
2664 : Symbol_value(S(ascii)));
2665 #endif
2666 O(misc_encoding) =
2667 (argv_encoding_misc == NULL ? (object)STACK_0
2668 : encoding_from_name(argv_encoding_misc,"*MISC-ENCODING*"));
2669 skipSTACK(1);
2670 #else /* no ENABLE_UNICODE */
2671 O(default_file_encoding) = encoding_from_name(NULL,NULL);
2672 O(terminal_encoding) = encoding_from_name(NULL,NULL);
2673 #endif
2674 }
2675
2676 /* =========================================================================
2677 * Accessors */
2678
2679 /* (SYSTEM::DEFAULT-FILE-ENCODING) */
2680 LISPFUNNR(default_file_encoding,0) {
2681 VALUES1(O(default_file_encoding));
2682 }
2683
2684 /* (SYSTEM::SET-DEFAULT-FILE-ENCODING encoding) */
2685 LISPFUNN(set_default_file_encoding,1) {
2686 var object encoding =
2687 check_encoding(popSTACK(),&O(default_file_encoding),false);
2688 VALUES1(O(default_file_encoding) = encoding);
2689 }
2690
2691 #ifdef ENABLE_UNICODE
2692
2693 /* (SYSTEM::PATHNAME-ENCODING) */
2694 LISPFUNNR(pathname_encoding,0) {
2695 VALUES1(O(pathname_encoding));
2696 }
2697
2698 /* (SYSTEM::SET-PATHNAME-ENCODING encoding) */
2699 LISPFUNN(set_pathname_encoding,1) {
2700 var object encoding = check_encoding(popSTACK(),&O(pathname_encoding),false);
2701 #ifdef CONSTANT_PATHNAME_ENCODING
2702 if (eq(encoding,O(pathname_encoding))) VALUES1(encoding);
2703 else {
2704 pushSTACK(CONSTANT_PATHNAME_ENCODING);
2705 error(error_condition,GETTEXT("*PATHNAME_ENCODING* on this platform can only be ~S"));
2706 }
2707 #else
2708 VALUES1(O(pathname_encoding) = encoding);
2709 #endif
2710 }
2711
2712 /* (SYSTEM::TERMINAL-ENCODING) */
2713 LISPFUNNR(terminal_encoding,0) {
2714 VALUES1(O(terminal_encoding));
2715 }
2716
2717 /* (SYSTEM::SET-TERMINAL-ENCODING encoding) */
2718 LISPFUNN(set_terminal_encoding,1) {
2719 var object encoding = check_encoding(STACK_0,&O(terminal_encoding),false);
2720 STACK_0 = encoding;
2721 /* Ensure O(terminal_encoding) = (STREAM-EXTERNAL-FORMAT *TERMINAL-IO*).
2722 But first modify (STREAM-EXTERNAL-FORMAT *TERMINAL-IO*): */
2723 set_terminalstream_external_format(var_stream(S(terminal_io),0),encoding);
2724 VALUES1(O(terminal_encoding) = popSTACK());
2725 }
2726
2727 #if defined(HAVE_FFI)
2728
2729 /* (SYSTEM::FOREIGN-ENCODING) */
2730 LISPFUNNR(foreign_encoding,0) {
2731 VALUES1(O(foreign_encoding));
2732 }
2733
2734 /* (SYSTEM::SET-FOREIGN-ENCODING encoding) */
2735 LISPFUNN(set_foreign_encoding,1) {
2736 var object encoding = check_encoding(popSTACK(),&O(foreign_encoding),false);
2737 O(foreign_encoding) = encoding;
2738 O(foreign_8bit_encoding) =
2739 (TheEncoding(O(foreign_encoding))->max_bytes_per_char == 1
2740 ? O(foreign_encoding)
2741 : Symbol_value(S(ascii)));
2742 VALUES1(encoding);
2743 }
2744
2745 #endif /* HAVE_FFI */
2746
2747 /* (SYSTEM::MISC-ENCODING) */
2748 LISPFUNNR(misc_encoding,0) {
2749 VALUES1(O(misc_encoding));
2750 }
2751
2752 /* (SYSTEM::SET-MISC-ENCODING encoding) */
2753 LISPFUNN(set_misc_encoding,1) {
2754 var object encoding = check_encoding(popSTACK(),&O(misc_encoding),false);
2755 VALUES1(O(misc_encoding) = encoding);
2756 }
2757
2758 #endif /* ENABLE_UNICODE */
2759
2760 /* =========================================================================
2761 * More functions */
2762
2763 /* (CONVERT-STRING-FROM-BYTES byte-array encoding [:start] [:end]) */
2764 LISPFUN(convert_string_from_bytes,seclass_read,2,0,norest,key,2,
2765 (kw(start),kw(end)) ) {
2766 /* Stack layout: array, encoding, start, end. */
2767 STACK_3 = check_vector(STACK_3); /* check array */
2768 STACK_2 = check_encoding(STACK_2,DEFAULT_ENC,false);
2769 var object array = STACK_3;
2770 STACK_3 = STACK_2; /* encoding */
2771 STACK_2 = array; /* array */
2772 /* Stack layout: encoding, array, start, end. */
2773 if (!boundp(STACK_1))
2774 STACK_1 = Fixnum_0; /* check start */
2775 if (missingp(STACK_0)) /* check end */
2776 STACK_0 = fixnum(vector_length(array));
2777 /* Convert array to a vector with element type (UNSIGNED-BYTE 8): */
2778 if (!bit_vector_p(Atype_8Bit,array)) {
2779 /* (SYS::COERCED-SUBSEQ array '(ARRAY (UNSIGNED-BYTE 8) (*))
2780 [:start] [:end]) */
2781 pushSTACK(array); pushSTACK(O(type_uint8_vector));
2782 pushSTACK(S(Kstart)); pushSTACK(STACK_(1+3));
2783 pushSTACK(S(Kend)); pushSTACK(STACK_(0+5));
2784 funcall(L(coerced_subseq),6);
2785 if (!bit_vector_p(Atype_8Bit,value1)) { NOTREACHED; }
2786 STACK_2 = value1;
2787 STACK_0 = I_I_minus_I(STACK_0,STACK_1); /* end := (- end start) */
2788 STACK_1 = Fixnum_0; /* start := 0 */
2789 array = STACK_2;
2790 }
2791 /* Determine size of result string: */
2792 var stringarg sa;
2793 sa.offset = 0; sa.len = vector_length(array);
2794 sa.string = array_displace_check(array,sa.len,&sa.offset);
2795 test_vector_limits(&sa);
2796 STACK_0 = array = sa.string;
2797 /* stack layout: encoding, array */
2798 var uintL start = sa.offset + sa.index;
2799 var uintL end = start + sa.len;
2800 #ifdef ENABLE_UNICODE
2801 var uintL clen =
2802 Encoding_mblen(STACK_1)(STACK_1,&TheSbvector(array)->data[start],
2803 &TheSbvector(array)->data[end]);
2804 #else
2805 var uintL clen = end-start;
2806 #endif
2807 /* Allocate and fill the result string: */
2808 check_stringsize(clen);
2809 var object string = allocate_string(clen);
2810 if (clen > 0) {
2811 array = STACK_0;
2812 var chart* cptr = &TheSnstring(string)->data[0];
2813 var const uintB* bptr = &TheSbvector(array)->data[start];
2814 #ifdef ENABLE_UNICODE
2815 var const uintB* bendptr = &TheSbvector(array)->data[end];
2816 var chart* cendptr = cptr+clen;
2817 Encoding_mbstowcs(STACK_1)(STACK_1,nullobj,&bptr,bendptr,&cptr,cendptr);
2818 ASSERT(cptr == cendptr);
2819 if ((bptr != bendptr) /* some bytes were unused! */
2820 && eq(TheEncoding(STACK_1)->enc_towcs_error,S(Kerror)))
2821 error_incomplete(STACK_1);
2822 #else
2823 dotimespL(clen,clen, { *cptr++ = as_chart(*bptr++); } );
2824 #endif
2825 } else if (end != start) /* string is empty, but the vector is not! */
2826 error_incomplete(STACK_1);
2827 VALUES1(string); skipSTACK(2);
2828 }
2829
2830 /* (CONVERT-STRING-TO-BYTES string encoding [:start] [:end]) */
2831 LISPFUN(convert_string_to_bytes,seclass_read,2,0,norest,key,2,
2832 (kw(start),kw(end)) ) {
2833 /* Stack layout: string, encoding, start, end. */
2834 STACK_2 = check_encoding(STACK_2,DEFAULT_ENC,false);
2835 var object string = STACK_3;
2836 STACK_3 = STACK_2; /* encoding */
2837 STACK_2 = string; /* string */
2838 /* Stack layout: encoding, string, start, end. */
2839 var stringarg sa;
2840 test_string_limits_ro(&sa); /* check string */
2841 string = sa.string;
2842 pushSTACK(string);
2843 /* Stack layout: encoding, string */
2844 var const chart* srcptr;
2845 unpack_sstring_alloca(string,sa.len,sa.offset+sa.index, srcptr=);
2846 var uintL blen = cslen(STACK_1,srcptr,sa.len);
2847 /* Allocate and fill the result vector: */
2848 var object array = allocate_bit_vector(Atype_8Bit,blen);
2849 if (blen > 0) {
2850 string = STACK_0;
2851 unpack_sstring_alloca(string,sa.len,sa.offset+sa.index, srcptr=);
2852 cstombs(STACK_1,srcptr,sa.len,&TheSbvector(array)->data[0],blen);
2853 }
2854 VALUES1(array); skipSTACK(2);
2855 }
2856