1 /*
2  * Encodings (character sets and conversions) for CLISP
3  * Bruno Haible 1998-2008, 2017-2018
4  * Sam Steingold 1998-2009, 2011, 2017
5  */
6 
7 #include "lispbibl.c"
8 
9 #ifdef ENABLE_UNICODE
10 #include <localcharset.h>       /* from gnulib */
11 #endif
12 
13 /* =========================================================================
14  * Individual encodings */
15 
16 #ifdef ENABLE_UNICODE
17 
18 /* NOTE 1! The mblen function has to be consistent with the mbstowcs function
19  (when called with stream = nullobj).
20  The wcslen function has to be consistent with the wcstombs function (when
21  called with stream = nullobj). */
22 
23 /* NOTE 2! The conversion from bytes to characters (mbstowcs function) is
24  subject to the following restriction: At most one byte lookahead is needed.
25  This means, when someone calls mbstowcs for converting one character, and
26  he tries it with 1 byte, then with one more byte, then with one more byte,
27  and so on: when the conversion succeeds for the first time, it will leave at
28  most one byte in the buffer. stream.d (rd_ch_buffered, rd_ch_array_buffered)
29  heavily depend on this. */
30 
31 /* --------------------------------------------------------------------------
32  * base64 http://rfc.net/rfc2045.html */
33 
34 global uintL base64_mblen (object encoding, const uintB* src,
35                            const uintB* srcend);
36 global void base64_mbstowcs (object encoding, object stream,
37                              const uintB* *srcp, const uintB* srcend,
38                              chart* *destp, chart* destend);
39 global uintL base64_wcslen (object encoding, const chart* src,
40                             const chart* srcend);
41 global void base64_wcstombs (object encoding, object stream,
42                              const chart* *srcp, const chart* srcend,
43                              uintB* *destp, uintB* destend);
44 global object base64_range (object encoding, uintL start, uintL end,
45                             uintL maxintervals);
46 
47 static const char base64_table[64] = {
48   'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P',
49   'Q','R','S','T','U','V','W','X','Y','Z','a','b','c','d','e','f',
50   'g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v',
51   'w','x','y','z','0','1','2','3','4','5','6','7','8','9','+','/'
52 };
53 static const signed char table_base64[128] = {
54   -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,      /*  -1-  9 */
55   -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,      /*  10- 19 */
56   -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,      /*  20- 29 */
57   -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,      /*  30- 39 */
58   -1,  -1,  -1,  62,  -1,  -1,  -1,  63,  52,  53,      /*  40- 49 */
59   54,  55,  56,  57,  58,  59,  60,  61,  -1,  -1,      /*  50- 59 */
60   -1,  -1,  -1,  -1,  -1,   0,   1,   2,   3,   4,      /*  60- 69 */
61    5,   6,   7,   8,   9,  10,  11,  12,  13,  14,      /*  70- 79 */
62   15,  16,  17,  18,  19,  20,  21,  22,  23,  24,      /*  80- 89 */
63   25,  -1,  -1,  -1,  -1,  -1,  -1,  26,  27,  28,      /*  90- 99 */
64   29,  30,  31,  32,  33,  34,  35,  36,  37,  38,      /* 100-109 */
65   39,  40,  41,  42,  43,  44,  45,  46,  47,  48,      /* 110-119 */
66   49,  50,  51,  -1,  -1,  -1,  -1,  -1                 /* 120-127 */
67 };
68 /* alternatively:
69     var int pos = 0;
70     while (pos < sizeof(table_base64)) table_base64[pos++] = -1;
71     for (pos = 0; pos < sizeof(base64_table); pos++)
72       table_base64[base64_table[pos]] = pos;
73 */
74 
75 #define MIME_LINE_LENGTH  76
76 
77 typedef enum { le_unix, le_mac, le_dos } line_end_t;
enc_eol_to_le(object enc_eol)78 local line_end_t enc_eol_to_le (object enc_eol) {
79   if (eq(enc_eol,S(Kunix))) return le_unix;
80   if (eq(enc_eol,S(Kdos))) return le_dos;
81   if (eq(enc_eol,S(Kmac))) return le_mac;
82   NOTREACHED;
83 }
84 
base64_to_chars(line_end_t le,const uintB * src,const uintB * srcend,chart * dest)85 local uintL base64_to_chars (line_end_t le, const uintB* src,
86                              const uintB* srcend, chart *dest) {
87   var uintL pos = 0;
88   var uintL counter = 0;
89   var uintL num_chars = 0;
90   while (src < srcend) {
91     int c = *src++;
92     if (counter < MIME_LINE_LENGTH/4) counter++;
93     else { /* Wrap line every 76 characters. */
94       counter = 1;
95       switch (le) {
96         case le_unix:
97           if (dest) *dest++ = ascii(LF); num_chars++;
98           break;
99         case le_dos:
100           if (dest) { *dest++ = ascii(CR); *dest++ = ascii(LF); }
101           num_chars += 2;
102           break;
103         case le_mac:
104           if (dest) *dest++ = ascii(CR); num_chars++;
105           break;
106       }
107     }
108     /* Process first byte of a triplet.  */
109     if (dest) *dest++ = ascii(base64_table[0x3f & c >> 2]); num_chars++;
110     var int value = (0x03 & c) << 4;
111      /* Process second byte of a triplet.  */
112     if (src == srcend) {
113       if (dest) {
114         *dest++ = ascii(base64_table[value]);
115         *dest++ = ascii('=');
116         *dest++ = ascii('=');
117       }
118       num_chars += 3;
119       break;
120     }
121     c = *src++;
122     if (dest) *dest++ = ascii(base64_table[value | (0x0f & c >> 4)]);
123     num_chars++;
124     value = (0x0f & c) << 2;
125     /* Process third byte of a triplet.  */
126     if (src == srcend) {
127       if (dest) {
128         *dest++ = ascii(base64_table[value]);
129         *dest++ = ascii('=');
130       }
131       num_chars += 2;
132       break;
133     }
134     c = *src++;
135     if (dest) {
136       *dest++ = ascii(base64_table[value | (0x03 & c >> 6)]);
137       *dest++ = ascii(base64_table[0x3f & c]);
138     }
139     num_chars += 2;
140   }
141   return num_chars;
142 }
143 
base64_mblen(object encoding,const uintB * src,const uintB * srcend)144 global uintL base64_mblen (object encoding, const uintB* src,
145                            const uintB* srcend) {
146   return base64_to_chars(enc_eol_to_le(TheEncoding(encoding)->enc_eol),
147                          src,srcend,NULL);
148 }
149 
150 /* see emacs/src/fns.c */
base64_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)151 global void base64_mbstowcs (object encoding, object stream,
152                              const uintB* *srcp, const uintB* srcend,
153                              chart* *destp, chart* destend) {
154   unused(stream); unused(destend);
155   *destp += base64_to_chars(enc_eol_to_le(TheEncoding(encoding)->enc_eol),
156                             *srcp,srcend,*destp);
157   *srcp = srcend;
158 }
159 
160 #define BASE64_P(c) (c<sizeof(table_base64) && table_base64[c]!=-1)
161 #define BASE64_IGNORABLE_P(ch)                                          \
162   (chareq(ch,ascii(' ')) || chareq(ch,ascii('\t')) || chareq(ch,ascii('\n')) \
163    || chareq(ch,ascii('\f')) || chareq(ch,ascii('\r')))
164 
165 /* see emacs/src/fns.c */
166 #define READ_QUADRUPLET_BYTE(endform) do {                      \
167   if (src == srcend) { endform; }                               \
168   ch = *src++;                                                  \
169  } while (BASE64_IGNORABLE_P(ch));                              \
170   c = as_cint(ch)
171 
172 /* convert ascii src to bytes dest - when destp is given
173  return the number of bytes
174  the final bad characted position is retutned in error_p */
base64_to_bytes(const chart * src,const chart * srcend,uintB * destp,const chart ** error_p)175 local uintL base64_to_bytes (const chart *src, const chart* srcend,
176                              uintB* destp, const chart* *error_p) {
177   var unsigned char c;
178   var chart ch;
179   var unsigned long value;
180   var uintB *dest = destp;
181   var uintL num_bytes = 0;
182 
183   while (1) {
184     /* Process first byte of a quadruplet. */
185     READ_QUADRUPLET_BYTE(return num_bytes);
186     if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
187     value = table_base64[c] << 18;
188 
189     /* Process second byte of a quadruplet.  */
190     READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
191     if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
192     value |= table_base64[c] << 12;
193 
194     if (dest) *dest++ = (unsigned char) (value >> 16);
195     num_bytes++;
196 
197     /* Process third byte of a quadruplet.  */
198     READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
199     if (c == '=') {
200       READ_QUADRUPLET_BYTE(*error_p = src; return num_bytes);
201       if (c != '=') { *error_p = src-1; return num_bytes; }
202       continue;
203     }
204 
205     if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
206     value |= table_base64[c] << 6;
207 
208     if (dest) *dest++ = (unsigned char) (0xff & value >> 8);
209     num_bytes++;
210 
211     /* Process fourth byte of a quadruplet.  */
212     READ_QUADRUPLET_BYTE(*error_p = src-1; return num_bytes);
213     if (c == '=')
214       continue;
215     if (!BASE64_P(c)) { *error_p = src-1; return num_bytes; }
216     value |= table_base64[c];
217 
218     if (dest) *dest++ = (unsigned char) (0xff & value);
219     num_bytes++;
220   }
221 }
222 
base64_wcslen(object encoding,const chart * src,const chart * srcend)223 global uintL base64_wcslen (object encoding, const chart* src,
224                             const chart* srcend) {
225   unused(encoding);
226   var const chart *error_p = NULL;
227   return base64_to_bytes(src,srcend,NULL,&error_p)
228     + (error_p ? 1 : 0);        /* space for errors */
229 }
230 
base64_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)231 global void base64_wcstombs (object encoding, object stream,
232                              const chart* *srcp, const chart* srcend,
233                              uintB* *destp, uintB* destend) {
234   unused(encoding); unused(stream); unused(destend);
235   var const chart *error_p = NULL;
236   *destp += base64_to_bytes(*srcp,srcend,*destp,&error_p);
237   if (error_p) {
238     if (error_p == srcend) {
239       pushSTACK(NIL);   /* <end-of-file?> slot DATUM */
240       pushSTACK(S(base64));     /* slot EXPECTED-TYPE */
241       pushSTACK(fixnum(error_p-*srcp));
242       pushSTACK(TheSubr(subr_self)->name);
243       error(charset_type_error,GETTEXT("~S: Invalid base64 encoding termination at position ~S"));
244     } else {
245       pushSTACK(code_char(*error_p)); /* slot DATUM */
246       pushSTACK(S(base64));     /* slot EXPECTED-TYPE */
247       pushSTACK(fixnum(srcend-*srcp));
248       pushSTACK(fixnum((error_p-*srcp)+1));
249       pushSTACK(code_char(*error_p));
250       pushSTACK(TheSubr(subr_self)->name);
251       error(charset_type_error,GETTEXT("~S: Invalid base64 encoding at ~S (character ~S of ~S)"));
252     }
253   }
254   *srcp = srcend;
255 }
256 
base64_range(object encoding,uintL start,uintL end,uintL maxintervals)257 global object base64_range (object encoding, uintL start, uintL end,
258                             uintL maxintervals) {
259   unused(encoding);
260   var uintL count = 0; /* number of intervals already on the STACK */
261   if (end >= sizeof(table_base64)) end = sizeof(table_base64) - 1;
262   for (;start <= end && count < maxintervals; count++) {
263     while ((start <= end) && (table_base64[start] == -1)) start++;
264     if (start > end) break;
265     pushSTACK(code_char(as_chart(start)));
266     while ((start <= end) && (table_base64[start] != -1)) start++;
267     pushSTACK(code_char(as_chart(start-1)));
268   }
269   return stringof(count << 1);
270 }
271 
272 local char const hex_table[] = "0123456789ABCDEF";
273 
274 /* Error, when a character cannot be converted to an encoding.
275  error_unencodable(encoding,ch); */
error_unencodable(object encoding,chart ch)276 global _Noreturn void error_unencodable (object encoding, chart ch) {
277   pushSTACK(code_char(ch)); /* CHARSET-TYPE-ERROR slot DATUM */
278   pushSTACK(encoding); /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
279   pushSTACK(TheEncoding(encoding)->enc_charset);
280   pushSTACK(ascii_char(hex_table[as_cint(ch)&0x0F]));
281   pushSTACK(ascii_char(hex_table[(as_cint(ch)>>4)&0x0F]));
282   pushSTACK(ascii_char(hex_table[(as_cint(ch)>>8)&0x0F]));
283   pushSTACK(ascii_char(hex_table[(as_cint(ch)>>12)&0x0F]));
284   if (as_cint(ch) < 0x10000) {
285     pushSTACK(TheSubr(subr_self)->name);
286     error(charset_type_error,GETTEXT("~S: Character #\\u~C~C~C~C cannot be represented in the character set ~S"));
287   } else {
288     pushSTACK(ascii_char(hex_table[(as_cint(ch)>>16)&0x0F]));
289     pushSTACK(ascii_char(hex_table[(as_cint(ch)>>20)&0x0F]));
290     pushSTACK(TheSubr(subr_self)->name);
291     error(charset_type_error,GETTEXT("~S: Character #\\u00~C~C~C~C~C~C cannot be represented in the character set ~S"));
292   }
293 }
294 
295 #endif  /* ENABLE_UNICODE */
296 
297 /* used in CONVERT-STRING-FROM-BYTES, so must not depend on ENABLE_UNICODE */
298 /* missing bytes at the end */
error_incomplete(object encoding)299 local _Noreturn void error_incomplete (object encoding) {
300   pushSTACK(NIL);     /* <end-of-file?> CHARSET-TYPE-ERROR slot DATUM */
301 #ifdef ENABLE_UNICODE
302   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
303   pushSTACK(TheEncoding(encoding)->enc_charset);
304 #else
305   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
306   pushSTACK(encoding);       /* no enc_charset slot without ENABLE_UNICODE */
307 #endif
308   pushSTACK(TheSubr(subr_self)->name);
309   error(charset_type_error,GETTEXT("~S: Incomplete byte sequence at end of buffer for ~S"));
310 }
311 
312 #ifdef ENABLE_UNICODE
handle_incomplete(object encoding,chart ** destp,chart * destend)313 local void handle_incomplete (object encoding, chart* *destp, chart* destend) {
314   var object action = TheEncoding(encoding)->enc_towcs_error;
315   if (eq(action,S(Kignore))) {
316   } else if (eq(action,S(Kerror))) {
317     error_incomplete(encoding);
318   } else {
319     if (*destp < destend)
320       *(*destp)++ = char_code(action);
321   }
322 }
323 
324 /* The range function for an encoding covering all of Unicode. */
all_range(object encoding,uintL start,uintL end,uintL maxintervals)325 global object all_range (object encoding, uintL start, uintL end,
326                          uintL maxintervals) {
327   unused(encoding);
328   var uintL count = 0;
329   if (maxintervals > 0) {
330     pushSTACK(code_char(as_chart(start))); pushSTACK(code_char(as_chart(end)));
331     count = 2;
332   }
333   return stringof(count);
334 }
335 
336 /* The range function for an encoding covering the BMP of Unicode. */
bmp_range(object encoding,uintL start,uintL end,uintL maxintervals)337 global object bmp_range (object encoding, uintL start, uintL end,
338                          uintL maxintervals) {
339   unused(encoding);
340   var uintL count = 0;
341   if (maxintervals > 0 && start < 0x10000) {
342     if (end >= 0x10000)
343       end = 0xFFFF;
344     pushSTACK(code_char(as_chart(start))); pushSTACK(code_char(as_chart(end)));
345     count = 2;
346   }
347   return stringof(count);
348 }
349 
350 /* --------------------------------------------------------------------------
351  * Unicode-16 encoding */
352 
353 /* Unicode-16 encoding in two flavours:
354  The big-endian format (files starting with 0xFE 0xFF),
355  the little-endian format (files starting with 0xFF 0xFE). */
356 
357 /* min. bytes per character = 2
358    max. bytes per character = 2 */
359 
360 global uintL uni16_mblen (object encoding, const uintB* src,
361                           const uintB* srcend);
362 global void uni16be_mbstowcs (object encoding, object stream,
363                               const uintB* *srcp, const uintB* srcend,
364                               chart* *destp, chart* destend);
365 global void uni16le_mbstowcs (object encoding, object stream,
366                               const uintB* *srcp, const uintB* srcend,
367                               chart* *destp, chart* destend);
368 global uintL uni16_wcslen (object encoding, const chart* src,
369                            const chart* srcend);
370 global void uni16be_wcstombs (object encoding, object stream,
371                               const chart* *srcp, const chart* srcend,
372                               uintB* *destp, uintB* destend);
373 global void uni16le_wcstombs (object encoding, object stream,
374                               const chart* *srcp, const chart* srcend,
375                               uintB* *destp, uintB* destend);
376 
377 /* Bytes to characters. */
378 
uni16_mblen(object encoding,const uintB * src,const uintB * srcend)379 global uintL uni16_mblen (object encoding, const uintB* src,
380                           const uintB* srcend) {
381   var uintL len = srcend-src;
382   var bool incomplete_p = len & 1; /* odd-p */
383   var uintL count = len >> 1; /* div 2 */
384   if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
385     return count;
386   else
387     return (count > 0 ? count + incomplete_p : 0);
388 }
389 
uni16be_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)390 global void uni16be_mbstowcs (object encoding, object stream,
391                               const uintB* *srcp, const uintB* srcend,
392                               chart* *destp, chart* destend) {
393   unused(stream);
394   var const uintB* src = *srcp;
395   var chart* dest = *destp;
396   var uintL len = srcend-src;
397   var bool incomplete_p = len & 1; /* odd-p */
398   var uintL count = len >> 1; /* div 2 */
399   if (count > destend-dest)
400     count = destend-dest;
401   if (count > 0) {
402     do {
403       *dest++ = as_chart(((cint)src[0] << 8) | (cint)src[1]);
404       src += 2;
405     } while (--count);
406     *srcp = src;
407     *destp = dest;
408     if (incomplete_p) handle_incomplete(encoding,destp,destend);
409   }
410 }
411 
uni16le_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)412 global void uni16le_mbstowcs (object encoding, object stream,
413                               const uintB* *srcp, const uintB* srcend,
414                               chart* *destp, chart* destend) {
415   unused(stream);
416   var const uintB* src = *srcp;
417   var chart* dest = *destp;
418   var uintL len = srcend-src;
419   var bool incomplete_p = len & 1; /* odd-p */
420   var uintL count = len >> 1; /* div 2 */
421   if (count > destend-dest)
422     count = destend-dest;
423   if (count > 0) {
424     do {
425       *dest++ = as_chart((cint)src[0] | ((cint)src[1] << 8));
426       src += 2;
427     } while (--count);
428     *srcp = src;
429     *destp = dest;
430     if (incomplete_p) handle_incomplete(encoding,destp,destend);
431   }
432 }
433 
434 /* Characters to bytes. */
435 
uni16_wcslen(object encoding,const chart * src,const chart * srcend)436 global uintL uni16_wcslen (object encoding, const chart* src,
437                            const chart* srcend) {
438   var uintL count = srcend-src;
439   var uintL result = 0;
440   while (count--) {
441     var chart ch = *src++;
442     if (as_cint(ch) < 0x10000)
443       result += 2;
444     else {
445       var object action = TheEncoding(encoding)->enc_tombs_error;
446       if (eq(action,S(Kignore))) {
447       } else if (uint8_p(action)) {
448         result++;
449       } else if (!eq(action,S(Kerror))) {
450         var chart c = char_code(action);
451         if (as_cint(c) < 0x10000)
452           result += 2;
453       } else
454         error_unencodable(encoding,ch);
455     }
456   }
457   return result;
458 }
459 
uni16be_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)460 global void uni16be_wcstombs (object encoding, object stream,
461                               const chart* *srcp, const chart* srcend,
462                               uintB* *destp, uintB* destend) {
463   unused(stream);
464   var const chart* src = *srcp;
465   var uintB* dest = *destp;
466   var uintL scount = srcend-src;
467   var uintL dcount = destend-dest;
468   if (scount > 0 && dcount > 0) {
469     do {
470       var cint ch = as_cint(*src++); scount--;
471       if (ch < 0x10000) {
472         if (dcount < 2) break;
473         dest[0] = (uintB)(ch>>8); dest[1] = (uintB)ch;
474         dest += 2; dcount -= 2;
475       } else {
476         var object action = TheEncoding(encoding)->enc_tombs_error;
477         if (eq(action,S(Kignore))) {
478         } else if (uint8_p(action)) {
479           *dest++ = I_to_uint8(action); dcount--;
480         } else if (!eq(action,S(Kerror))) {
481           var cint c = char_int(action);
482           if (c < 0x10000) {
483             if (dcount < 2) break;
484             dest[0] = (uintB)(c>>8); dest[1] = (uintB)c;
485             dest += 2; dcount -= 2;
486           } else
487             error_unencodable(encoding,as_chart(ch));
488         } else
489           error_unencodable(encoding,as_chart(ch));
490       }
491     } while (scount > 0 && dcount > 0);
492     *srcp = src;
493     *destp = dest;
494   }
495 }
496 
uni16le_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)497 global void uni16le_wcstombs (object encoding, object stream,
498                               const chart* *srcp, const chart* srcend,
499                               uintB* *destp, uintB* destend) {
500   unused(stream);
501   var const chart* src = *srcp;
502   var uintB* dest = *destp;
503   var uintL scount = srcend-src;
504   var uintL dcount = destend-dest;
505   if (scount > 0 && dcount > 0) {
506     do {
507       var cint ch = as_cint(*src++); scount--;
508       if (ch < 0x10000) {
509         if (dcount < 2) break;
510         dest[0] = (uintB)ch; dest[1] = (uintB)(ch>>8);
511         dest += 2; dcount -= 2;
512       } else {
513         var object action = TheEncoding(encoding)->enc_tombs_error;
514         if (eq(action,S(Kignore))) {
515         } else if (uint8_p(action)) {
516           *dest++ = I_to_uint8(action); dcount--;
517         } else if (!eq(action,S(Kerror))) {
518           var cint c = char_int(action);
519           if (c < 0x10000) {
520             if (dcount < 2) break;
521             dest[0] = (uintB)c; dest[1] = (uintB)(c>>8);
522             dest += 2; dcount -= 2;
523           } else
524             error_unencodable(encoding,as_chart(ch));
525         } else
526           error_unencodable(encoding,as_chart(ch));
527       }
528     } while (scount > 0 && dcount > 0);
529     *srcp = src;
530     *destp = dest;
531   }
532 }
533 
534 /* -------------------------------------------------------------------------
535  * Unicode-32 encoding */
536 
537 /* Unicode-32 encoding in two flavours:
538  The big-endian format,
539  the little-endian format. */
540 
541 /* min. bytes per character = 4
542    max. bytes per character = 4 */
543 
544 global uintL uni32be_mblen (object encoding, const uintB* src,
545                             const uintB* srcend);
546 global uintL uni32le_mblen (object encoding, const uintB* src,
547                             const uintB* srcend);
548 global void uni32be_mbstowcs (object encoding, object stream,
549                               const uintB* *srcp, const uintB* srcend,
550                               chart* *destp, chart* destend);
551 global void uni32le_mbstowcs (object encoding, object stream,
552                               const uintB* *srcp, const uintB* srcend,
553                               chart* *destp, chart* destend);
554 global uintL uni32_wcslen (object encoding, const chart* src,
555                            const chart* srcend);
556 global void uni32be_wcstombs (object encoding, object stream,
557                               const chart* *srcp, const chart* srcend,
558                               uintB* *destp, uintB* destend);
559 global void uni32le_wcstombs (object encoding, object stream,
560                               const chart* *srcp, const chart* srcend,
561                               uintB* *destp, uintB* destend);
562 
563 /* Bytes to characters. */
564 
565 /* Error when an invalid character was encountered.
566  error_uni32_invalid(encoding,code); */
error_uni32_invalid(object encoding,uint32 code)567 local _Noreturn void error_uni32_invalid (object encoding, uint32 code) {
568   pushSTACK(NIL); /* CHARSET-TYPE-ERROR slot datum - filled below */
569   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
570   pushSTACK(TheEncoding(encoding)->enc_charset);
571   STACK_2 = uint32_to_I(code); /* fill it here since it maygc */
572   var uintC count;
573   dotimespC(count,8, {
574     pushSTACK(ascii_char(hex_table[code&0x0F]));
575     code = code>>4;
576   });
577   pushSTACK(TheSubr(subr_self)->name);
578   error(charset_type_error,GETTEXT("~S: Character #x~C~C~C~C~C~C~C~C in ~S conversion, not an UTF-32 character"));
579 }
580 
uni32be_mblen(object encoding,const uintB * src,const uintB * srcend)581 global uintL uni32be_mblen (object encoding, const uintB* src,
582                             const uintB* srcend) {
583   var uintL len = srcend-src;
584   var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
585   var uintL count = len >> 2; /* div 4 */
586   if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore))) {
587     var uintL result = 0;
588     dotimesL(count,count, {
589       var uint32 ch =
590         ((uint32)src[0] << 24) | ((uint32)src[1] << 16)
591         | ((uint32)src[2] << 8) | (uint32)src[3];
592       if (ch < char_code_limit)
593         result++;
594       src += 4;
595     });
596     return result;
597   } else {
598     return (count > 0 ? count + incomplete_p : 0);
599   }
600 }
601 
uni32le_mblen(object encoding,const uintB * src,const uintB * srcend)602 global uintL uni32le_mblen (object encoding, const uintB* src,
603                             const uintB* srcend) {
604   var uintL len = srcend-src;
605   var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
606   var uintL count = len >> 2; /* div 4 */
607   if (eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore))) {
608     var uintL result = 0;
609     dotimesL(count,count, {
610       var uint32 ch =
611         (uint32)src[0] | ((uint32)src[1] << 8)
612         | ((uint32)src[2] << 16) | ((uint32)src[3] << 24);
613       if (ch < char_code_limit)
614         result++;
615       src += 4;
616     });
617     return result;
618   } else {
619     return (count > 0 ? count + incomplete_p : 0);
620   }
621 }
622 
uni32be_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)623 global void uni32be_mbstowcs (object encoding, object stream,
624                               const uintB* *srcp, const uintB* srcend,
625                               chart* *destp, chart* destend) {
626   unused(stream);
627   var const uintB* src = *srcp;
628   var chart* dest = *destp;
629   var uintL len = srcend-src;
630   var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
631   var uintL scount = len >> 2; /* div 4 */
632   var uintL dcount = destend-dest;
633   if (scount > 0 && dcount > 0) {
634     do {
635       var uint32 ch =
636         ((uint32)src[0] << 24) | ((uint32)src[1] << 16)
637         | ((uint32)src[2] << 8) | (uint32)src[3];
638       if (ch < char_code_limit) {
639         *dest++ = as_chart(ch); dcount--;
640       } else {
641         var object action = TheEncoding(encoding)->enc_towcs_error;
642         if (eq(action,S(Kignore))) {
643         } else if (eq(action,S(Kerror))) {
644           error_uni32_invalid(encoding,ch);
645         } else {
646           *dest++ = char_code(action); dcount--;
647         }
648       }
649       src += 4; scount--;
650     } while (scount > 0 && dcount > 0);
651     *srcp = src;
652     *destp = dest;
653     if (incomplete_p) handle_incomplete(encoding,destp,destend);
654   }
655 }
656 
uni32le_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)657 global void uni32le_mbstowcs (object encoding, object stream,
658                               const uintB* *srcp, const uintB* srcend,
659                               chart* *destp, chart* destend) {
660   unused(stream);
661   var const uintB* src = *srcp;
662   var chart* dest = *destp;
663   var uintL len = srcend-src;
664   var bool incomplete_p = ((len & 3) != 0); /* mod 4 */
665   var uintL scount = len >> 2; /* div 4 */
666   var uintL dcount = destend-dest;
667   if (scount > 0 && dcount > 0) {
668     do {
669       var uint32 ch =
670         (uint32)src[0] | ((uint32)src[1] << 8)
671         | ((uint32)src[2] << 16) | ((uint32)src[3] << 24);
672       if (ch < char_code_limit) {
673         *dest++ = as_chart(ch); dcount--;
674       } else {
675         var object action = TheEncoding(encoding)->enc_towcs_error;
676         if (eq(action,S(Kignore))) {
677         } else if (eq(action,S(Kerror))) {
678           error_uni32_invalid(encoding,ch);
679         } else {
680           *dest++ = char_code(action); dcount--;
681         }
682       }
683       src += 4; scount--;
684     } while (scount > 0 && dcount > 0);
685     *srcp = src;
686     *destp = dest;
687     if (incomplete_p) handle_incomplete(encoding,destp,destend);
688   }
689 }
690 
691 /* Characters to bytes. */
692 
uni32_wcslen(object encoding,const chart * src,const chart * srcend)693 global uintL uni32_wcslen (object encoding, const chart* src,
694                            const chart* srcend) {
695   unused(encoding);
696   return (srcend-src)*4;
697 }
698 
uni32be_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)699 global void uni32be_wcstombs (object encoding, object stream,
700                               const chart* *srcp, const chart* srcend,
701                               uintB* *destp, uintB* destend) {
702   unused(encoding); unused(stream);
703   var const chart* src = *srcp;
704   var uintB* dest = *destp;
705   var uintL count = floor(destend-dest,4);
706   if (count > srcend-src)
707     count = srcend-src;
708   if (count > 0) {
709     dotimespL(count,count, {
710       var cint ch = as_cint(*src++);
711       dest[0] = 0; dest[1] = (uintB)(ch>>16);
712       dest[2] = (uintB)(ch>>8); dest[3] = (uintB)ch;
713       dest += 4;
714     });
715     *srcp = src;
716     *destp = dest;
717   }
718 }
719 
uni32le_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)720 global void uni32le_wcstombs (object encoding, object stream,
721                               const chart* *srcp, const chart* srcend,
722                               uintB* *destp, uintB* destend) {
723   unused(encoding); unused(stream);
724   var const chart* src = *srcp;
725   var uintB* dest = *destp;
726   var uintL count = floor(destend-dest,4);
727   if (count > srcend-src)
728     count = srcend-src;
729   if (count > 0) {
730     dotimespL(count,count, {
731       var cint ch = as_cint(*src++);
732       dest[0] = (uintB)ch; dest[1] = (uintB)(ch>>8);
733       dest[2] = (uintB)(ch>>16); dest[3] = 0;
734       dest += 4;
735     });
736     *srcp = src;
737     *destp = dest;
738   }
739 }
740 
741 /* -------------------------------------------------------------------------
742  * UTF-8 encoding */
743 
744 /* See http://www.stonehand.com/unicode/standard/fss-utf.html
745  or  Linux 2.0.x, file linux/fs/nls.c
746                    cmask  cval  shift     maxval           bits
747   1 byte sequence   0x80  0x00   0*6         0x7F  0XXXXXXX
748   2 byte sequence   0xE0  0xC0   1*6        0x7FF  110XXXXX 10XXXXXX
749   3 byte sequence   0xF0  0xE0   2*6       0xFFFF  1110XXXX 10XXXXXX 10XXXXXX
750   4 byte sequence   0xF8  0xF0   3*6     0x1FFFFF  11110XXX 10XXXXXX 10XXXXXX 10XXXXXX
751   5 byte sequence   0xFC  0xF8   4*6    0x3FFFFFF  111110XX 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX
752   6 byte sequence   0xFE  0xFC   5*6   0x7FFFFFFF  1111110X 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX 10XXXXXX
753 
754  We support only 21-bit Unicode characters, i.e. those which can be encoded
755  with at most 4 bytes. Characters outside this range give an error.
756  Spurious bytes of the form 10XXXXXX are ignored.
757  (This resync feature is one of the benefits of the UTF encoding.) */
758 
759 /* min. bytes per character = 1
760    max. bytes per character = 4 */
761 
762 global uintL utf8_mblen (object encoding, const uintB* src,
763                          const uintB* srcend);
764 global void utf8_mbstowcs (object encoding, object stream, const uintB* *srcp,
765                            const uintB* srcend, chart* *destp, chart* destend);
766 global uintL utf8_wcslen (object encoding, const chart* src,
767                           const chart* srcend);
768 global void utf8_wcstombs (object encoding, object stream, const chart* *srcp,
769                            const chart* srcend, uintB* *destp, uintB* destend);
770 
771 /* Bytes to characters. */
772 
773 /* Error when an invalid 1-byte sequence was encountered.
774  error_utf8_invalid1(encoding,b1); */
error_utf8_invalid1(object encoding,uintB b1)775 local _Noreturn void error_utf8_invalid1 (object encoding, uintB b1) {
776   pushSTACK(fixnum(b1));        /* CHARSET-TYPE-ERROR slot DATUM */
777   pushSTACK(encoding);          /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
778   pushSTACK(TheEncoding(encoding)->enc_charset);
779   pushSTACK(ascii_char(hex_table[b1&0x0F]));
780   pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
781   pushSTACK(TheSubr(subr_self)->name);
782   error(charset_type_error,GETTEXT("~S: Invalid byte #x~C~C in ~S conversion, not a Unicode-16"));
783 }
784 
785 /* Error when an invalid 2-byte sequence was encountered.
786  error_utf8_invalid2(encoding,b1,b2); */
error_utf8_invalid2(object encoding,uintB b1,uintB b2)787 local _Noreturn void error_utf8_invalid2 (object encoding, uintB b1, uintB b2) {
788   pushSTACK(NIL);            /* CHARSET-TYPE-ERROR slot DATUM */
789   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
790   STACK_1 = allocate_bit_vector(Atype_8Bit,2);
791   TheSbvector(STACK_1)->data[0] = b1;
792   TheSbvector(STACK_1)->data[1] = b2;
793   pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
794   pushSTACK(ascii_char(hex_table[b2&0x0F]));
795   pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
796   pushSTACK(ascii_char(hex_table[b1&0x0F]));
797   pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
798   pushSTACK(TheSubr(subr_self)->name);
799   error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C in ~S conversion"));
800 }
801 
802 /* Error when an invalid 3-byte sequence was encountered.
803  error_utf8_invalid3(encoding,b1,b2,b3); */
error_utf8_invalid3(object encoding,uintB b1,uintB b2,uintB b3)804 local _Noreturn void error_utf8_invalid3 (object encoding, uintB b1, uintB b2, uintB b3) {
805   pushSTACK(NIL);            /* CHARSET-TYPE-ERROR slot DATUM */
806   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
807   STACK_1 = allocate_bit_vector(Atype_8Bit,3);
808   TheSbvector(STACK_1)->data[0] = b1;
809   TheSbvector(STACK_1)->data[1] = b2;
810   TheSbvector(STACK_1)->data[2] = b3;
811   pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
812   pushSTACK(ascii_char(hex_table[b3&0x0F]));
813   pushSTACK(ascii_char(hex_table[(b3>>4)&0x0F]));
814   pushSTACK(ascii_char(hex_table[b2&0x0F]));
815   pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
816   pushSTACK(ascii_char(hex_table[b1&0x0F]));
817   pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
818   pushSTACK(TheSubr(subr_self)->name);
819   error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C #x~C~C in ~S conversion"));
820 }
821 
822 /* Error when an invalid 4-byte sequence was encountered.
823  error_utf8_invalid4(encoding,b1,b2,b3,b4); */
error_utf8_invalid4(object encoding,uintB b1,uintB b2,uintB b3,uintB b4)824 local _Noreturn void error_utf8_invalid4 (object encoding, uintB b1, uintB b2, uintB b3, uintB b4)
825 {
826   pushSTACK(NIL);            /* CHARSET-TYPE-ERROR slot DATUM */
827   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
828   STACK_1 = allocate_bit_vector(Atype_8Bit,4);
829   TheSbvector(STACK_1)->data[0] = b1;
830   TheSbvector(STACK_1)->data[1] = b2;
831   TheSbvector(STACK_1)->data[2] = b3;
832   TheSbvector(STACK_1)->data[3] = b4;
833   pushSTACK(TheEncoding(STACK_0/*encoding*/)->enc_charset);
834   pushSTACK(ascii_char(hex_table[b4&0x0F]));
835   pushSTACK(ascii_char(hex_table[(b4>>4)&0x0F]));
836   pushSTACK(ascii_char(hex_table[b3&0x0F]));
837   pushSTACK(ascii_char(hex_table[(b3>>4)&0x0F]));
838   pushSTACK(ascii_char(hex_table[b2&0x0F]));
839   pushSTACK(ascii_char(hex_table[(b2>>4)&0x0F]));
840   pushSTACK(ascii_char(hex_table[b1&0x0F]));
841   pushSTACK(ascii_char(hex_table[(b1>>4)&0x0F]));
842   pushSTACK(TheSubr(subr_self)->name);
843   error(charset_type_error,GETTEXT("~S: Invalid byte sequence #x~C~C #x~C~C #x~C~C #x~C~C in ~S conversion"));
844 }
845 
utf8_mblen(object encoding,const uintB * src,const uintB * srcend)846 global uintL utf8_mblen (object encoding, const uintB* src,
847                          const uintB* srcend) {
848   var uintL count = 0;
849   while (src < srcend) {
850     var uintB c = src[0];
851     if (c < 0x80) { /* 1 byte sequence */
852       src += 1;
853       count++;
854       continue;
855     }
856     if (c < 0xC0) {
857       src++; continue; /* skip spurious 10XXXXXX byte */
858     }
859     if (c < 0xE0) { /* 2 byte sequence */
860       if (src+2 > srcend) break;
861       if (((src[1] ^ 0x80) < 0x40)
862           && (c >= 0xC2)) {
863         src += 2;
864         count++;
865         continue;
866       }
867       {
868         var object action = TheEncoding(encoding)->enc_towcs_error;
869         if (eq(action,S(Kignore))) {
870           src += 2; continue;
871         } else if (eq(action,S(Kerror))) {
872           error_utf8_invalid2(encoding,c,src[1]);
873         } else {
874           src += 2; count++; continue;
875         }
876       }
877     }
878     if (c < 0xF0) { /* 3 byte sequence */
879       if (src+3 > srcend) break;
880       if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
881           && (c >= 0xE1 || src[1] >= 0xA0)
882           && (c != 0xED || src[1] < 0xA0)) {
883         src += 3;
884         count++;
885         continue;
886       }
887       {
888         var object action = TheEncoding(encoding)->enc_towcs_error;
889         if (eq(action,S(Kignore))) {
890           src += 3; continue;
891         } else if (eq(action,S(Kerror))) {
892           error_utf8_invalid3(encoding,c,src[1],src[2]);
893         } else {
894           src += 3; count++; continue;
895         }
896       }
897     }
898     if (c < 0xF8) { /* 4 byte sequence */
899       if (src+4 > srcend) break;
900       if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
901           && ((src[3] ^ 0x80) < 0x40)
902           && (c >= 0xF1 || src[1] >= 0x90)) {
903         src += 4;
904         count++;
905         continue;
906       }
907       {
908         var object action = TheEncoding(encoding)->enc_towcs_error;
909         if (eq(action,S(Kignore))) {
910           src += 4; continue;
911         } else if (eq(action,S(Kerror))) {
912           error_utf8_invalid4(encoding,c,src[1],src[2],src[3]);
913         } else {
914           src += 4; count++; continue;
915         }
916       }
917     }
918     {
919       var object action = TheEncoding(encoding)->enc_towcs_error;
920       if (eq(action,S(Kignore))) {
921         src += 1; continue;
922       } else if (eq(action,S(Kerror))) {
923         error_utf8_invalid1(encoding,c);
924       } else {
925         src += 1; count++; continue;
926       }
927     }
928   }
929   return count;
930 }
931 
utf8_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)932 global void utf8_mbstowcs (object encoding, object stream, const uintB* *srcp,
933                            const uintB* srcend, chart* *destp,
934                            chart* destend) {
935   unused(stream);
936   var const uintB* src = *srcp;
937   var chart* dest = *destp;
938   while (src < srcend) {
939     var uintB c = src[0];
940     if (c < 0x80) { /* 1 byte sequence */
941       if (dest == destend) break;
942       *dest++ = as_chart((cint)c);
943       src += 1;
944       continue;
945     }
946     if (c < 0xC0) {
947       src++; continue; /* skip spurious 10XXXXXX byte */
948     }
949     if (dest == destend) break;
950     if (c < 0xE0) { /* 2 byte sequence */
951       if (src+2 > srcend) break;
952       if (((src[1] ^ 0x80) < 0x40)
953           && (c >= 0xC2)) {
954         *dest++ = as_chart(((cint)(c & 0x1F) << 6) | (cint)(src[1] ^ 0x80));
955         src += 2;
956         continue;
957       }
958       {
959         var object action = TheEncoding(encoding)->enc_towcs_error;
960         if (eq(action,S(Kignore))) {
961           src += 2; continue;
962         } else if (eq(action,S(Kerror))) {
963           error_utf8_invalid2(encoding,c,src[1]);
964         } else {
965           src += 2; *dest++ = char_code(action); continue;
966         }
967       }
968     }
969     if (c < 0xF0) { /* 3 byte sequence */
970       if (src+3 > srcend) break;
971       if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
972           && (c >= 0xE1 || src[1] >= 0xA0)
973           && (c != 0xED || src[1] < 0xA0)) {
974         *dest++ = as_chart(((cint)(c & 0x0F) << 12)
975                            | ((cint)(src[1] ^ 0x80) << 6)
976                            | (cint)(src[2] ^ 0x80));
977         src += 3;
978         continue;
979       }
980       {
981         var object action = TheEncoding(encoding)->enc_towcs_error;
982         if (eq(action,S(Kignore))) {
983           src += 3; continue;
984         } else if (eq(action,S(Kerror))) {
985           error_utf8_invalid3(encoding,c,src[1],src[2]);
986         } else {
987           src += 3; *dest++ = char_code(action); continue;
988         }
989       }
990     }
991     if (c < 0xF8) { /* 4 byte sequence */
992       if (src+4 > srcend) break;
993       if (((src[1] ^ 0x80) < 0x40) && ((src[2] ^ 0x80) < 0x40)
994           && ((src[3] ^ 0x80) < 0x40)
995           && (c >= 0xF1 || src[1] >= 0x90)) {
996         var cint ch = ((cint)(c & 0x07) << 18)
997                       | ((cint)(src[1] ^ 0x80) << 12)
998                       | ((cint)(src[2] ^ 0x80) << 6)
999                       | (cint)(src[3] ^ 0x80);
1000         if (ch < char_code_limit) {
1001           *dest++ = as_chart(ch);
1002           src += 4;
1003           continue;
1004         }
1005       }
1006       {
1007         var object action = TheEncoding(encoding)->enc_towcs_error;
1008         if (eq(action,S(Kignore))) {
1009           src += 4; continue;
1010         } else if (eq(action,S(Kerror))) {
1011           error_utf8_invalid4(encoding,c,src[1],src[2],src[3]);
1012         } else {
1013           src += 4; *dest++ = char_code(action); continue;
1014         }
1015       }
1016     }
1017     {
1018       var object action = TheEncoding(encoding)->enc_towcs_error;
1019       if (eq(action,S(Kignore))) {
1020         src += 1; continue;
1021       } else if (eq(action,S(Kerror))) {
1022         error_utf8_invalid1(encoding,c);
1023       } else {
1024         src += 1; *dest++ = char_code(action); continue;
1025       }
1026     }
1027   }
1028   *srcp = src;
1029   *destp = dest;
1030 }
1031 
1032 /* Characters to bytes. */
1033 
utf8_wcslen(object encoding,const chart * src,const chart * srcend)1034 global uintL utf8_wcslen (object encoding, const chart* src,
1035                           const chart* srcend) {
1036   unused(encoding);
1037   var uintL destlen = 0;
1038   while (src < srcend) {
1039     var cint ch = as_cint(*src++);
1040     destlen += (ch < 0x80 ? 1 : ch < 0x800 ? 2 : ch < 0x10000 ? 3 : 4);
1041   }
1042   return destlen;
1043 }
1044 
utf8_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1045 global void utf8_wcstombs (object encoding, object stream, const chart* *srcp,
1046                            const chart* srcend, uintB* *destp,
1047                            uintB* destend) {
1048   unused(encoding); unused(stream);
1049   var const chart* src = *srcp;
1050   var uintB* dest = *destp;
1051   while (src < srcend) {
1052     var cint ch = as_cint(*src);
1053     var uintL count = (ch < 0x80 ? 1 : ch < 0x800 ? 2 : 3);
1054     if (dest+count > destend) break;
1055     src++;
1056     if (ch < 0x80) { /* 1 byte sequence */
1057       *dest++ = ch;
1058     } else if (ch < 0x800) { /* 2 byte sequence */
1059       *dest++ = 0xC0 | (ch >> 6);
1060       *dest++ = 0x80 | (ch & 0x3F);
1061     } else if (ch < 0x10000) { /* 3 byte sequence */
1062       *dest++ = 0xE0 | (ch >> 12);
1063       *dest++ = 0x80 | ((ch >> 6) & 0x3F);
1064       *dest++ = 0x80 | (ch & 0x3F);
1065     } else { /* ch < 0x110000, 4 byte sequence */
1066       *dest++ = 0xF0 | (ch >> 18);
1067       *dest++ = 0x80 | ((ch >> 12) & 0x3F);
1068       *dest++ = 0x80 | ((ch >> 6) & 0x3F);
1069       *dest++ = 0x80 | (ch & 0x3F);
1070     }
1071   }
1072   *srcp = src;
1073   *destp = dest;
1074 }
1075 
1076 /* -------------------------------------------------------------------------
1077  * Java encoding */
1078 
1079 /* This is ISO 8859-1 with \uXXXX escape sequences,
1080    denoting Unicode characters.
1081  See the Java Language Specification.
1082  Characters outside the BMP are represented by two consecutive
1083  \uXXXX escape sequences, like UTF-16. Example:
1084    $ printf '\U00102345\n' | native2ascii -encoding UTF-8
1085    \udbc8\udf45
1086 
1087  This is quick&dirty: The text is supposed not to contain \u except as part
1088  of \uXXXX escape sequences. */
1089 
1090 /* min. bytes per character = 1
1091    max. bytes per character = 12 */
1092 
1093 global uintL java_mblen (object encoding, const uintB* src,
1094                          const uintB* srcend);
1095 global void java_mbstowcs (object encoding, object stream, const uintB* *srcp,
1096                            const uintB* srcend, chart* *destp, chart* destend);
1097 global uintL java_wcslen (object encoding, const chart* src,
1098                           const chart* srcend);
1099 global void java_wcstombs (object encoding, object stream, const chart* *srcp,
1100                            const chart* srcend, uintB* *destp, uintB* destend);
1101 
1102 /* Bytes to characters. */
1103 
java_mblen(object encoding,const uintB * src,const uintB * srcend)1104 global uintL java_mblen (object encoding, const uintB* src,
1105                          const uintB* srcend) {
1106   unused(encoding);
1107   var uintL count = 0;
1108   while (src < srcend) {
1109     var uintB c;
1110     var cint ch;
1111     if (src[0] != '\\') {
1112       src += 1;
1113       count++;
1114       continue;
1115     }
1116     if (src+2 > srcend) break;
1117     if (src[1] != 'u') {
1118       src += 1;
1119       count++;
1120       continue;
1121     }
1122     if (src+3 > srcend) break;
1123     c = src[2];
1124     if (c >= '0' && c <= '9') { c -= '0'; }
1125     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1126     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1127     else {
1128       src += 2; /* skip incomplete \u sequence */
1129       continue;
1130     }
1131     ch = (cint)c << 12;
1132     if (src+4 > srcend) break;
1133     c = src[3];
1134     if (c >= '0' && c <= '9') { c -= '0'; }
1135     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1136     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1137     else {
1138       src += 3; /* skip incomplete \u sequence */
1139       continue;
1140     }
1141     ch |= (cint)c << 8;
1142     if (src+5 > srcend) break;
1143     c = src[4];
1144     if (c >= '0' && c <= '9') { c -= '0'; }
1145     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1146     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1147     else {
1148       src += 4; /* skip incomplete \u sequence */
1149       continue;
1150     }
1151     ch |= (cint)c << 4;
1152     if (src+6 > srcend) break;
1153     c = src[5];
1154     if (c >= '0' && c <= '9') { c -= '0'; }
1155     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1156     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1157     else {
1158       src += 5; /* skip incomplete \u sequence */
1159       continue;
1160     }
1161     ch |= (cint)c;
1162     if (ch < char_code_limit && !(ch >= 0xd800 && ch < 0xe000)) {
1163       src += 6; /* complete \u sequence */
1164       count++;
1165       continue;
1166     }
1167     if (!(ch >= 0xd800 && ch < 0xdc00)) {
1168       src += 6; /* skip invalid \u sequence */
1169       continue;
1170     }
1171     var cint ch1 = ch;
1172     if (src+7 > srcend) break;
1173     if (src[6] != '\\') {
1174       src += 6; /* skip incomplete \u sequence */
1175       continue;
1176     }
1177     if (src+8 > srcend) break;
1178     if (src[7] != 'u') {
1179       src += 6; /* skip incomplete \u sequence */
1180       continue;
1181     }
1182     if (src+9 > srcend) break;
1183     c = src[8];
1184     if (c >= '0' && c <= '9') { c -= '0'; }
1185     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1186     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1187     else {
1188       src += 8; /* skip incomplete \u sequence */
1189       continue;
1190     }
1191     ch = (cint)c << 12;
1192     if (src+10 > srcend) break;
1193     c = src[9];
1194     if (c >= '0' && c <= '9') { c -= '0'; }
1195     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1196     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1197     else {
1198       src += 9; /* skip incomplete \u sequence */
1199       continue;
1200     }
1201     ch |= (cint)c << 8;
1202     if (src+11 > srcend) break;
1203     c = src[10];
1204     if (c >= '0' && c <= '9') { c -= '0'; }
1205     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1206     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1207     else {
1208       src += 10; /* skip incomplete \u sequence */
1209       continue;
1210     }
1211     ch |= (cint)c << 4;
1212     if (src+12 > srcend) break;
1213     c = src[11];
1214     if (c >= '0' && c <= '9') { c -= '0'; }
1215     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1216     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1217     else {
1218       src += 11; /* skip incomplete \u sequence */
1219       continue;
1220     }
1221     ch |= (cint)c;
1222     if (ch >= 0xdc00 && ch < 0xe000) {
1223       ch = 0x10000 + ((ch1 - 0xd800) << 10) + (ch - 0xdc00);
1224       if (ch < char_code_limit) {
1225         src += 12; /* complete \u sequence */
1226         count++;
1227         continue;
1228       }
1229     }
1230     src += 6; /* skip invalid \u sequence */
1231     continue;
1232   }
1233   return count;
1234 }
1235 
java_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1236 global void java_mbstowcs (object encoding, object stream, const uintB* *srcp,
1237                            const uintB* srcend, chart* *destp,
1238                            chart* destend) {
1239   unused(encoding); unused(stream);
1240   var const uintB* src = *srcp;
1241   var chart* dest = *destp;
1242   while (src < srcend) {
1243     var uintB c;
1244     var cint ch;
1245     c = src[0];
1246     if (c != '\\') {
1247       if (dest==destend) break;
1248       *dest++ = as_chart((cint)c);
1249       src += 1;
1250       continue;
1251     }
1252     if (src+2 > srcend) break;
1253     if (src[1] != 'u') {
1254       if (dest==destend) break;
1255       *dest++ = as_chart((cint)c);
1256       src += 1;
1257       continue;
1258     }
1259     if (src+3 > srcend) break;
1260     c = src[2];
1261     if (c >= '0' && c <= '9') { c -= '0'; }
1262     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1263     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1264     else {
1265       src += 2; /* skip incomplete \u sequence */
1266       continue;
1267     }
1268     ch = (cint)c << 12;
1269     if (src+4 > srcend) break;
1270     c = src[3];
1271     if (c >= '0' && c <= '9') { c -= '0'; }
1272     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1273     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1274     else {
1275       src += 3; /* skip incomplete \u sequence */
1276       continue;
1277     }
1278     ch |= (cint)c << 8;
1279     if (src+5 > srcend) break;
1280     c = src[4];
1281     if (c >= '0' && c <= '9') { c -= '0'; }
1282     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1283     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1284     else {
1285       src += 4; /* skip incomplete \u sequence */
1286       continue;
1287     }
1288     ch |= (cint)c << 4;
1289     if (src+6 > srcend) break;
1290     c = src[5];
1291     if (c >= '0' && c <= '9') { c -= '0'; }
1292     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1293     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1294     else {
1295       src += 5; /* skip incomplete \u sequence */
1296       continue;
1297     }
1298     ch |= (cint)c;
1299     if (ch < char_code_limit && !(ch >= 0xd800 && ch < 0xe000)) {
1300       if (dest==destend) break;
1301       *dest++ = as_chart(ch);
1302       src += 6; /* complete \u sequence */
1303       continue;
1304     }
1305     if (!(ch >= 0xd800 && ch < 0xdc00)) {
1306       src += 6; /* skip invalid \u sequence */
1307       continue;
1308     }
1309     var cint ch1 = ch;
1310     if (src+7 > srcend) break;
1311     if (src[6] != '\\') {
1312       src += 6; /* skip incomplete \u sequence */
1313       continue;
1314     }
1315     if (src+8 > srcend) break;
1316     if (src[7] != 'u') {
1317       src += 6; /* skip incomplete \u sequence */
1318       continue;
1319     }
1320     if (src+9 > srcend) break;
1321     c = src[8];
1322     if (c >= '0' && c <= '9') { c -= '0'; }
1323     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1324     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1325     else {
1326       src += 8; /* skip incomplete \u sequence */
1327       continue;
1328     }
1329     ch = (cint)c << 12;
1330     if (src+10 > srcend) break;
1331     c = src[9];
1332     if (c >= '0' && c <= '9') { c -= '0'; }
1333     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1334     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1335     else {
1336       src += 9; /* skip incomplete \u sequence */
1337       continue;
1338     }
1339     ch |= (cint)c << 8;
1340     if (src+11 > srcend) break;
1341     c = src[10];
1342     if (c >= '0' && c <= '9') { c -= '0'; }
1343     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1344     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1345     else {
1346       src += 10; /* skip incomplete \u sequence */
1347       continue;
1348     }
1349     ch |= (cint)c << 4;
1350     if (src+12 > srcend) break;
1351     c = src[11];
1352     if (c >= '0' && c <= '9') { c -= '0'; }
1353     else if (c >= 'A' && c <= 'F') { c -= 'A'-10; }
1354     else if (c >= 'a' && c <= 'f') { c -= 'a'-10; }
1355     else {
1356       src += 11; /* skip incomplete \u sequence */
1357       continue;
1358     }
1359     ch |= (cint)c;
1360     if (ch >= 0xdc00 && ch < 0xe000) {
1361       ch = 0x10000 + ((ch1 - 0xd800) << 10) + (ch - 0xdc00);
1362       if (ch < char_code_limit) {
1363         if (dest==destend) break;
1364         *dest++ = as_chart(ch);
1365         src += 12; /* complete \u sequence */
1366         continue;
1367       }
1368     }
1369     src += 6; /* skip invalid \u sequence */
1370     continue;
1371   }
1372   *srcp = src;
1373   *destp = dest;
1374 }
1375 
1376 /* Characters to bytes. */
1377 
java_wcslen(object encoding,const chart * src,const chart * srcend)1378 global uintL java_wcslen (object encoding, const chart* src,
1379                           const chart* srcend) {
1380   unused(encoding);
1381   var uintL destlen = 0;
1382   while (src < srcend) {
1383     var cint ch = as_cint(*src++);
1384     destlen += (ch < 0x80 ? 1 : ch < 0x10000 ? 6 : 12);
1385   }
1386   return destlen;
1387 }
1388 
java_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1389 global void java_wcstombs (object encoding, object stream, const chart* *srcp,
1390                            const chart* srcend, uintB* *destp,
1391                            uintB* destend) {
1392   unused(encoding); unused(stream);
1393   var const chart* src = *srcp;
1394   var uintB* dest = *destp;
1395   while (src < srcend) {
1396     local char const hex_table_lc[] = "0123456789abcdef"; /* lowercase! */
1397     var cint ch = as_cint(*src);
1398     var uintL count = (ch < 0x80 ? 1 : ch < 0x10000 ? 6 : 12);
1399     if (dest+count > destend) break;
1400     src++;
1401     if (ch < 0x80) { /* 1 byte sequence */
1402       *dest++ = ch;
1403     } else if (ch < 0x10000) { /* 6 byte sequence */
1404       *dest++ = '\\';
1405       *dest++ = 'u';
1406       *dest++ = hex_table_lc[(ch>>12)&0x0F];
1407       *dest++ = hex_table_lc[(ch>>8)&0x0F];
1408       *dest++ = hex_table_lc[(ch>>4)&0x0F];
1409       *dest++ = hex_table_lc[ch&0x0F];
1410     } else { /* 12 byte sequence */
1411       var cint ch1 = 0xD800 + ((ch - 0x10000) >> 10);
1412       var cint ch2 = 0xDC00 + ((ch - 0x10000) & 0x3FF);
1413       *dest++ = '\\';
1414       *dest++ = 'u';
1415       *dest++ = hex_table_lc[(ch1>>12)&0x0F];
1416       *dest++ = hex_table_lc[(ch1>>8)&0x0F];
1417       *dest++ = hex_table_lc[(ch1>>4)&0x0F];
1418       *dest++ = hex_table_lc[ch1&0x0F];
1419       *dest++ = '\\';
1420       *dest++ = 'u';
1421       *dest++ = hex_table_lc[(ch2>>12)&0x0F];
1422       *dest++ = hex_table_lc[(ch2>>8)&0x0F];
1423       *dest++ = hex_table_lc[(ch2>>4)&0x0F];
1424       *dest++ = hex_table_lc[ch2&0x0F];
1425     }
1426   }
1427   *srcp = src;
1428   *destp = dest;
1429 }
1430 
1431 /* -------------------------------------------------------------------------
1432  * 8-bit NLS characters sets */
1433 
1434 /* min. bytes per character = 1
1435    max. bytes per character = 1 */
1436 
1437 typedef struct nls_table_t {
1438   const char* charset;
1439   const unsigned char* const* page_uni2charset;  /* UCS-2 to 8-bit table */
1440   const unsigned short* charset2uni;             /* 8-bit to UCS-2 table */
1441   int is_ascii_extension;
1442 }
1443 #if (alignment_long < PSEUDODATA_ALIGNMENT) && defined(GNU)
1444 /* Force all XPSEUDODATAs to be allocated with a PSEUDODATA_ALIGNMENT.
1445    GC needs this. */
1446   __attribute__ ((aligned (PSEUDODATA_ALIGNMENT)))
1447 #endif
1448        nls_table_t;
1449 
1450 static const unsigned char nopage[256] = {
1451   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
1452   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
1453   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
1454   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
1455   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
1456   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
1457   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
1458   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
1459   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */
1460   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */
1461   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */
1462   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */
1463   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */
1464   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */
1465   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */
1466   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */
1467   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */
1468   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */
1469   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */
1470   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */
1471   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */
1472   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa8-0xaf */
1473   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb0-0xb7 */
1474   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xb8-0xbf */
1475   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc0-0xc7 */
1476   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */
1477   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd0-0xd7 */
1478   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xd8-0xdf */
1479   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe0-0xe7 */
1480   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */
1481   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xf0-0xf7 */
1482   0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* 0xf8-0xff */
1483 };
1484 
1485 #include "nls_ascii.c"
1486 #include "nls_iso8859_1.c"
1487 #include "nls_iso8859_2.c"
1488 #include "nls_iso8859_3.c"
1489 #include "nls_iso8859_4.c"
1490 #include "nls_iso8859_5.c"
1491 #include "nls_iso8859_6.c"
1492 #include "nls_iso8859_7.c"
1493 #include "nls_iso8859_8.c"
1494 #include "nls_iso8859_9.c"
1495 #include "nls_iso8859_10.c"
1496 #include "nls_iso8859_13.c"
1497 #include "nls_iso8859_14.c"
1498 #include "nls_iso8859_15.c"
1499 #include "nls_iso8859_16.c"
1500 #include "nls_koi8_r.c"
1501 #include "nls_koi8_u.c"
1502 #include "nls_mac_arabic.c"
1503 #include "nls_mac_centraleurope.c"
1504 #include "nls_mac_croatian.c"
1505 #include "nls_mac_cyrillic.c"
1506 #include "nls_mac_dingbat.c"
1507 #include "nls_mac_greek.c"
1508 #include "nls_mac_hebrew.c"
1509 #include "nls_mac_iceland.c"
1510 #include "nls_mac_roman.c"
1511 #include "nls_mac_romania.c"
1512 #include "nls_mac_symbol.c"
1513 #include "nls_mac_thai.c"
1514 #include "nls_mac_turkish.c"
1515 #include "nls_mac_ukraine.c"
1516 #include "nls_cp437_ms.c"
1517 #include "nls_cp437_ibm.c"
1518 #include "nls_cp737.c"
1519 #include "nls_cp775.c"
1520 #include "nls_cp850.c"
1521 #include "nls_cp852_ms.c"
1522 #include "nls_cp852_ibm.c"
1523 #include "nls_cp855.c"
1524 #include "nls_cp857.c"
1525 #include "nls_cp860_ms.c"
1526 #include "nls_cp860_ibm.c"
1527 #include "nls_cp861_ms.c"
1528 #include "nls_cp861_ibm.c"
1529 #include "nls_cp862_ms.c"
1530 #include "nls_cp862_ibm.c"
1531 #include "nls_cp863_ms.c"
1532 #include "nls_cp863_ibm.c"
1533 #include "nls_cp864_ms.c"
1534 #include "nls_cp864_ibm.c"
1535 #include "nls_cp865_ms.c"
1536 #include "nls_cp865_ibm.c"
1537 #include "nls_cp866.c"
1538 #include "nls_cp869_ms.c"
1539 #include "nls_cp869_ibm.c"
1540 #include "nls_cp874_ms.c"
1541 #include "nls_cp874_ibm.c"
1542 #include "nls_cp1250.c"
1543 #include "nls_cp1251.c"
1544 #include "nls_cp1252.c"
1545 #include "nls_cp1253.c"
1546 #include "nls_cp1254.c"
1547 #include "nls_cp1256.c"
1548 #include "nls_cp1257.c"
1549 #include "nls_hp_roman8.c"
1550 #include "nls_jisx0201.c"
1551 
1552 #define nls_first_sym  S(ascii)
1553 #define nls_last_sym  S(jisx0201)
1554 #define nls_num_encodings  (&symbol_tab_data.S_jisx0201 - &symbol_tab_data.S_ascii + 1)
1555 
1556 static const nls_table_t * const nls_tables[] = {
1557   &nls_ascii_table,
1558   &nls_iso8859_1_table,
1559   &nls_iso8859_2_table,
1560   &nls_iso8859_3_table,
1561   &nls_iso8859_4_table,
1562   &nls_iso8859_5_table,
1563   &nls_iso8859_6_table,
1564   &nls_iso8859_7_table,
1565   &nls_iso8859_8_table,
1566   &nls_iso8859_9_table,
1567   &nls_iso8859_10_table,
1568   &nls_iso8859_13_table,
1569   &nls_iso8859_14_table,
1570   &nls_iso8859_15_table,
1571   &nls_iso8859_16_table,
1572   &nls_koi8_r_table,
1573   &nls_koi8_u_table,
1574   &nls_mac_arabic_table,
1575   &nls_mac_centraleurope_table,
1576   &nls_mac_croatian_table,
1577   &nls_mac_cyrillic_table,
1578   &nls_mac_dingbat_table,
1579   &nls_mac_greek_table,
1580   &nls_mac_hebrew_table,
1581   &nls_mac_iceland_table,
1582   &nls_mac_roman_table,
1583   &nls_mac_romania_table,
1584   &nls_mac_symbol_table,
1585   &nls_mac_thai_table,
1586   &nls_mac_turkish_table,
1587   &nls_mac_ukraine_table,
1588   &nls_cp437_ms_table,
1589   &nls_cp437_ibm_table,
1590   &nls_cp737_table,
1591   &nls_cp775_table,
1592   &nls_cp850_table,
1593   &nls_cp852_ms_table,
1594   &nls_cp852_ibm_table,
1595   &nls_cp855_table,
1596   &nls_cp857_table,
1597   &nls_cp860_ms_table,
1598   &nls_cp860_ibm_table,
1599   &nls_cp861_ms_table,
1600   &nls_cp861_ibm_table,
1601   &nls_cp862_ms_table,
1602   &nls_cp862_ibm_table,
1603   &nls_cp863_ms_table,
1604   &nls_cp863_ibm_table,
1605   &nls_cp864_ms_table,
1606   &nls_cp864_ibm_table,
1607   &nls_cp865_ms_table,
1608   &nls_cp865_ibm_table,
1609   &nls_cp866_table,
1610   &nls_cp869_ms_table,
1611   &nls_cp869_ibm_table,
1612   &nls_cp874_ms_table,
1613   &nls_cp874_ibm_table,
1614   &nls_cp1250_table,
1615   &nls_cp1251_table,
1616   &nls_cp1252_table,
1617   &nls_cp1253_table,
1618   &nls_cp1254_table,
1619   &nls_cp1256_table,
1620   &nls_cp1257_table,
1621   &nls_hp_roman8_table,
1622   &nls_jisx0201_table,
1623 };
1624 
1625 global uintL nls_mblen (object encoding, const uintB* src,
1626                         const uintB* srcend);
1627 global void nls_mbstowcs (object encoding, object stream, const uintB* *srcp,
1628                           const uintB* srcend, chart* *destp, chart* destend);
1629 global uintL nls_asciiext_mblen (object encoding, const uintB* src,
1630                                  const uintB* srcend);
1631 global void nls_asciiext_mbstowcs (object encoding, object stream,
1632                                    const uintB* *srcp, const uintB* srcend,
1633                                    chart* *destp, chart* destend);
1634 global uintL nls_wcslen (object encoding, const chart* src,
1635                          const chart* srcend);
1636 global void nls_wcstombs (object encoding, object stream, const chart* *srcp,
1637                           const chart* srcend, uintB* *destp, uintB* destend);
1638 global uintL nls_asciiext_wcslen (object encoding, const chart* src,
1639                                   const chart* srcend);
1640 global void nls_asciiext_wcstombs (object encoding, object stream,
1641                                    const chart* *srcp, const chart* srcend,
1642                                    uintB* *destp, uintB* destend);
1643 global object nls_range (object encoding, uintL start, uintL end, uintL maxintervals);
1644 
1645 /* Bytes to characters. */
1646 
1647 /* Error when an invalid byte was encountered.
1648  error_nls_invalid(encoding,b); */
error_nls_invalid(object encoding,uintB b)1649 local _Noreturn void error_nls_invalid (object encoding, uintB b) {
1650   pushSTACK(fixnum(b));      /* CHARSET-TYPE-ERROR slot DATUM */
1651   pushSTACK(encoding);       /* CHARSET-TYPE-ERROR slot EXPECTED-TYPE */
1652   pushSTACK(TheEncoding(encoding)->enc_charset);
1653   pushSTACK(ascii_char(hex_table[b&0x0F]));
1654   pushSTACK(ascii_char(hex_table[(b>>4)&0x0F]));
1655   pushSTACK(TheSubr(subr_self)->name);
1656   error(charset_type_error,GETTEXT("~S: Invalid byte #x~C~C in ~S conversion"));
1657 }
1658 
nls_mblen(object encoding,const uintB * src,const uintB * srcend)1659 global uintL nls_mblen (object encoding, const uintB* src,
1660                         const uintB* srcend) {
1661   if (!eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
1662     return (srcend-src);
1663   else {
1664     var uintL count = srcend-src;
1665     var uintL result = 0;
1666     if (count > 0) {
1667       var const nls_table_t* table =
1668         (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1669       var const unsigned short* cvtable = table->charset2uni;
1670       dotimespL(count,count, {
1671         if (!(cvtable[*src++] == 0xFFFD))
1672           result++;
1673       });
1674     }
1675     return result;
1676   }
1677 }
1678 
nls_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1679 global void nls_mbstowcs (object encoding, object stream, const uintB* *srcp,
1680                           const uintB* srcend, chart* *destp, chart* destend) {
1681   unused(stream);
1682   var const uintB* src = *srcp;
1683   var chart* dest = *destp;
1684   var uintL count = destend-dest;
1685   if (count > srcend-src)
1686     count = srcend-src;
1687   if (count > 0) {
1688     var const nls_table_t* table =
1689       (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1690     var const unsigned short* cvtable = table->charset2uni;
1691     dotimespL(count,count, {
1692       var uintB b = *src++;
1693       var cint ch = cvtable[b];
1694       if (!(ch == 0xFFFD)) {
1695         *dest++ = as_chart(ch);
1696       } else {
1697         var object action = TheEncoding(encoding)->enc_towcs_error;
1698         if (eq(action,S(Kignore))) {
1699         } else if (eq(action,S(Kerror))) {
1700           error_nls_invalid(encoding,b);
1701         } else {
1702           *dest++ = char_code(action);
1703         }
1704       }
1705     });
1706     *srcp = src;
1707     *destp = dest;
1708   }
1709 }
1710 
1711 /* Same thing, specially optimized for ASCII extensions. */
1712 
nls_asciiext_mblen(object encoding,const uintB * src,const uintB * srcend)1713 global uintL nls_asciiext_mblen (object encoding, const uintB* src,
1714                                  const uintB* srcend) {
1715   if (!eq(TheEncoding(encoding)->enc_towcs_error,S(Kignore)))
1716     return (srcend-src);
1717   else {
1718     var uintL count = srcend-src;
1719     var uintL result = 0;
1720     if (count > 0) {
1721       var const nls_table_t* table =
1722         (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1723       var const unsigned short* cvtable = table->charset2uni;
1724       dotimespL(count,count, {
1725         var uintB b = *src++;
1726         if ((b < 0x80) || !(cvtable[b] == 0xFFFD))
1727           result++;
1728       });
1729     }
1730     return result;
1731   }
1732 }
1733 
nls_asciiext_mbstowcs(object encoding,object stream,const uintB ** srcp,const uintB * srcend,chart ** destp,chart * destend)1734 global void nls_asciiext_mbstowcs (object encoding, object stream,
1735                                    const uintB* *srcp, const uintB* srcend,
1736                                    chart* *destp, chart* destend) {
1737   unused(stream);
1738   var const uintB* src = *srcp;
1739   var chart* dest = *destp;
1740   var uintL count = destend-dest;
1741   if (count > srcend-src)
1742     count = srcend-src;
1743   if (count > 0) {
1744     var const nls_table_t* table =
1745       (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1746     var const unsigned short* cvtable = table->charset2uni;
1747     dotimespL(count,count, {
1748       var uintB b = *src++;
1749       if (b < 0x80) { /* avoid memory reference (big speedup!) */
1750         *dest++ = as_chart((cint)b);
1751       } else {
1752         var cint ch = cvtable[b];
1753         if (!(ch == 0xFFFD)) {
1754           *dest++ = as_chart(ch);
1755         } else {
1756           var object action = TheEncoding(encoding)->enc_towcs_error;
1757           if (eq(action,S(Kignore))) {
1758           } else if (eq(action,S(Kerror))) {
1759             error_nls_invalid(encoding,b);
1760           } else {
1761             *dest++ = char_code(action);
1762           }
1763         }
1764       }
1765     });
1766     *srcp = src;
1767     *destp = dest;
1768   }
1769 }
1770 
1771 /* Characters to bytes. */
1772 
nls_wcslen(object encoding,const chart * src,const chart * srcend)1773 global uintL nls_wcslen (object encoding, const chart* src,
1774                          const chart* srcend) {
1775   var uintL count = srcend-src;
1776   var uintL result = 0;
1777   if (count > 0) {
1778     var const nls_table_t* table = (const nls_table_t*)
1779       TheMachine(TheEncoding(encoding)->enc_table);
1780     var const unsigned char* const* cvtable = table->page_uni2charset;
1781     dotimespL(count,count, {
1782       var chart ch = *src++;
1783       if (as_cint(ch) < 0x10000
1784           && (cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0
1785               || chareq(ch,ascii(0))))
1786         result++;
1787       else {
1788         var object action = TheEncoding(encoding)->enc_tombs_error;
1789         if (eq(action,S(Kignore))) {
1790         } else if (uint8_p(action)) {
1791           result++;
1792         } else if (!eq(action,S(Kerror))) {
1793           var chart c = char_code(action);
1794           if (as_cint(c) < 0x10000
1795               && (cvtable[as_cint(c)>>8][as_cint(c)&0xFF] != 0
1796                   || chareq(c,ascii(0))))
1797             result++;
1798         } else
1799           error_unencodable(encoding,ch);
1800       }
1801      });
1802   }
1803   return result;
1804 }
1805 
nls_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1806 global void nls_wcstombs (object encoding, object stream,
1807                           const chart* *srcp, const chart* srcend,
1808                           uintB* *destp, uintB* destend) {
1809   unused(stream);
1810   var const chart* src = *srcp;
1811   var uintB* dest = *destp;
1812   var uintL scount = srcend-src;
1813   var uintL dcount = destend-dest;
1814   if (scount > 0 && dcount > 0) {
1815     var const nls_table_t* table = (const nls_table_t*)
1816       TheMachine(TheEncoding(encoding)->enc_table);
1817     var const unsigned char* const* cvtable = table->page_uni2charset;
1818     do {
1819       var chart ch = *src++; scount--;
1820       var uintB b;
1821       if (as_cint(ch) < 0x10000
1822           && (b = cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF],
1823               b != 0 || chareq(ch,ascii(0)))) {
1824         *dest++ = b; dcount--;
1825       } else {
1826         var object action = TheEncoding(encoding)->enc_tombs_error;
1827         if (eq(action,S(Kignore))) {
1828         } else if (uint8_p(action)) {
1829           *dest++ = I_to_uint8(action); dcount--;
1830         } else if (!eq(action,S(Kerror))) {
1831           var chart c = char_code(action);
1832           if (as_cint(c) < 0x10000
1833               && (b = cvtable[as_cint(c)>>8][as_cint(c)&0xFF],
1834                   b != 0 || chareq(c,ascii(0)))) {
1835             *dest++ = b; dcount--;
1836           } else
1837             error_unencodable(encoding,ch);
1838         } else
1839           error_unencodable(encoding,ch);
1840       }
1841     } while (scount > 0 && dcount > 0);
1842     *srcp = src;
1843     *destp = dest;
1844   }
1845 }
1846 
1847 /* Same thing, specially optimized for ASCII extensions. */
1848 
nls_asciiext_wcslen(object encoding,const chart * src,const chart * srcend)1849 global uintL nls_asciiext_wcslen (object encoding, const chart* src,
1850                                   const chart* srcend) {
1851   var uintL count = srcend-src;
1852   var uintL result = 0;
1853   if (count > 0) {
1854     var const nls_table_t* table = (const nls_table_t*)
1855       TheMachine(TheEncoding(encoding)->enc_table);
1856     var const unsigned char* const* cvtable = table->page_uni2charset;
1857     dotimespL(count,count, {
1858       var chart ch = *src++;
1859       if (as_cint(ch) < 0x80
1860           || (as_cint(ch) < 0x10000
1861               && cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0))
1862         result++;
1863       else {
1864         var object action = TheEncoding(encoding)->enc_tombs_error;
1865         if (eq(action,S(Kignore))) {
1866         } else if (uint8_p(action)) {
1867           result++;
1868         } else if (!eq(action,S(Kerror))) {
1869           var chart c = char_code(action);
1870           if (as_cint(c) < 0x10000
1871               && (cvtable[as_cint(c)>>8][as_cint(c)&0xFF] != 0
1872                   || chareq(c,ascii(0))))
1873             result++;
1874         } else
1875           error_unencodable(encoding,ch);
1876       }
1877     });
1878   }
1879   return result;
1880 }
1881 
nls_asciiext_wcstombs(object encoding,object stream,const chart ** srcp,const chart * srcend,uintB ** destp,uintB * destend)1882 global void nls_asciiext_wcstombs (object encoding, object stream,
1883                                    const chart* *srcp, const chart* srcend,
1884                                    uintB* *destp, uintB* destend) {
1885   unused(stream);
1886   var const chart* src = *srcp;
1887   var uintB* dest = *destp;
1888   var uintL scount = srcend-src;
1889   var uintL dcount = destend-dest;
1890   if (scount > 0 && dcount > 0) {
1891     var const nls_table_t* table = (const nls_table_t*)
1892       TheMachine(TheEncoding(encoding)->enc_table);
1893     var const unsigned char* const* cvtable = table->page_uni2charset;
1894     do {
1895       var chart ch = *src++; scount--;
1896       if (as_cint(ch) < 0x80) { /* avoid memory reference (big speedup!) */
1897         *dest++ = (uintB)as_cint(ch);
1898         dcount--;
1899       } else {
1900         var uintB b;
1901         if (as_cint(ch) < 0x10000
1902             && (b = cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF],
1903                 b != 0)) {
1904           *dest++ = b; dcount--;
1905         } else {
1906           var object action = TheEncoding(encoding)->enc_tombs_error;
1907           if (eq(action,S(Kignore))) {
1908           } else if (uint8_p(action)) {
1909             *dest++ = I_to_uint8(action); dcount--;
1910           } else if (!eq(action,S(Kerror))) {
1911             var chart c = char_code(action);
1912             if (as_cint(c) < 0x10000
1913                 && (b = cvtable[as_cint(c)>>8][as_cint(c)&0xFF],
1914                     b != 0 || chareq(c,ascii(0)))) {
1915               *dest++ = b; dcount--;
1916             } else
1917               error_unencodable(encoding,ch);
1918           } else
1919             error_unencodable(encoding,ch);
1920         }
1921       }
1922     } while (scount > 0 && dcount > 0);
1923     *srcp = src;
1924     *destp = dest;
1925   }
1926 }
1927 
1928 /* Determining the range of encodable characters. */
nls_range(object encoding,uintL start,uintL end,uintL maxintervals)1929 global object nls_range (object encoding, uintL start, uintL end,
1930                          uintL maxintervals) {
1931   var uintL count = 0; /* number of intervals already on the STACK */
1932   /* The range lies in the BMP; no need to look beyond U+FFFF. */
1933   if (maxintervals > 0 && start < 0x10000) {
1934     if (end >= 0x10000)
1935       end = 0xFFFF;
1936     var const nls_table_t* table =
1937       (const nls_table_t*) TheMachine(TheEncoding(encoding)->enc_table);
1938     var const unsigned char* const* cvtable = table->page_uni2charset;
1939     var uintL i1;
1940     var uintL i2;
1941     var bool have_i1_i2 = false; /* [i1,i2] = interval being built */
1942     var uintL i;
1943     for (i = start;;) {
1944       /* Here i < 0x10000 and count < maxintervals. */
1945       var chart ch = as_chart(i);
1946       if (cvtable[as_cint(ch)>>8][as_cint(ch)&0xFF] != 0
1947           || chareq(ch,ascii(0))) {
1948         /* ch encodable -> extend the interval */
1949         if (!have_i1_i2) {
1950           have_i1_i2 = true; i1 = i;
1951         }
1952         i2 = i;
1953       } else {
1954         /* ch not encodable -> finish the interval */
1955         if (have_i1_i2) {
1956           pushSTACK(code_char(as_chart(i1)));
1957           pushSTACK(code_char(as_chart(i2)));
1958           check_STACK(); count++;
1959           have_i1_i2 = false;
1960           /* If we have already produced the maximum number of intervals
1961              requested by the caller, it's of no use to search further. */
1962           if (count == maxintervals)
1963             break;
1964         }
1965       }
1966       if (i == end)
1967         break;
1968       i++;
1969     }
1970     if (have_i1_i2) {
1971       pushSTACK(code_char(as_chart(i1))); pushSTACK(code_char(as_chart(i2)));
1972       check_STACK(); count++;
1973     }
1974   }
1975   return stringof(2*count);
1976 }
1977 
1978 /* -------------------------------------------------------------------------
1979  * iconv-based encodings */
1980 
1981 /* They are defined in stream.d because they need to access internals of
1982    the ChannelStream. */
1983 
1984 #ifdef HAVE_GOOD_ICONV
1985 
1986 extern uintL iconv_mblen (object encoding, const uintB* src,
1987                           const uintB* srcend);
1988 extern void iconv_mbstowcs (object encoding, object stream, const uintB* *srcp,
1989                             const uintB* srcend, chart* *destp,
1990                             chart* destend);
1991 extern uintL iconv_wcslen (object encoding, const chart* src,
1992                            const chart* srcend);
1993 extern void iconv_wcstombs (object encoding, object stream, const chart* *srcp,
1994                             const chart* srcend, uintB* *destp,
1995                             uintB* destend);
1996 extern object iconv_range (object encoding, uintL start, uintL end,
1997                            uintL maxintervals);
1998 
1999 #if defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2))
2000 
2001 #ifdef GNU_LIBICONV
2002 #define iconv_first_sym  S(koi8_ru)
2003 #define iconv_last_sym  S(utf_7)
2004 #define iconv_num_encodings  (&symbol_tab_data.S_utf_7 - &symbol_tab_data.S_koi8_ru + 1)
2005 #else
2006 #define iconv_first_sym  S(cp1255)
2007 #define iconv_last_sym  S(utf_7)
2008 #define iconv_num_encodings  (&symbol_tab_data.S_utf_7 - &symbol_tab_data.S_cp1255 + 1)
2009 #endif
2010 
2011 #endif
2012 
2013 #endif /* HAVE_GOOD_ICONV */
2014 
2015 /* ----------------------------------------------------------------------- */
2016 
2017 #endif /* ENABLE_UNICODE */
2018 
2019 /* =========================================================================
2020  * General functions */
2021 
2022 /* (MAKE-ENCODING [:charset] [:line-terminator] [:input-error-action]
2023                   [:output-error-action] [:if-does-not-exist])
2024  creates a new encoding. */
2025 LISPFUN(make_encoding,seclass_read,0,0,norest,key,5,
2026         (kw(charset),kw(line_terminator),
2027          kw(input_error_action),kw(output_error_action)
2028          kw(if_does_not_exist))) {
2029   var object arg = popSTACK(); /* :if-does-not-exist */
2030   var bool ignore_not_exist = nullp(arg); /* no error */
2031   /* Check the :CHARSET argument. */
2032   arg = STACK_3;
2033   /* string -> symbol in CHARSET */
2034   if (!boundp(arg) || eq(arg,S(Kdefault))) {
2035     arg = O(default_file_encoding);
2036    #ifndef ENABLE_UNICODE
2037     if (nullp(arg)) /* initialization */
2038       goto create_new_encoding;
2039    #endif
2040   } else if (encodingp(arg)) {
2041   }
2042  #ifdef ENABLE_UNICODE
2043   else if (symbolp(arg) && constant_var_p(TheSymbol(arg))
2044            && encodingp(Symbol_value(arg))) {
2045     arg = Symbol_value(arg);
2046   } else if (stringp(arg)) {
2047     var object arg_upcase = string_upcase(arg);
2048     var object sym;
2049     arg = STACK_3; /* refetch */
2050     if (find_external_symbol(arg_upcase,false,O(charset_package),&sym)
2051         && constant_var_p(TheSymbol(sym)) && encodingp(Symbol_value(sym)))
2052       arg = Symbol_value(sym);
2053     #ifdef HAVE_GOOD_ICONV
2054     else {
2055       var bool valid_encoding_p = true;
2056       with_string_0(arg,Symbol_value(S(ascii)),charset_ascii,{
2057         valid_encoding_p =
2058           check_charset(charset_ascii,ignore_not_exist ? nullobj : arg);
2059       });
2060       /* if :IF-DOES-NOT-EXIST was non-NIL and the encoding was invalid,
2061          check_charset() would have signalled an error */
2062       if (valid_encoding_p) {
2063         pushSTACK(coerce_ss(arg));
2064         var object encoding = allocate_encoding();
2065         TheEncoding(encoding)->enc_eol = S(Kunix);
2066         TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2067         TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2068         TheEncoding(encoding)->enc_charset = popSTACK();
2069         TheEncoding(encoding)->enc_mblen    = P(iconv_mblen);
2070         TheEncoding(encoding)->enc_mbstowcs = P(iconv_mbstowcs);
2071         TheEncoding(encoding)->enc_wcslen   = P(iconv_wcslen);
2072         TheEncoding(encoding)->enc_wcstombs = P(iconv_wcstombs);
2073         TheEncoding(encoding)->enc_range    = P(iconv_range);
2074         TheEncoding(encoding)->min_bytes_per_char = 1;
2075         TheEncoding(encoding)->max_bytes_per_char = max_bytes_per_chart; /* wild assumption */
2076         arg = encoding;
2077       } else {
2078         ASSERT(ignore_not_exist);
2079         arg = NIL;
2080       }
2081     }
2082     #else
2083     else error_illegal_arg(arg,S(encoding),S(Kcharset));
2084     #endif
2085   }
2086  #endif
2087   else error_illegal_arg(arg,S(encoding),S(Kcharset));
2088   STACK_3 = arg;
2089   /* Check the :LINE-TERMINATOR argument. */
2090   arg = STACK_2;
2091   if (!(!boundp(arg) || eq(arg,S(Kunix)) || eq(arg,S(Kmac)) || eq(arg,S(Kdos))))
2092     error_illegal_arg(arg,O(type_line_terminator),S(Kline_terminator));
2093   /* Check the :INPUT-ERROR-ACTION argument. */
2094   arg = STACK_1;
2095   if (!(!boundp(arg) || eq(arg,S(Kerror)) || eq(arg,S(Kignore)) || charp(arg)))
2096     error_illegal_arg(arg,O(type_input_error_action),S(Kinput_error_action));
2097   /* Check the :OUTPUT-ERROR-ACTION argument. */
2098   arg = STACK_0;
2099   if (!(!boundp(arg)
2100         || eq(arg,S(Kerror)) || eq(arg,S(Kignore))
2101         || charp(arg) || uint8_p(arg)))
2102     error_illegal_arg(arg,O(type_output_error_action),S(Koutput_error_action));
2103   /* Create a new encoding. */
2104   if (nullp(STACK_3) /* illegal charset & :IF-DOES-NOT-EXIST NIL */
2105       || ((!boundp(STACK_2) || eq(STACK_2,TheEncoding(STACK_3)->enc_eol))
2106           && (!boundp(STACK_1)
2107               || eq(STACK_1,TheEncoding(STACK_3)->enc_towcs_error))
2108           && (!boundp(STACK_0)
2109               || eq(STACK_0,TheEncoding(STACK_3)->enc_tombs_error)))) {
2110     VALUES1(STACK_3);
2111   } else create_new_encoding: {
2112     var object encoding = allocate_encoding();
2113     var object old_encoding = STACK_3;
2114     if (encodingp(old_encoding)) {
2115       var const gcv_object_t* ptr1 = &TheRecord(old_encoding)->recdata[0];
2116       var gcv_object_t* ptr2 = &TheRecord(encoding)->recdata[0];
2117       var uintC count = encoding_length;
2118       do { *ptr2++ = *ptr1++; } while (--count);
2119       if (encoding_xlength > 0)
2120         copy_mem_b(ptr2,ptr1,encoding_xlength);
2121     }
2122     if (boundp(STACK_2))
2123       TheEncoding(encoding)->enc_eol = STACK_2;
2124     if (boundp(STACK_1))
2125       TheEncoding(encoding)->enc_towcs_error = STACK_1;
2126     if (boundp(STACK_0))
2127       TheEncoding(encoding)->enc_tombs_error = STACK_0;
2128     VALUES1(encoding);
2129   }
2130   skipSTACK(4);
2131 }
2132 
2133 /* (SYSTEM::ENCODINGP object) */
2134 LISPFUNNF(encodingp,1) {
2135   var object arg = popSTACK();
2136   VALUES_IF(encodingp(arg));
2137 }
2138 
2139 #ifdef ENABLE_UNICODE
2140 #define DEFAULT_ENC &O(misc_encoding)
2141 #else
2142 #define DEFAULT_ENC &O(default_file_encoding)
2143 #endif
2144 
2145 /* (SYSTEM::CHARSET-TYPEP object encoding) tests whether the object
2146    is a character belonging to the given character set. */
2147 LISPFUNNR(charset_typep,2) {
2148   var object encoding = check_encoding(STACK_0,DEFAULT_ENC,false);
2149   var object obj = STACK_1;
2150   if (charp(obj)) {
2151    #ifdef ENABLE_UNICODE
2152     var uintL i = as_cint(char_code(obj));
2153     obj = Encoding_range(encoding)(encoding,i,i,1);
2154     VALUES_IF(Sstring_length(obj));
2155    #else
2156     VALUES1(T);
2157    #endif
2158   } else {
2159     VALUES1(NIL);
2160   }
2161   skipSTACK(2);
2162 }
2163 
2164 /* (EXT:ENCODING-LINE-TERMINATOR encoding) --> :UNIX/:DOS/:MAC */
2165 LISPFUNNF(encoding_line_terminator,1) {
2166   var object encoding = check_encoding(popSTACK(),DEFAULT_ENC,false);
2167   VALUES1(TheEncoding(encoding)->enc_eol);
2168 }
2169 
2170 #ifdef ENABLE_UNICODE
2171 
2172 /* (EXT:ENCODING-CHARSET encoding) --> charset */
2173 LISPFUNNF(encoding_charset,1) {
2174   var object encoding = check_encoding(popSTACK(),DEFAULT_ENC,false);
2175   VALUES1(TheEncoding(encoding)->enc_charset);
2176 }
2177 
2178 /* (SYSTEM::CHARSET-RANGE encoding char1 char2 [maxintervals])
2179  returns the range of characters in [char1,char2] encodable in the encoding. */
2180 LISPFUN(charset_range,seclass_read,3,1,norest,nokey,0,NIL) {
2181   var object encoding = check_encoding(STACK_3,DEFAULT_ENC,false);
2182   if (!charp(STACK_2)) STACK_2 = check_char(STACK_2);
2183   if (!charp(STACK_1)) STACK_1 = check_char(STACK_1);
2184   var uintL i1 = as_cint(char_code(STACK_2));
2185   var uintL i2 = as_cint(char_code(STACK_1));
2186   var uintL maxintervals;
2187   if (missingp(STACK_0))
2188     maxintervals = ~(uintL)0;
2189   else if (uint32_p(STACK_0))
2190     maxintervals = I_to_uint32(STACK_0);
2191   else
2192     error_uint32(STACK_0);
2193   VALUES1(i1 <= i2 ?
2194           Encoding_range(encoding)(encoding,i1,i2,maxintervals) :
2195           (object)O(empty_string));
2196   skipSTACK(4);
2197 }
2198 
2199 #endif
2200 
2201 /* -------------------------------------------------------------------------
2202  * Elementary string functions */
2203 
2204 /* UP: return a LISP string the given contents.
2205  n_char_to_string(charptr,len,encoding)
2206  > char* charptr: the address of the character sequence
2207  > uintL len: its length
2208  > object encoding: Encoding
2209  < return: normal-simple-string with len characters as content
2210  can trigger GC */
2211 #ifdef ENABLE_UNICODE
n_char_to_string(const char * srcptr,uintL blen,object encoding)2212 modexp maygc object n_char_to_string
2213 (const char* srcptr, uintL blen, object encoding) {
2214   var const uintB* bptr = (const uintB*)srcptr;
2215   var const uintB* bendptr = bptr+blen;
2216   var uintL clen = Encoding_mblen(encoding)(encoding,bptr,bendptr);
2217   pushSTACK(encoding);
2218   check_stringsize(clen);
2219   var object obj = allocate_string(clen);
2220   encoding = popSTACK();
2221   {
2222     var chart* cptr = &TheSnstring(obj)->data[0];
2223     var chart* cendptr = cptr+clen;
2224     Encoding_mbstowcs(encoding)(encoding,nullobj,&bptr,bendptr,&cptr,cendptr);
2225     ASSERT(cptr == cendptr);
2226   }
2227   return obj;
2228 }
2229 #else
n_char_to_string_(const char * srcptr,uintL len)2230 modexp maygc object n_char_to_string_ (const char* srcptr, uintL len) {
2231   var const uintB* bptr = (const uintB*)srcptr;
2232   check_stringsize(len);
2233   var object obj = allocate_string(len);
2234   if (len > 0) {
2235     var chart* ptr = &TheSnstring(obj)->data[0];
2236     /* copy bptr to ptr as characters: */
2237     dotimespL(len,len, { *ptr++ = as_chart(*bptr++); } );
2238   }
2239   return obj;
2240 }
2241 #endif
2242 
2243 /* UP: Convert an ASCIZ string to a LISP string
2244  asciz_to_string(asciz,encoding)
2245  ascii_to_string(asciz)
2246  > char* asciz: ASCIZ-String (NULL-terminated)
2247  > object encoding: Encoding
2248  < return: normal-simple-string the same string (without NULL)
2249  can trigger GC */
2250 #ifdef ENABLE_UNICODE
asciz_to_string(const char * asciz,object encoding)2251 modexp maygc object asciz_to_string (const char * asciz, object encoding) {
2252   return n_char_to_string(asciz,asciz_length(asciz),encoding);
2253 }
2254 #else
asciz_to_string_(const char * asciz)2255 modexp maygc object asciz_to_string_ (const char * asciz) {
2256   return n_char_to_string_(asciz,asciz_length(asciz));
2257 }
2258 #endif
ascii_to_string(const char * asciz)2259 modexp maygc object ascii_to_string (const char * asciz) {
2260   var const uintB* bptr = (const uintB*)asciz;
2261   var uintL len = asciz_length(asciz);
2262   check_stringsize(len);
2263   var object obj = allocate_s8string(len); /* allocate string */
2264   if (len > 0) {
2265     var cint8* ptr = &TheS8string(obj)->data[0];
2266     /* copy as characters bptr --> ptr: */
2267     dotimespL(len,len, {
2268       var uintB b = *bptr++;
2269       ASSERT(b < 0x80);
2270       *ptr++ = (cint8)b;
2271     });
2272   }
2273   DBGREALLOC(obj);
2274   return obj;
2275 }
2276 
2277 /* UP: Convert a LISP string to an ASCIZ string.
2278  string_to_asciz(obj,encoding)
2279  > object obj: String
2280  > object encoding: Encoding
2281  < return: simple-bit-vector with the bytes<==characters and a NULL at the end
2282  < TheAsciz(result): address of the byte sequence contain therein
2283  can trigger GC */
2284 #ifdef ENABLE_UNICODE
string_to_asciz(object obj,object encoding)2285 modexp maygc object string_to_asciz (object obj, object encoding) {
2286   var uintL len;
2287   var uintL offset;
2288   var object string = unpack_string_ro(obj,&len,&offset);
2289   var const chart* srcptr;
2290   unpack_sstring_alloca(string,len,offset, srcptr=);
2291   var uintL bytelen = cslen(encoding,srcptr,len);
2292   pushSTACK(encoding);
2293   pushSTACK(string);
2294   var object newasciz = allocate_bit_vector(Atype_8Bit,bytelen+1);
2295   string = popSTACK();
2296   encoding = popSTACK();
2297   unpack_sstring_alloca(string,len,offset, srcptr=);
2298   cstombs(encoding,srcptr,len,&TheSbvector(newasciz)->data[0],bytelen);
2299   TheSbvector(newasciz)->data[bytelen] = '\0';
2300   return newasciz;
2301 }
2302 #else
string_to_asciz_(object obj)2303 modexp maygc object string_to_asciz_ (object obj) {
2304   pushSTACK(obj); /* save string */
2305   var object newasciz = allocate_bit_vector(Atype_8Bit,vector_length(obj)+1);
2306   obj = popSTACK(); /* restore string */
2307   {
2308     var uintL len;
2309     var uintL offset;
2310     var object string = unpack_string_ro(obj,&len,&offset);
2311     var const chart* sourceptr;
2312     unpack_sstring_alloca(string,len,offset, sourceptr=);
2313     /* source-string: length in len, bytes at sourceptr */
2314     var uintB* destptr = &TheSbvector(newasciz)->data[0];
2315     /* destination-string: bytes at destptr */
2316     { /* copy loop: */
2317       var uintL count;
2318       dotimesL(count,len, { *destptr++ = as_cint(*sourceptr++); } );
2319       *destptr++ = '\0'; /* append NULL byte */
2320     }
2321   }
2322   return newasciz;
2323 }
2324 #endif
2325 
2326 /* =========================================================================
2327  * Initialization */
2328 
2329 /* Initialize the encodings.
2330  init_encodings(); */
init_encodings_1(void)2331 global void init_encodings_1 (void) {
2332   /* Compile-time checks: */
2333   ASSERT(sizeof(chart) == sizeof(cint));
2334  #ifdef ENABLE_UNICODE
2335   {
2336     var object symbol = S(base64);
2337     var object encoding = allocate_encoding();
2338     TheEncoding(encoding)->enc_eol = S(Kunix);
2339     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2340     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2341     TheEncoding(encoding)->enc_charset = symbol;
2342     TheEncoding(encoding)->enc_mblen    = P(base64_mblen);
2343     TheEncoding(encoding)->enc_mbstowcs = P(base64_mbstowcs);
2344     TheEncoding(encoding)->enc_wcslen   = P(base64_wcslen);
2345     TheEncoding(encoding)->enc_wcstombs = P(base64_wcstombs);
2346     TheEncoding(encoding)->enc_range    = P(base64_range);
2347     TheEncoding(encoding)->min_bytes_per_char = 2; /* ?? */
2348     TheEncoding(encoding)->max_bytes_per_char = 2; /* ?? */
2349     define_constant(symbol,encoding);
2350   }
2351   {
2352     var object symbol = S(unicode_16_big_endian);
2353     var object encoding = allocate_encoding();
2354     TheEncoding(encoding)->enc_eol = S(Kunix);
2355     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2356     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2357     TheEncoding(encoding)->enc_charset = symbol;
2358     TheEncoding(encoding)->enc_mblen    = P(uni16_mblen);
2359     TheEncoding(encoding)->enc_mbstowcs = P(uni16be_mbstowcs);
2360     TheEncoding(encoding)->enc_wcslen   = P(uni16_wcslen);
2361     TheEncoding(encoding)->enc_wcstombs = P(uni16be_wcstombs);
2362     TheEncoding(encoding)->enc_range    = P(bmp_range);
2363     TheEncoding(encoding)->min_bytes_per_char = 2;
2364     TheEncoding(encoding)->max_bytes_per_char = 2;
2365     define_constant(symbol,encoding);
2366   }
2367   {
2368     var object symbol = S(unicode_16_little_endian);
2369     var object encoding = allocate_encoding();
2370     TheEncoding(encoding)->enc_eol = S(Kunix);
2371     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2372     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2373     TheEncoding(encoding)->enc_charset = symbol;
2374     TheEncoding(encoding)->enc_mblen    = P(uni16_mblen);
2375     TheEncoding(encoding)->enc_mbstowcs = P(uni16le_mbstowcs);
2376     TheEncoding(encoding)->enc_wcslen   = P(uni16_wcslen);
2377     TheEncoding(encoding)->enc_wcstombs = P(uni16le_wcstombs);
2378     TheEncoding(encoding)->enc_range    = P(bmp_range);
2379     TheEncoding(encoding)->min_bytes_per_char = 2;
2380     TheEncoding(encoding)->max_bytes_per_char = 2;
2381     define_constant(symbol,encoding);
2382   }
2383   {
2384     var object symbol = S(unicode_32_big_endian);
2385     var object encoding = allocate_encoding();
2386     TheEncoding(encoding)->enc_eol = S(Kunix);
2387     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2388     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2389     TheEncoding(encoding)->enc_charset = symbol;
2390     TheEncoding(encoding)->enc_mblen    = P(uni32be_mblen);
2391     TheEncoding(encoding)->enc_mbstowcs = P(uni32be_mbstowcs);
2392     TheEncoding(encoding)->enc_wcslen   = P(uni32_wcslen);
2393     TheEncoding(encoding)->enc_wcstombs = P(uni32be_wcstombs);
2394     TheEncoding(encoding)->enc_range    = P(all_range);
2395     TheEncoding(encoding)->min_bytes_per_char = 4;
2396     TheEncoding(encoding)->max_bytes_per_char = 4;
2397     define_constant(symbol,encoding);
2398   }
2399   {
2400     var object symbol = S(unicode_32_little_endian);
2401     var object encoding = allocate_encoding();
2402     TheEncoding(encoding)->enc_eol = S(Kunix);
2403     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2404     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2405     TheEncoding(encoding)->enc_charset = symbol;
2406     TheEncoding(encoding)->enc_mblen    = P(uni32le_mblen);
2407     TheEncoding(encoding)->enc_mbstowcs = P(uni32le_mbstowcs);
2408     TheEncoding(encoding)->enc_wcslen   = P(uni32_wcslen);
2409     TheEncoding(encoding)->enc_wcstombs = P(uni32le_wcstombs);
2410     TheEncoding(encoding)->enc_range    = P(all_range);
2411     TheEncoding(encoding)->min_bytes_per_char = 4;
2412     TheEncoding(encoding)->max_bytes_per_char = 4;
2413     define_constant(symbol,encoding);
2414   }
2415   {
2416     var object symbol = S(utf_8);
2417     var object encoding = allocate_encoding();
2418     TheEncoding(encoding)->enc_eol = S(Kunix);
2419     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2420     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2421     TheEncoding(encoding)->enc_charset = symbol;
2422     TheEncoding(encoding)->enc_mblen    = P(utf8_mblen);
2423     TheEncoding(encoding)->enc_mbstowcs = P(utf8_mbstowcs);
2424     TheEncoding(encoding)->enc_wcslen   = P(utf8_wcslen);
2425     TheEncoding(encoding)->enc_wcstombs = P(utf8_wcstombs);
2426     TheEncoding(encoding)->enc_range    = P(all_range);
2427     TheEncoding(encoding)->min_bytes_per_char = 1;
2428     TheEncoding(encoding)->max_bytes_per_char = 4;
2429     define_constant(symbol,encoding);
2430   }
2431   {
2432     var object symbol = S(java);
2433     var object encoding = allocate_encoding();
2434     TheEncoding(encoding)->enc_eol = S(Kunix);
2435     TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2436     TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2437     TheEncoding(encoding)->enc_charset = symbol;
2438     TheEncoding(encoding)->enc_mblen    = P(java_mblen);
2439     TheEncoding(encoding)->enc_mbstowcs = P(java_mbstowcs);
2440     TheEncoding(encoding)->enc_wcslen   = P(java_wcslen);
2441     TheEncoding(encoding)->enc_wcstombs = P(java_wcstombs);
2442     TheEncoding(encoding)->enc_range    = P(all_range);
2443     TheEncoding(encoding)->min_bytes_per_char = 1;
2444     TheEncoding(encoding)->max_bytes_per_char = 12;
2445     define_constant(symbol,encoding);
2446   }
2447  #endif
2448 }
init_encodings_2(void)2449 global void init_encodings_2 (void) {
2450  #ifdef ENABLE_UNICODE
2451   {
2452     var object symbol = nls_first_sym;
2453     var const nls_table_t * const * ptr = &nls_tables[0];
2454     var uintC count = sizeof(nls_tables)/sizeof(nls_tables[0]);
2455     ASSERT(nls_num_encodings == count);
2456     for (; count > 0; count--) {
2457       var object encoding = allocate_encoding();
2458       TheEncoding(encoding)->enc_eol = S(Kunix);
2459       TheEncoding(encoding)->enc_towcs_error = S(Kerror);
2460       TheEncoding(encoding)->enc_tombs_error = S(Kerror);
2461       TheEncoding(encoding)->enc_charset = symbol;
2462       if ((*ptr)->is_ascii_extension) {
2463         TheEncoding(encoding)->enc_mblen    = P(nls_asciiext_mblen);
2464         TheEncoding(encoding)->enc_mbstowcs = P(nls_asciiext_mbstowcs);
2465         TheEncoding(encoding)->enc_wcslen   = P(nls_asciiext_wcslen);
2466         TheEncoding(encoding)->enc_wcstombs = P(nls_asciiext_wcstombs);
2467       } else {
2468         TheEncoding(encoding)->enc_mblen    = P(nls_mblen);
2469         TheEncoding(encoding)->enc_mbstowcs = P(nls_mbstowcs);
2470         TheEncoding(encoding)->enc_wcslen   = P(nls_wcslen);
2471         TheEncoding(encoding)->enc_wcstombs = P(nls_wcstombs);
2472       }
2473       TheEncoding(encoding)->enc_range    = P(nls_range);
2474       TheEncoding(encoding)->enc_table    = make_machine(*ptr);
2475       TheEncoding(encoding)->min_bytes_per_char = 1;
2476       TheEncoding(encoding)->max_bytes_per_char = 1;
2477       define_constant(symbol,encoding);
2478       symbol = objectplus(symbol,(soint)sizeof(*TheSymbol(symbol))<<(oint_addr_shift-addr_shift));
2479       ptr++;
2480     }
2481   }
2482   /* Now some aliases. */
2483   define_constant(S(unicode_16),Symbol_value(S(unicode_16_big_endian))); /* network byte order = big endian */
2484   define_constant(S(unicode_32),Symbol_value(S(unicode_32_big_endian))); /* network byte order = big endian */
2485   define_constant(S(ucs_2),Symbol_value(S(unicode_16)));
2486   define_constant(S(ucs_4),Symbol_value(S(unicode_32)));
2487   define_constant(S(macintosh),Symbol_value(S(mac_roman)));
2488   define_constant(S(windows_1250),Symbol_value(S(cp1250)));
2489   define_constant(S(windows_1251),Symbol_value(S(cp1251)));
2490   define_constant(S(windows_1252),Symbol_value(S(cp1252)));
2491   define_constant(S(windows_1253),Symbol_value(S(cp1253)));
2492   define_constant(S(windows_1254),Symbol_value(S(cp1254)));
2493   define_constant(S(windows_1256),Symbol_value(S(cp1256)));
2494   define_constant(S(windows_1257),Symbol_value(S(cp1257)));
2495  #if defined(HAVE_GOOD_ICONV) && (defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)))
2496   {
2497     var object symbol = iconv_first_sym;
2498     var uintC count = iconv_num_encodings;
2499     for (; count > 0; count--) {
2500       pushSTACK(Symbol_name(symbol)); pushSTACK(unbound);
2501       pushSTACK(unbound); pushSTACK(unbound); pushSTACK(NIL);
2502       C_make_encoding(); /* cannot use funcall yet */
2503       if (nullp(value1)) {
2504         pushSTACK(symbol); pushSTACK(O(charset_package)); C_unintern();
2505       } else define_constant(symbol,value1);
2506       symbol = objectplus(symbol,(soint)sizeof(*TheSymbol(symbol))<<(oint_addr_shift-addr_shift));
2507     }
2508   }
2509   /* Now some aliases, to be defined only if their targets still exit. */
2510   if (!boundp(Symbol_value(S(cp1255)))) {
2511     pushSTACK(S(windows_1255)); pushSTACK(O(charset_package)); C_unintern();
2512   } else define_constant(S(windows_1255),Symbol_value(S(cp1255)));
2513   if (!boundp(Symbol_value(S(cp1258)))) {
2514     pushSTACK(S(windows_1258)); pushSTACK(O(charset_package)); C_unintern();
2515   } else define_constant(S(windows_1258),Symbol_value(S(cp1258)));
2516  #endif
2517   /* Initialize O(internal_encoding): */
2518   pushSTACK(Symbol_value(S(utf_8))); /* :charset */
2519   pushSTACK(S(Kunix));          /* :line-terminator */
2520   pushSTACK(unbound);           /* :input-error-action */
2521   pushSTACK(unbound);           /* :output-error-action */
2522   pushSTACK(unbound);           /* :if-does-not-exist */
2523   C_make_encoding();
2524   O(internal_encoding) = value1;
2525  #endif
2526   /* Initialize locale dependent encodings: */
2527   init_dependent_encodings();
2528 }
2529 
2530 #ifdef ENABLE_UNICODE
2531 /* convert the encoding name to its canonical form
2532  at this time, just upper-case the encoding name
2533  in the future, might insert/delete `-' &c
2534  (do not use toupper() because we know that encoding name is ASCII) */
canonicalize_encoding(const char * const_encoding)2535 local char *canonicalize_encoding (const char *const_encoding) {
2536   var char* encoding = strdup(const_encoding); /* copy into writeable memory */
2537   var char* p;
2538   for (p = encoding; *p != '\0'; p++)
2539     if (*p >= 'a' && *p <= 'z')
2540       *p += 'A' - 'a';
2541   return encoding;
2542 }
2543 #endif
2544 
2545 /* The reasonable 1:1 default.
2546  Rationale: this avoids random encoding errors,
2547    e.g., when DIRECTORY is called on startup and the user home dir
2548    contains files with non-ASCII names
2549  The right default encoding is the one defined by the CLISP
2550    CODE-CHAR/CHAR-CODE conversion on the first 255 bytes, i.e., ISO-8859-1 */
2551 #define DEFAULT_1_1_ENCODING  Symbol_value(S(iso8859_1))
2552 #define DEFAULT_1_1_ENCODING_NAME "ISO-8859-1"
2553 
2554 /* Returns an encoding specified by a name.
2555  The line-termination is OS dependent.
2556  encoding_from_name(name,context)
2557  > const char* name: Any of the canonical names returned by the locale_charset()
2558                      function.
2559  > const char* context: for warnings
2560  > STACK_0 : if context /= locale and name does not make an encoding
2561              - the default locale encoding
2562  can trigger GC */
encoding_from_name(const char * const_name,const char * context)2563 local maygc object encoding_from_name (const char* const_name, const char* context) {
2564  #ifdef ENABLE_UNICODE
2565   /* Attempt to use the character set implicitly specified by the locale. */
2566   var char* name = canonicalize_encoding(const_name);
2567   if (asciz_equal(name,"US-ASCII") || asciz_equal(name,"ANSI_X3.4-1968"))
2568     pushSTACK(Symbol_value(S(ascii)));
2569   #if defined(HAVE_GOOD_ICONV) && (defined(GNU_LIBICONV) || (__GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)))
2570   else if (asciz_equal(name,"GB2312"))
2571     pushSTACK(Symbol_value(S(euc_cn)));
2572   else if (asciz_equal(name,"SJIS"))
2573     pushSTACK(Symbol_value(S(shift_jis)));
2574   #endif
2575   else if (asciz_equal(name,"1:1") || asciz_equal(name,"8BIT"))
2576     pushSTACK(DEFAULT_1_1_ENCODING);
2577   else {
2578     pushSTACK(asciz_to_string(name,Symbol_value(S(ascii))));
2579     pushSTACK(O(charset_package));
2580     C_find_symbol();
2581     if (!nullp(value2) && encodingp(Symbol_value(value1)))
2582       pushSTACK(Symbol_value(value1));
2583     else if (asciz_equal(context,"locale")) { /* e.g., name=ISO8859-1 */
2584       fprintf(stderr,GETTEXT("WARNING: %s: no encoding %s, using %s"),
2585               context,name,DEFAULT_1_1_ENCODING_NAME);
2586       fprint(stderr,"\n");
2587       pushSTACK(DEFAULT_1_1_ENCODING);
2588     } else  {
2589       fprintf(stderr,GETTEXT("WARNING: %s: no encoding %s, using %s"),
2590               context,name,"locale encoding");
2591       fprint(stderr,"\n");
2592       pushSTACK(STACK_0);
2593     }
2594   }
2595  #else
2596   unused const_name; unused context;
2597   pushSTACK(unbound);           /* :charset */
2598  #endif /* ENABLE_UNICODE */
2599  #if defined(WIN32) || (defined(UNIX) && (O_BINARY != 0) && !defined(UNIX_CYGWIN))
2600   pushSTACK(S(Kdos));           /* :line-terminator */
2601  #else
2602   pushSTACK(S(Kunix));          /* :line-terminator */
2603  #endif
2604   pushSTACK(unbound);           /* :input-error-action */
2605   pushSTACK(unbound);           /* :output-error-action */
2606   pushSTACK(unbound);           /* :if-does-not-exist */
2607   C_make_encoding();
2608   return value1;
2609 }
2610 
2611 /* Initialize the encodings which depend on environment variables.
2612  init_dependent_encodings(); */
init_dependent_encodings(void)2613 global void init_dependent_encodings(void) {
2614 #ifdef ENABLE_UNICODE
2615   extern const char* locale_encoding; /* GNU locale encoding canonical name */
2616   extern const char* argv_encoding_file; /* override *DEFAULT-FILE-ENCODING* */
2617  #ifndef CONSTANT_PATHNAME_ENCODING
2618   extern const char* argv_encoding_pathname; /* override *PATHNAME-ENCODING* */
2619  #endif
2620   extern const char* argv_encoding_terminal; /* override *terminal-encoding* */
2621   extern const char* argv_encoding_foreign; /* override *foreign-encoding* */
2622   extern const char* argv_encoding_misc; /* override *misc-encoding* */
2623   begin_system_call();
2624   locale_encoding = locale_charset(); /* depends on environment variables */
2625   end_system_call();
2626   pushSTACK(encoding_from_name(locale_encoding,"locale"));
2627   /* Initialize each encoding as follows: If the corresponding -E....
2628      option was not given, use the locale dependent locale_charset().
2629      If it was given, use that, and if the specified encoding was invalid,
2630      use a default encoding that does not depend on the locale. */
2631   O(default_file_encoding) =
2632     (argv_encoding_file == NULL ? (object)STACK_0
2633      : encoding_from_name(argv_encoding_file,"*DEFAULT-FILE-ENCODING*"));
2634   O(pathname_encoding) =
2635    #ifdef CONSTANT_PATHNAME_ENCODING
2636     CONSTANT_PATHNAME_ENCODING;
2637    #else
2638     (argv_encoding_pathname == NULL ? (object)STACK_0
2639      : encoding_from_name(argv_encoding_pathname,"*PATHNAME-ENCODING*"));
2640    #endif
2641  #if defined(WIN32_NATIVE)
2642   /* cf src/glllib/localcharset.c locale_charset() */
2643   if (argv_encoding_terminal == NULL) {
2644     var char buf[2+10+1];
2645     sprintf(buf,"CP%u",GetOEMCP());
2646     O(terminal_encoding) = encoding_from_name(buf,"*TERMINAL-ENCODING*");
2647   } else
2648     O(terminal_encoding) =
2649       encoding_from_name(argv_encoding_terminal,"*TERMINAL-ENCODING*");
2650  #else
2651   O(terminal_encoding) =
2652     (argv_encoding_terminal == NULL ? (object)STACK_0
2653      : encoding_from_name(argv_encoding_terminal,"*TERMINAL-ENCODING*"));
2654  #endif
2655  #if defined(HAVE_FFI)
2656   O(foreign_encoding) =
2657     (argv_encoding_foreign == NULL ? (object)STACK_0
2658      : encoding_from_name(argv_encoding_foreign,"*FOREIGN-ENCODING*"));
2659   O(foreign_8bit_encoding) =
2660     (TheEncoding(O(foreign_encoding))->max_bytes_per_char == 1
2661      ? O(foreign_encoding)
2662      /* not DEFAULT_1_1_ENCODING because foreign_8bit_encoding must agree
2663         with foreign_encoding and only ASCII agrees with _all_ encodings */
2664      : Symbol_value(S(ascii)));
2665  #endif
2666   O(misc_encoding) =
2667     (argv_encoding_misc == NULL ? (object)STACK_0
2668      : encoding_from_name(argv_encoding_misc,"*MISC-ENCODING*"));
2669   skipSTACK(1);
2670 #else /* no ENABLE_UNICODE */
2671   O(default_file_encoding) = encoding_from_name(NULL,NULL);
2672   O(terminal_encoding) = encoding_from_name(NULL,NULL);
2673 #endif
2674 }
2675 
2676 /* =========================================================================
2677  * Accessors */
2678 
2679 /* (SYSTEM::DEFAULT-FILE-ENCODING) */
2680 LISPFUNNR(default_file_encoding,0) {
2681   VALUES1(O(default_file_encoding));
2682 }
2683 
2684 /* (SYSTEM::SET-DEFAULT-FILE-ENCODING encoding) */
2685 LISPFUNN(set_default_file_encoding,1) {
2686   var object encoding =
2687     check_encoding(popSTACK(),&O(default_file_encoding),false);
2688   VALUES1(O(default_file_encoding) = encoding);
2689 }
2690 
2691 #ifdef ENABLE_UNICODE
2692 
2693 /* (SYSTEM::PATHNAME-ENCODING) */
2694 LISPFUNNR(pathname_encoding,0) {
2695   VALUES1(O(pathname_encoding));
2696 }
2697 
2698 /* (SYSTEM::SET-PATHNAME-ENCODING encoding) */
2699 LISPFUNN(set_pathname_encoding,1) {
2700   var object encoding = check_encoding(popSTACK(),&O(pathname_encoding),false);
2701  #ifdef CONSTANT_PATHNAME_ENCODING
2702   if (eq(encoding,O(pathname_encoding))) VALUES1(encoding);
2703   else {
2704     pushSTACK(CONSTANT_PATHNAME_ENCODING);
2705     error(error_condition,GETTEXT("*PATHNAME_ENCODING* on this platform can only be ~S"));
2706   }
2707  #else
2708   VALUES1(O(pathname_encoding) = encoding);
2709  #endif
2710 }
2711 
2712 /* (SYSTEM::TERMINAL-ENCODING) */
2713 LISPFUNNR(terminal_encoding,0) {
2714   VALUES1(O(terminal_encoding));
2715 }
2716 
2717 /* (SYSTEM::SET-TERMINAL-ENCODING encoding) */
2718 LISPFUNN(set_terminal_encoding,1) {
2719   var object encoding = check_encoding(STACK_0,&O(terminal_encoding),false);
2720   STACK_0 = encoding;
2721   /* Ensure O(terminal_encoding) = (STREAM-EXTERNAL-FORMAT *TERMINAL-IO*).
2722      But first modify (STREAM-EXTERNAL-FORMAT *TERMINAL-IO*): */
2723   set_terminalstream_external_format(var_stream(S(terminal_io),0),encoding);
2724   VALUES1(O(terminal_encoding) = popSTACK());
2725 }
2726 
2727 #if defined(HAVE_FFI)
2728 
2729 /* (SYSTEM::FOREIGN-ENCODING) */
2730 LISPFUNNR(foreign_encoding,0) {
2731   VALUES1(O(foreign_encoding));
2732 }
2733 
2734 /* (SYSTEM::SET-FOREIGN-ENCODING encoding) */
2735 LISPFUNN(set_foreign_encoding,1) {
2736   var object encoding = check_encoding(popSTACK(),&O(foreign_encoding),false);
2737   O(foreign_encoding) = encoding;
2738   O(foreign_8bit_encoding) =
2739     (TheEncoding(O(foreign_encoding))->max_bytes_per_char == 1
2740      ? O(foreign_encoding)
2741      : Symbol_value(S(ascii)));
2742   VALUES1(encoding);
2743 }
2744 
2745 #endif /* HAVE_FFI */
2746 
2747 /* (SYSTEM::MISC-ENCODING) */
2748 LISPFUNNR(misc_encoding,0) {
2749   VALUES1(O(misc_encoding));
2750 }
2751 
2752 /* (SYSTEM::SET-MISC-ENCODING encoding) */
2753 LISPFUNN(set_misc_encoding,1) {
2754   var object encoding = check_encoding(popSTACK(),&O(misc_encoding),false);
2755   VALUES1(O(misc_encoding) = encoding);
2756 }
2757 
2758 #endif /* ENABLE_UNICODE */
2759 
2760 /* =========================================================================
2761  * More functions */
2762 
2763 /* (CONVERT-STRING-FROM-BYTES byte-array encoding [:start] [:end]) */
2764 LISPFUN(convert_string_from_bytes,seclass_read,2,0,norest,key,2,
2765         (kw(start),kw(end)) ) {
2766   /* Stack layout: array, encoding, start, end. */
2767   STACK_3 = check_vector(STACK_3); /* check array */
2768   STACK_2 = check_encoding(STACK_2,DEFAULT_ENC,false);
2769   var object array = STACK_3;
2770   STACK_3 = STACK_2; /* encoding */
2771   STACK_2 = array; /* array */
2772   /* Stack layout: encoding, array, start, end. */
2773   if (!boundp(STACK_1))
2774     STACK_1 = Fixnum_0; /* check start */
2775   if (missingp(STACK_0)) /* check end */
2776     STACK_0 = fixnum(vector_length(array));
2777   /* Convert array to a vector with element type (UNSIGNED-BYTE 8): */
2778   if (!bit_vector_p(Atype_8Bit,array)) {
2779     /* (SYS::COERCED-SUBSEQ array '(ARRAY (UNSIGNED-BYTE 8) (*))
2780                             [:start] [:end]) */
2781     pushSTACK(array); pushSTACK(O(type_uint8_vector));
2782     pushSTACK(S(Kstart)); pushSTACK(STACK_(1+3));
2783     pushSTACK(S(Kend)); pushSTACK(STACK_(0+5));
2784     funcall(L(coerced_subseq),6);
2785     if (!bit_vector_p(Atype_8Bit,value1)) { NOTREACHED; }
2786     STACK_2 = value1;
2787     STACK_0 = I_I_minus_I(STACK_0,STACK_1); /* end := (- end start) */
2788     STACK_1 = Fixnum_0; /* start := 0 */
2789     array = STACK_2;
2790   }
2791   /* Determine size of result string: */
2792   var stringarg sa;
2793   sa.offset = 0; sa.len = vector_length(array);
2794   sa.string = array_displace_check(array,sa.len,&sa.offset);
2795   test_vector_limits(&sa);
2796   STACK_0 = array = sa.string;
2797   /* stack layout: encoding, array */
2798   var uintL start = sa.offset + sa.index;
2799   var uintL end = start + sa.len;
2800  #ifdef ENABLE_UNICODE
2801   var uintL clen =
2802     Encoding_mblen(STACK_1)(STACK_1,&TheSbvector(array)->data[start],
2803                             &TheSbvector(array)->data[end]);
2804  #else
2805   var uintL clen = end-start;
2806  #endif
2807   /* Allocate and fill the result string: */
2808   check_stringsize(clen);
2809   var object string = allocate_string(clen);
2810   if (clen > 0) {
2811     array = STACK_0;
2812     var chart* cptr = &TheSnstring(string)->data[0];
2813     var const uintB* bptr = &TheSbvector(array)->data[start];
2814    #ifdef ENABLE_UNICODE
2815     var const uintB* bendptr = &TheSbvector(array)->data[end];
2816     var chart* cendptr = cptr+clen;
2817     Encoding_mbstowcs(STACK_1)(STACK_1,nullobj,&bptr,bendptr,&cptr,cendptr);
2818     ASSERT(cptr == cendptr);
2819     if ((bptr != bendptr)       /* some bytes were unused! */
2820         && eq(TheEncoding(STACK_1)->enc_towcs_error,S(Kerror)))
2821       error_incomplete(STACK_1);
2822    #else
2823     dotimespL(clen,clen, { *cptr++ = as_chart(*bptr++); } );
2824    #endif
2825   } else if (end != start) /* string is empty, but the vector is not! */
2826     error_incomplete(STACK_1);
2827   VALUES1(string); skipSTACK(2);
2828 }
2829 
2830 /* (CONVERT-STRING-TO-BYTES string encoding [:start] [:end]) */
2831 LISPFUN(convert_string_to_bytes,seclass_read,2,0,norest,key,2,
2832         (kw(start),kw(end)) ) {
2833   /* Stack layout: string, encoding, start, end. */
2834   STACK_2 = check_encoding(STACK_2,DEFAULT_ENC,false);
2835   var object string = STACK_3;
2836   STACK_3 = STACK_2; /* encoding */
2837   STACK_2 = string; /* string */
2838   /* Stack layout: encoding, string, start, end. */
2839   var stringarg sa;
2840   test_string_limits_ro(&sa); /* check string */
2841   string = sa.string;
2842   pushSTACK(string);
2843   /* Stack layout: encoding, string */
2844   var const chart* srcptr;
2845   unpack_sstring_alloca(string,sa.len,sa.offset+sa.index, srcptr=);
2846   var uintL blen = cslen(STACK_1,srcptr,sa.len);
2847   /* Allocate and fill the result vector: */
2848   var object array = allocate_bit_vector(Atype_8Bit,blen);
2849   if (blen > 0) {
2850     string = STACK_0;
2851     unpack_sstring_alloca(string,sa.len,sa.offset+sa.index, srcptr=);
2852     cstombs(STACK_1,srcptr,sa.len,&TheSbvector(array)->data[0],blen);
2853   }
2854   VALUES1(array); skipSTACK(2);
2855 }
2856