1 /* Basic character set support.
2 
3 Copyright (C) 2001-2021 Free Software Foundation, Inc.
4 
5 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6   2005, 2006, 2007, 2008, 2009, 2010, 2011
7   National Institute of Advanced Industrial Science and Technology (AIST)
8   Registration Number H14PRO021
9 
10 Copyright (C) 2003, 2004
11   National Institute of Advanced Industrial Science and Technology (AIST)
12   Registration Number H13PRO009
13 
14 This file is part of GNU Emacs.
15 
16 GNU Emacs is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or (at
19 your option) any later version.
20 
21 GNU Emacs is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24 GNU General Public License for more details.
25 
26 You should have received a copy of the GNU General Public License
27 along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.  */
28 
29 #include <config.h>
30 
31 #include <errno.h>
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <sys/types.h>
36 #include "lisp.h"
37 #include "character.h"
38 #include "charset.h"
39 #include "coding.h"
40 #include "buffer.h"
41 #include "sysstdio.h"
42 #include "pdumper.h"
43 
44 /*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***
45 
46   A coded character set ("charset" hereafter) is a meaningful
47   collection (i.e. language, culture, functionality, etc.) of
48   characters.  Emacs handles multiple charsets at once.  In Emacs Lisp
49   code, a charset is represented by a symbol.  In C code, a charset is
50   represented by its ID number or by a pointer to a struct charset.
51 
52   The actual information about each charset is stored in two places.
53   Lispy information is stored in the hash table Vcharset_hash_table as
54   a vector (charset attributes).  The other information is stored in
55   charset_table as a struct charset.
56 
57 */
58 
59 /* Hash table that contains attributes of each charset.  Keys are
60    charset symbols, and values are vectors of charset attributes.  */
61 Lisp_Object Vcharset_hash_table;
62 
63 /* Table of struct charset.  */
64 struct charset *charset_table;
65 int charset_table_size;
66 int charset_table_used;
67 
68 /* Special charsets corresponding to symbols.  */
69 int charset_ascii;
70 int charset_eight_bit;
71 static int charset_iso_8859_1;
72 int charset_unicode;
73 static int charset_emacs;
74 
75 /* The other special charsets.  */
76 int charset_jisx0201_roman;
77 int charset_jisx0208_1978;
78 int charset_jisx0208;
79 int charset_ksc5601;
80 
81 /* Charset of unibyte characters.  */
82 int charset_unibyte;
83 
84 /* List of charsets ordered by the priority.  */
85 Lisp_Object Vcharset_ordered_list;
86 
87 /* Sub-list of Vcharset_ordered_list that contains all non-preferred
88    charsets.  */
89 Lisp_Object Vcharset_non_preferred_head;
90 
91 /* Incremented every time we change the priority of charsets.
92    Wraps around.  */
93 EMACS_UINT charset_ordered_list_tick;
94 
95 /* List of iso-2022 charsets.  */
96 Lisp_Object Viso_2022_charset_list;
97 
98 /* List of emacs-mule charsets.  */
99 Lisp_Object Vemacs_mule_charset_list;
100 
101 int emacs_mule_charset[256];
102 
103 /* Mapping table from ISO2022's charset (specified by DIMENSION,
104    CHARS, and FINAL-CHAR) to Emacs' charset.  */
105 int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
106 
107 #define CODE_POINT_TO_INDEX(charset, code)				\
108   ((charset)->code_linear_p						\
109    ? (int) ((code) - (charset)->min_code)				\
110    : (((charset)->code_space_mask[(code) >> 24] & 0x8)			\
111       && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4)	\
112       && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2)	\
113       && ((charset)->code_space_mask[(code) & 0xFF] & 0x1))		\
114    ? (int) (((((code) >> 24) - (charset)->code_space[12])		\
115 	     * (charset)->code_space[11])				\
116 	    + (((((code) >> 16) & 0xFF) - (charset)->code_space[8])	\
117 	       * (charset)->code_space[7])				\
118 	    + (((((code) >> 8) & 0xFF) - (charset)->code_space[4])	\
119 	       * (charset)->code_space[3])				\
120 	    + (((code) & 0xFF) - (charset)->code_space[0])		\
121 	    - ((charset)->char_index_offset))				\
122    : -1)
123 
124 
125 /* Return the code-point for the character index IDX in CHARSET.
126    IDX should be an unsigned int variable in a valid range (which is
127    always in nonnegative int range too).  IDX contains garbage afterwards.  */
128 
129 #define INDEX_TO_CODE_POINT(charset, idx)				     \
130   ((charset)->code_linear_p						     \
131    ? (idx) + (charset)->min_code					     \
132    : (idx += (charset)->char_index_offset,				     \
133       (((charset)->code_space[0] + (idx) % (charset)->code_space[2])	     \
134        | (((charset)->code_space[4]					     \
135 	   + ((idx) / (charset)->code_space[3] % (charset)->code_space[6]))  \
136 	  << 8)								     \
137        | (((charset)->code_space[8]					     \
138 	   + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
139 	  << 16)							     \
140        | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11]))  \
141 	  << 24))))
142 
143 /* Structure to hold mapping tables for a charset.  Used by temacs
144    invoked for dumping.  */
145 
146 static struct
147 {
148   /* The current charset for which the following tables are setup.  */
149   struct charset *current;
150 
151   /* 1 iff the following table is used for encoder.  */
152   short for_encoder;
153 
154   /* When the following table is used for encoding, minimum and
155      maximum character of the current charset.  */
156   int min_char, max_char;
157 
158   /* A Unicode character corresponding to the code index 0 (i.e. the
159      minimum code-point) of the current charset, or -1 if the code
160      index 0 is not a Unicode character.  This is checked when
161      table.encoder[CHAR] is zero.  */
162   int zero_index_char;
163 
164   union {
165     /* Table mapping code-indices (not code-points) of the current
166        charset to Unicode characters.  If decoder[CHAR] is -1, CHAR
167        doesn't belong to the current charset.  */
168     int decoder[0x10000];
169     /* Table mapping Unicode characters to code-indices of the current
170        charset.  The first 0x10000 elements are for BMP (0..0xFFFF),
171        and the last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
172        (0x20000..0x2FFFF).  Note that there is no charset map that
173        uses both SMP and SIP.  */
174     unsigned short encoder[0x20000];
175   } table;
176 } *temp_charset_work;
177 
178 #define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE)			\
179   do {								\
180     if ((CODE) == 0)						\
181       temp_charset_work->zero_index_char = (C);			\
182     else if ((C) < 0x20000)					\
183       temp_charset_work->table.encoder[(C)] = (CODE);		\
184     else							\
185       temp_charset_work->table.encoder[(C) - 0x10000] = (CODE);	\
186   } while (0)
187 
188 #define GET_TEMP_CHARSET_WORK_ENCODER(C)				  \
189   ((C) == temp_charset_work->zero_index_char ? 0			  \
190    : (C) < 0x20000 ? (temp_charset_work->table.encoder[(C)]		  \
191 		      ? (int) temp_charset_work->table.encoder[(C)] : -1) \
192    : temp_charset_work->table.encoder[(C) - 0x10000]			  \
193    ? temp_charset_work->table.encoder[(C) - 0x10000] : -1)
194 
195 #define SET_TEMP_CHARSET_WORK_DECODER(C, CODE)	\
196   (temp_charset_work->table.decoder[(CODE)] = (C))
197 
198 #define GET_TEMP_CHARSET_WORK_DECODER(CODE)	\
199   (temp_charset_work->table.decoder[(CODE)])
200 
201 
202 /* Set to 1 to warn that a charset map is loaded and thus a buffer
203    text and a string data may be relocated.  */
204 bool charset_map_loaded;
205 
206 struct charset_map_entries
207 {
208   struct {
209     unsigned from, to;
210     int c;
211   } entry[0x10000];
212   struct charset_map_entries *next;
213 };
214 
215 /* Load the mapping information of CHARSET from ENTRIES for
216    initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
217    encoding (CONTROL_FLAG == 2).
218 
219    If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
220    and CHARSET->fast_map.
221 
222    If CONTROL_FLAG is 1, setup the following tables according to
223    CHARSET->method and inhibit_load_charset_map.
224 
225    CHARSET->method       | inhibit_lcm == 0   | inhibit_lcm == 1
226    ----------------------+--------------------+---------------------------
227    CHARSET_METHOD_MAP    | CHARSET->decoder   | temp_charset_work->decoder
228    ----------------------+--------------------+---------------------------
229    CHARSET_METHOD_OFFSET | Vchar_unify_table  | temp_charset_work->decoder
230 
231    If CONTROL_FLAG is 2, setup the following tables.
232 
233    CHARSET->method       | inhibit_lcm == 0   | inhibit_lcm == 1
234    ----------------------+--------------------+---------------------------
235    CHARSET_METHOD_MAP    | CHARSET->encoder   | temp_charset_work->encoder
236    ----------------------+--------------------+--------------------------
237    CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
238 */
239 
240 static void
load_charset_map(struct charset * charset,struct charset_map_entries * entries,int n_entries,int control_flag)241 load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
242 {
243   Lisp_Object vec UNINIT;
244   Lisp_Object table UNINIT;
245   unsigned max_code = CHARSET_MAX_CODE (charset);
246   bool ascii_compatible_p = charset->ascii_compatible_p;
247   int min_char, max_char, nonascii_min_char;
248   int i;
249   unsigned char *fast_map = charset->fast_map;
250 
251   if (n_entries <= 0)
252     return;
253 
254   if (control_flag)
255     {
256       if (! inhibit_load_charset_map)
257 	{
258 	  if (control_flag == 1)
259 	    {
260 	      if (charset->method == CHARSET_METHOD_MAP)
261 		{
262 		  int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
263 
264 		  vec = make_vector (n, make_fixnum (-1));
265 		  set_charset_attr (charset, charset_decoder, vec);
266 		}
267 	      else
268 		{
269 		  char_table_set_range (Vchar_unify_table,
270 					charset->min_char, charset->max_char,
271 					Qnil);
272 		}
273 	    }
274 	  else
275 	    {
276 	      table = Fmake_char_table (Qnil, Qnil);
277 	      set_charset_attr (charset,
278 				(charset->method == CHARSET_METHOD_MAP
279 				 ? charset_encoder : charset_deunifier),
280 				table);
281 	    }
282 	}
283       else
284 	{
285 	  if (! temp_charset_work)
286 	    temp_charset_work = xmalloc (sizeof *temp_charset_work);
287 	  if (control_flag == 1)
288 	    {
289 	      memset (temp_charset_work->table.decoder, -1,
290 		      sizeof (int) * 0x10000);
291 	    }
292 	  else
293 	    {
294 	      memset (temp_charset_work->table.encoder, 0,
295 		      sizeof (unsigned short) * 0x20000);
296 	      temp_charset_work->zero_index_char = -1;
297 	    }
298 	  temp_charset_work->current = charset;
299 	  temp_charset_work->for_encoder = (control_flag == 2);
300 	  control_flag += 2;
301 	}
302       charset_map_loaded = 1;
303     }
304 
305   min_char = max_char = entries->entry[0].c;
306   nonascii_min_char = MAX_CHAR;
307   for (i = 0; i < n_entries; i++)
308     {
309       unsigned from, to;
310       int from_index, to_index, lim_index;
311       int from_c, to_c;
312       int idx = i % 0x10000;
313 
314       if (i > 0 && idx == 0)
315 	entries = entries->next;
316       from = entries->entry[idx].from;
317       to = entries->entry[idx].to;
318       from_c = entries->entry[idx].c;
319       from_index = CODE_POINT_TO_INDEX (charset, from);
320       if (from == to)
321 	{
322 	  to_index = from_index;
323 	  to_c = from_c;
324 	}
325       else
326 	{
327 	  to_index = CODE_POINT_TO_INDEX (charset, to);
328 	  to_c = from_c + (to_index - from_index);
329 	}
330       if (from_index < 0 || to_index < 0)
331 	continue;
332       lim_index = to_index + 1;
333 
334       if (to_c > max_char)
335 	max_char = to_c;
336       else if (from_c < min_char)
337 	min_char = from_c;
338 
339       if (control_flag == 1)
340 	{
341 	  if (charset->method == CHARSET_METHOD_MAP)
342 	    for (; from_index < lim_index; from_index++, from_c++)
343 	      ASET (vec, from_index, make_fixnum (from_c));
344 	  else
345 	    for (; from_index < lim_index; from_index++, from_c++)
346 	      CHAR_TABLE_SET (Vchar_unify_table,
347 			      CHARSET_CODE_OFFSET (charset) + from_index,
348 			      make_fixnum (from_c));
349 	}
350       else if (control_flag == 2)
351 	{
352 	  if (charset->method == CHARSET_METHOD_MAP
353 	      && CHARSET_COMPACT_CODES_P (charset))
354 	    for (; from_index < lim_index; from_index++, from_c++)
355 	      {
356 		unsigned code = from_index;
357 		code = INDEX_TO_CODE_POINT (charset, code);
358 
359 		if (NILP (CHAR_TABLE_REF (table, from_c)))
360 		  CHAR_TABLE_SET (table, from_c, make_fixnum (code));
361 	      }
362 	  else
363 	    for (; from_index < lim_index; from_index++, from_c++)
364 	      {
365 		if (NILP (CHAR_TABLE_REF (table, from_c)))
366 		  CHAR_TABLE_SET (table, from_c, make_fixnum (from_index));
367 	      }
368 	}
369       else if (control_flag == 3)
370 	for (; from_index < lim_index; from_index++, from_c++)
371 	  SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
372       else if (control_flag == 4)
373 	for (; from_index < lim_index; from_index++, from_c++)
374 	  SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
375       else			/* control_flag == 0 */
376 	{
377 	  if (ascii_compatible_p)
378 	    {
379 	      if (! ASCII_CHAR_P (from_c))
380 		{
381 		  if (from_c < nonascii_min_char)
382 		    nonascii_min_char = from_c;
383 		}
384 	      else if (! ASCII_CHAR_P (to_c))
385 		{
386 		  nonascii_min_char = 0x80;
387 		}
388 	    }
389 
390 	  for (; from_c <= to_c; from_c++)
391 	    CHARSET_FAST_MAP_SET (from_c, fast_map);
392 	}
393     }
394 
395   if (control_flag == 0)
396     {
397       CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
398 				    ? nonascii_min_char : min_char);
399       CHARSET_MAX_CHAR (charset) = max_char;
400     }
401   else if (control_flag == 4)
402     {
403       temp_charset_work->min_char = min_char;
404       temp_charset_work->max_char = max_char;
405     }
406 }
407 
408 
409 /* Read a hexadecimal number (preceded by "0x") from the file FP while
410    paying attention to comment character '#'.  LOOKAHEAD is the
411    lookahead byte if it is nonnegative.  Store into *TERMINATOR the
412    input byte after the number, or EOF if an end-of-file or input
413    error occurred.  Set *OVERFLOW if the number overflows.  */
414 
415 static unsigned
read_hex(FILE * fp,int lookahead,int * terminator,bool * overflow)416 read_hex (FILE *fp, int lookahead, int *terminator, bool *overflow)
417 {
418   int c = lookahead < 0 ? getc (fp) : lookahead;
419 
420   while (true)
421     {
422       if (c == '#')
423 	do
424 	  c = getc (fp);
425 	while (0 <= c && c != '\n');
426       else if (c == '0')
427 	{
428 	  c = getc (fp);
429 	  if (c < 0 || c == 'x')
430 	    break;
431 	}
432       if (c < 0)
433 	break;
434       c = getc (fp);
435     }
436 
437   unsigned n = 0;
438   bool v = false;
439 
440   if (0 <= c)
441     while (true)
442       {
443 	c = getc (fp);
444 	int digit = char_hexdigit (c);
445 	if (digit < 0)
446 	  break;
447 	v |= INT_LEFT_SHIFT_OVERFLOW (n, 4);
448 	n = (n << 4) + digit;
449       }
450 
451   *terminator = c;
452   *overflow |= v;
453   return n;
454 }
455 
456 /* Return a mapping vector for CHARSET loaded from MAPFILE.
457    Each line of MAPFILE has this form
458 	0xAAAA 0xCCCC
459    where 0xAAAA is a code-point and 0xCCCC is the corresponding
460    character code, or this form
461 	0xAAAA-0xBBBB 0xCCCC
462    where 0xAAAA and 0xBBBB are code-points specifying a range, and
463    0xCCCC is the first character code of the range.
464 
465    The returned vector has this form:
466 	[ CODE1 CHAR1 CODE2 CHAR2 .... ]
467    where CODE1 is a code-point or a cons of code-points specifying a
468    range.
469 
470    Note that this function uses `openp' to open MAPFILE but ignores
471    `file-name-handler-alist' to avoid running any Lisp code.  */
472 
473 static void
load_charset_map_from_file(struct charset * charset,Lisp_Object mapfile,int control_flag)474 load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
475 			    int control_flag)
476 {
477   unsigned min_code = CHARSET_MIN_CODE (charset);
478   unsigned max_code = CHARSET_MAX_CODE (charset);
479   int fd;
480   FILE *fp;
481   struct charset_map_entries *head, *entries;
482   int n_entries;
483   AUTO_STRING (map, ".map");
484   AUTO_STRING (txt, ".txt");
485   AUTO_LIST2 (suffixes, map, txt);
486   ptrdiff_t count = SPECPDL_INDEX ();
487   record_unwind_protect_nothing ();
488   specbind (Qfile_name_handler_alist, Qnil);
489   fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false, false);
490   fp = fd < 0 ? 0 : fdopen (fd, "r");
491   if (!fp)
492     {
493       int open_errno = errno;
494       emacs_close (fd);
495       report_file_errno ("Loading charset map", mapfile, open_errno);
496     }
497   set_unwind_protect_ptr (count, fclose_unwind, fp);
498   unbind_to (count + 1, Qnil);
499 
500   /* Use record_xmalloc, as `charset_map_entries' is
501      large (larger than MAX_ALLOCA).  */
502   head = record_xmalloc (sizeof *head);
503   entries = head;
504   memset (entries, 0, sizeof (struct charset_map_entries));
505 
506   n_entries = 0;
507   int ch = -1;
508   while (true)
509     {
510       bool overflow = false;
511       unsigned from = read_hex (fp, ch, &ch, &overflow), to;
512       if (ch < 0)
513 	break;
514       if (ch == '-')
515 	{
516 	  to = read_hex (fp, -1, &ch, &overflow);
517 	  if (ch < 0)
518 	    break;
519 	}
520       else
521 	{
522 	  to = from;
523 	  ch = -1;
524 	}
525       unsigned c = read_hex (fp, ch, &ch, &overflow);
526       if (ch < 0)
527 	break;
528 
529       if (overflow)
530 	continue;
531       if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
532 	continue;
533 
534       if (n_entries == 0x10000)
535 	{
536 	  entries->next = record_xmalloc (sizeof *entries->next);
537 	  entries = entries->next;
538 	  memset (entries, 0, sizeof (struct charset_map_entries));
539 	  n_entries = 0;
540 	}
541       int idx = n_entries;
542       entries->entry[idx].from = from;
543       entries->entry[idx].to = to;
544       entries->entry[idx].c = c;
545       n_entries++;
546     }
547   fclose (fp);
548   clear_unwind_protect (count);
549 
550   load_charset_map (charset, head, n_entries, control_flag);
551   unbind_to (count, Qnil);
552 }
553 
554 static void
load_charset_map_from_vector(struct charset * charset,Lisp_Object vec,int control_flag)555 load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
556 {
557   unsigned min_code = CHARSET_MIN_CODE (charset);
558   unsigned max_code = CHARSET_MAX_CODE (charset);
559   struct charset_map_entries *head, *entries;
560   int n_entries;
561   int len = ASIZE (vec);
562   int i;
563   USE_SAFE_ALLOCA;
564 
565   if (len % 2 == 1)
566     {
567       add_to_log ("Failure in loading charset map: %V", vec);
568       return;
569     }
570 
571   /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
572      large (larger than MAX_ALLOCA).  */
573   head = SAFE_ALLOCA (sizeof *head);
574   entries = head;
575   memset (entries, 0, sizeof (struct charset_map_entries));
576 
577   n_entries = 0;
578   for (i = 0; i < len; i += 2)
579     {
580       Lisp_Object val, val2;
581       unsigned from, to;
582       EMACS_INT c;
583       int idx;
584 
585       val = AREF (vec, i);
586       if (CONSP (val))
587 	{
588 	  val2 = XCDR (val);
589 	  val = XCAR (val);
590 	  from = XFIXNAT (val);
591 	  to = XFIXNAT (val2);
592 	}
593       else
594 	from = to = XFIXNAT (val);
595       val = AREF (vec, i + 1);
596       CHECK_FIXNAT (val);
597       c = XFIXNAT (val);
598 
599       if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
600 	continue;
601 
602       if (n_entries > 0 && (n_entries % 0x10000) == 0)
603 	{
604 	  entries->next = SAFE_ALLOCA (sizeof *entries->next);
605 	  entries = entries->next;
606 	  memset (entries, 0, sizeof (struct charset_map_entries));
607 	}
608       idx = n_entries % 0x10000;
609       entries->entry[idx].from = from;
610       entries->entry[idx].to = to;
611       entries->entry[idx].c = c;
612       n_entries++;
613     }
614 
615   load_charset_map (charset, head, n_entries, control_flag);
616   SAFE_FREE ();
617 }
618 
619 
620 /* Load a mapping table for CHARSET.  CONTROL-FLAG tells what kind of
621    map it is (see the comment of load_charset_map for the detail).  */
622 
623 static void
load_charset(struct charset * charset,int control_flag)624 load_charset (struct charset *charset, int control_flag)
625 {
626   Lisp_Object map;
627 
628   if (inhibit_load_charset_map
629       && temp_charset_work
630       && charset == temp_charset_work->current
631       && ((control_flag == 2) == temp_charset_work->for_encoder))
632     return;
633 
634   if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
635     map = CHARSET_MAP (charset);
636   else
637     {
638       if (! CHARSET_UNIFIED_P (charset))
639 	emacs_abort ();
640       map = CHARSET_UNIFY_MAP (charset);
641     }
642   if (STRINGP (map))
643     load_charset_map_from_file (charset, map, control_flag);
644   else
645     load_charset_map_from_vector (charset, map, control_flag);
646 }
647 
648 
649 DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
650        doc: /* Return non-nil if and only if OBJECT is a charset.*/)
651   (Lisp_Object object)
652 {
653   return (CHARSETP (object) ? Qt : Qnil);
654 }
655 
656 
657 static void
map_charset_for_dump(void (* c_function)(Lisp_Object,Lisp_Object),Lisp_Object function,Lisp_Object arg,unsigned int from,unsigned int to)658 map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
659 		      Lisp_Object function, Lisp_Object arg,
660 		      unsigned int from, unsigned int to)
661 {
662   int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
663   int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
664   Lisp_Object range = Fcons (Qnil, Qnil);
665   int c, stop;
666 
667   c = temp_charset_work->min_char;
668   stop = (temp_charset_work->max_char < 0x20000
669 	  ? temp_charset_work->max_char : 0xFFFF);
670 
671   while (1)
672     {
673       int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);
674 
675       if (idx >= from_idx && idx <= to_idx)
676 	{
677 	  if (NILP (XCAR (range)))
678 	    XSETCAR (range, make_fixnum (c));
679 	}
680       else if (! NILP (XCAR (range)))
681 	{
682 	  XSETCDR (range, make_fixnum (c - 1));
683 	  if (c_function)
684 	    (*c_function) (arg, range);
685 	  else
686 	    call2 (function, range, arg);
687 	  XSETCAR (range, Qnil);
688 	}
689       if (c == stop)
690 	{
691 	  if (c == temp_charset_work->max_char)
692 	    {
693 	      if (! NILP (XCAR (range)))
694 		{
695 		  XSETCDR (range, make_fixnum (c));
696 		  if (c_function)
697 		    (*c_function) (arg, range);
698 		  else
699 		    call2 (function, range, arg);
700 		}
701 	      break;
702 	    }
703 	  c = 0x1FFFF;
704 	  stop = temp_charset_work->max_char;
705 	}
706       c++;
707     }
708 }
709 
710 void
map_charset_chars(void (* c_function)(Lisp_Object,Lisp_Object),Lisp_Object function,Lisp_Object arg,struct charset * charset,unsigned from,unsigned to)711 map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
712 		   Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
713 {
714   Lisp_Object range;
715   bool partial = (from > CHARSET_MIN_CODE (charset)
716 		  || to < CHARSET_MAX_CODE (charset));
717 
718   if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
719     {
720       int from_idx = CODE_POINT_TO_INDEX (charset, from);
721       int to_idx = CODE_POINT_TO_INDEX (charset, to);
722       int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
723       int to_c = to_idx + CHARSET_CODE_OFFSET (charset);
724 
725       if (CHARSET_UNIFIED_P (charset))
726 	{
727 	  if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
728 	    load_charset (charset, 2);
729 	  if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
730 	    map_char_table_for_charset (c_function, function,
731 					CHARSET_DEUNIFIER (charset), arg,
732 					partial ? charset : NULL, from, to);
733 	  else
734 	    map_charset_for_dump (c_function, function, arg, from, to);
735 	}
736 
737       range = Fcons (make_fixnum (from_c), make_fixnum (to_c));
738       if (NILP (function))
739 	(*c_function) (arg, range);
740       else
741 	call2 (function, range, arg);
742     }
743   else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
744     {
745       if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
746 	load_charset (charset, 2);
747       if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
748 	map_char_table_for_charset (c_function, function,
749 				    CHARSET_ENCODER (charset), arg,
750 				    partial ? charset : NULL, from, to);
751       else
752 	map_charset_for_dump (c_function, function, arg, from, to);
753     }
754   else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
755     {
756       Lisp_Object subset_info;
757       int offset;
758 
759       subset_info = CHARSET_SUBSET (charset);
760       charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
761       offset = XFIXNUM (AREF (subset_info, 3));
762       from -= offset;
763       if (from < XFIXNAT (AREF (subset_info, 1)))
764 	from = XFIXNAT (AREF (subset_info, 1));
765       to -= offset;
766       if (to > XFIXNAT (AREF (subset_info, 2)))
767 	to = XFIXNAT (AREF (subset_info, 2));
768       map_charset_chars (c_function, function, arg, charset, from, to);
769     }
770   else				/* i.e. CHARSET_METHOD_SUPERSET */
771     {
772       Lisp_Object parents;
773 
774       for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
775 	   parents = XCDR (parents))
776 	{
777 	  int offset;
778 	  unsigned this_from, this_to;
779 
780 	  charset = CHARSET_FROM_ID (XFIXNAT (XCAR (XCAR (parents))));
781 	  offset = XFIXNUM (XCDR (XCAR (parents)));
782 	  this_from = from > offset ? from - offset : 0;
783 	  this_to = to > offset ? to - offset : 0;
784 	  if (this_from < CHARSET_MIN_CODE (charset))
785 	    this_from = CHARSET_MIN_CODE (charset);
786 	  if (this_to > CHARSET_MAX_CODE (charset))
787 	    this_to = CHARSET_MAX_CODE (charset);
788 	  map_charset_chars (c_function, function, arg, charset,
789 			     this_from, this_to);
790 	}
791     }
792 }
793 
794 DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
795        doc: /* Call FUNCTION for all characters in CHARSET.
796 FUNCTION is called with an argument RANGE and the optional 3rd
797 argument ARG.
798 
799 RANGE is a cons (FROM .  TO), where FROM and TO indicate a range of
800 characters contained in CHARSET.
801 
802 The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
803 range of code points (in CHARSET) of target characters.  Note that
804 these are not character codes, but code points in CHARSET; for the
805 difference see `decode-char' and `list-charset-chars'.  */)
806   (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
807 {
808   struct charset *cs;
809   unsigned from, to;
810 
811   CHECK_CHARSET_GET_CHARSET (charset, cs);
812   if (NILP (from_code))
813     from = CHARSET_MIN_CODE (cs);
814   else
815     {
816       from = XFIXNUM (from_code);
817       if (from < CHARSET_MIN_CODE (cs))
818 	from = CHARSET_MIN_CODE (cs);
819     }
820   if (NILP (to_code))
821     to = CHARSET_MAX_CODE (cs);
822   else
823     {
824       to = XFIXNUM (to_code);
825       if (to > CHARSET_MAX_CODE (cs))
826 	to = CHARSET_MAX_CODE (cs);
827     }
828   map_charset_chars (NULL, function, arg, cs, from, to);
829   return Qnil;
830 }
831 
832 
833 /* Define a charset according to the arguments.  The Nth argument is
834    the Nth attribute of the charset (the last attribute `charset-id'
835    is not included).  See the docstring of `define-charset' for the
836    detail.  */
837 
838 DEFUN ("define-charset-internal", Fdefine_charset_internal,
839        Sdefine_charset_internal, charset_arg_max, MANY, 0,
840        doc: /* For internal use only.
841 usage: (define-charset-internal ...)  */)
842   (ptrdiff_t nargs, Lisp_Object *args)
843 {
844   /* Charset attr vector.  */
845   Lisp_Object attrs;
846   Lisp_Object val;
847   Lisp_Object hash_code;
848   struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
849   int i, j;
850   struct charset charset;
851   int id;
852   int dimension;
853   bool new_definition_p;
854   int nchars;
855 
856   memset (&charset, 0, sizeof (charset));
857 
858   if (nargs != charset_arg_max)
859     Fsignal (Qwrong_number_of_arguments,
860 	     Fcons (intern ("define-charset-internal"),
861 		    make_fixnum (nargs)));
862 
863   attrs = make_nil_vector (charset_attr_max);
864 
865   CHECK_SYMBOL (args[charset_arg_name]);
866   ASET (attrs, charset_name, args[charset_arg_name]);
867 
868   val = args[charset_arg_code_space];
869   for (i = 0, dimension = 0, nchars = 1; ; i++)
870     {
871       Lisp_Object min_byte_obj = Faref (val, make_fixnum (i * 2));
872       Lisp_Object max_byte_obj = Faref (val, make_fixnum (i * 2 + 1));
873       int min_byte = check_integer_range (min_byte_obj, 0, 255);
874       int max_byte = check_integer_range (max_byte_obj, min_byte, 255);
875       charset.code_space[i * 4] = min_byte;
876       charset.code_space[i * 4 + 1] = max_byte;
877       charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
878       if (max_byte > 0)
879 	dimension = i + 1;
880       if (i == 3)
881 	break;
882       nchars *= charset.code_space[i * 4 + 2];
883       charset.code_space[i * 4 + 3] = nchars;
884     }
885 
886   val = args[charset_arg_dimension];
887   charset.dimension
888     = !NILP (val) ? check_integer_range (val, 1, 4) : dimension;
889 
890   charset.code_linear_p
891     = (charset.dimension == 1
892        || (charset.code_space[2] == 256
893 	   && (charset.dimension == 2
894 	       || (charset.code_space[6] == 256
895 		   && (charset.dimension == 3
896 		       || charset.code_space[10] == 256)))));
897 
898   if (! charset.code_linear_p)
899     {
900       charset.code_space_mask = xzalloc (256);
901       for (i = 0; i < 4; i++)
902 	for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
903 	     j++)
904 	  charset.code_space_mask[j] |= (1 << i);
905     }
906 
907   charset.iso_chars_96 = charset.code_space[2] == 96;
908 
909   charset.min_code = (charset.code_space[0]
910 		      | (charset.code_space[4] << 8)
911 		      | (charset.code_space[8] << 16)
912 		      | ((unsigned) charset.code_space[12] << 24));
913   charset.max_code = (charset.code_space[1]
914 		      | (charset.code_space[5] << 8)
915 		      | (charset.code_space[9] << 16)
916 		      | ((unsigned) charset.code_space[13] << 24));
917   charset.char_index_offset = 0;
918 
919   val = args[charset_arg_min_code];
920   if (! NILP (val))
921     {
922       unsigned code = cons_to_unsigned (val, UINT_MAX);
923 
924       if (code < charset.min_code
925 	  || code > charset.max_code)
926 	args_out_of_range_3 (INT_TO_INTEGER (charset.min_code),
927 			     INT_TO_INTEGER (charset.max_code), val);
928       charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
929       charset.min_code = code;
930     }
931 
932   val = args[charset_arg_max_code];
933   if (! NILP (val))
934     {
935       unsigned code = cons_to_unsigned (val, UINT_MAX);
936 
937       if (code < charset.min_code
938 	  || code > charset.max_code)
939 	args_out_of_range_3 (INT_TO_INTEGER (charset.min_code),
940 			     INT_TO_INTEGER (charset.max_code), val);
941       charset.max_code = code;
942     }
943 
944   charset.compact_codes_p = charset.max_code < 0x10000;
945 
946   val = args[charset_arg_invalid_code];
947   if (NILP (val))
948     {
949       if (charset.min_code > 0)
950 	charset.invalid_code = 0;
951       else
952 	{
953 	  if (charset.max_code < UINT_MAX)
954 	    charset.invalid_code = charset.max_code + 1;
955 	  else
956 	    error ("Attribute :invalid-code must be specified");
957 	}
958     }
959   else
960     charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
961 
962   val = args[charset_arg_iso_final];
963   if (NILP (val))
964     charset.iso_final = -1;
965   else
966     {
967       CHECK_FIXNUM (val);
968       if (XFIXNUM (val) < '0' || XFIXNUM (val) > 127)
969 	error ("Invalid iso-final-char: %"pI"d", XFIXNUM (val));
970       charset.iso_final = XFIXNUM (val);
971     }
972 
973   val = args[charset_arg_iso_revision];
974   charset.iso_revision = !NILP (val) ? check_integer_range (val, -1, 63) : -1;
975 
976   val = args[charset_arg_emacs_mule_id];
977   if (NILP (val))
978     charset.emacs_mule_id = -1;
979   else
980     {
981       CHECK_FIXNAT (val);
982       if ((XFIXNUM (val) > 0 && XFIXNUM (val) <= 128) || XFIXNUM (val) >= 256)
983 	error ("Invalid emacs-mule-id: %"pI"d", XFIXNUM (val));
984       charset.emacs_mule_id = XFIXNUM (val);
985     }
986 
987   charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
988 
989   charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
990 
991   charset.unified_p = 0;
992 
993   memset (charset.fast_map, 0, sizeof (charset.fast_map));
994 
995   if (! NILP (args[charset_arg_code_offset]))
996     {
997       val = args[charset_arg_code_offset];
998       CHECK_CHARACTER (val);
999 
1000       charset.method = CHARSET_METHOD_OFFSET;
1001       charset.code_offset = XFIXNUM (val);
1002 
1003       i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
1004       if (MAX_CHAR - charset.code_offset < i)
1005 	error ("Unsupported max char: %d", charset.max_char);
1006       charset.max_char = i + charset.code_offset;
1007       i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
1008       charset.min_char = i + charset.code_offset;
1009 
1010       i = (charset.min_char >> 7) << 7;
1011       for (; i < 0x10000 && i <= charset.max_char; i += 128)
1012 	CHARSET_FAST_MAP_SET (i, charset.fast_map);
1013       i = (i >> 12) << 12;
1014       for (; i <= charset.max_char; i += 0x1000)
1015 	CHARSET_FAST_MAP_SET (i, charset.fast_map);
1016       if (charset.code_offset == 0 && charset.max_char >= 0x80)
1017 	charset.ascii_compatible_p = 1;
1018     }
1019   else if (! NILP (args[charset_arg_map]))
1020     {
1021       val = args[charset_arg_map];
1022       ASET (attrs, charset_map, val);
1023       charset.method = CHARSET_METHOD_MAP;
1024     }
1025   else if (! NILP (args[charset_arg_subset]))
1026     {
1027       Lisp_Object parent;
1028       Lisp_Object parent_min_code, parent_max_code, parent_code_offset;
1029       struct charset *parent_charset;
1030 
1031       val = args[charset_arg_subset];
1032       parent = Fcar (val);
1033       CHECK_CHARSET_GET_CHARSET (parent, parent_charset);
1034       parent_min_code = Fnth (make_fixnum (1), val);
1035       CHECK_FIXNAT (parent_min_code);
1036       parent_max_code = Fnth (make_fixnum (2), val);
1037       CHECK_FIXNAT (parent_max_code);
1038       parent_code_offset = Fnth (make_fixnum (3), val);
1039       CHECK_FIXNUM (parent_code_offset);
1040       ASET (attrs, charset_subset,
1041 	    CALLN (Fvector, make_fixnum (parent_charset->id),
1042 		   parent_min_code, parent_max_code, parent_code_offset));
1043 
1044       charset.method = CHARSET_METHOD_SUBSET;
1045       /* Here, we just copy the parent's fast_map.  It's not accurate,
1046 	 but at least it works for quickly detecting which character
1047 	 DOESN'T belong to this charset.  */
1048       memcpy (charset.fast_map, parent_charset->fast_map,
1049 	      sizeof charset.fast_map);
1050 
1051       /* We also copy these for parents.  */
1052       charset.min_char = parent_charset->min_char;
1053       charset.max_char = parent_charset->max_char;
1054     }
1055   else if (! NILP (args[charset_arg_superset]))
1056     {
1057       val = args[charset_arg_superset];
1058       charset.method = CHARSET_METHOD_SUPERSET;
1059       val = Fcopy_sequence (val);
1060       ASET (attrs, charset_superset, val);
1061 
1062       charset.min_char = MAX_CHAR;
1063       charset.max_char = 0;
1064       for (; ! NILP (val); val = Fcdr (val))
1065 	{
1066 	  Lisp_Object elt, car_part, cdr_part;
1067 	  int this_id, offset;
1068 	  struct charset *this_charset;
1069 
1070 	  elt = Fcar (val);
1071 	  if (CONSP (elt))
1072 	    {
1073 	      car_part = XCAR (elt);
1074 	      cdr_part = XCDR (elt);
1075 	      CHECK_CHARSET_GET_ID (car_part, this_id);
1076 	      offset = check_integer_range (cdr_part, INT_MIN, INT_MAX);
1077 	    }
1078 	  else
1079 	    {
1080 	      CHECK_CHARSET_GET_ID (elt, this_id);
1081 	      offset = 0;
1082 	    }
1083 	  XSETCAR (val, Fcons (make_fixnum (this_id), make_fixnum (offset)));
1084 
1085 	  this_charset = CHARSET_FROM_ID (this_id);
1086 	  if (charset.min_char > this_charset->min_char)
1087 	    charset.min_char = this_charset->min_char;
1088 	  if (charset.max_char < this_charset->max_char)
1089 	    charset.max_char = this_charset->max_char;
1090 	  for (i = 0; i < 190; i++)
1091 	    charset.fast_map[i] |= this_charset->fast_map[i];
1092 	}
1093     }
1094   else
1095     error ("None of :code-offset, :map, :parents are specified");
1096 
1097   val = args[charset_arg_unify_map];
1098   if (! NILP (val) && !STRINGP (val))
1099     CHECK_VECTOR (val);
1100   ASET (attrs, charset_unify_map, val);
1101 
1102   CHECK_LIST (args[charset_arg_plist]);
1103   ASET (attrs, charset_plist, args[charset_arg_plist]);
1104 
1105   charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
1106 				    &hash_code);
1107   if (charset.hash_index >= 0)
1108     {
1109       new_definition_p = 0;
1110       id = XFIXNAT (CHARSET_SYMBOL_ID (args[charset_arg_name]));
1111       set_hash_value_slot (hash_table, charset.hash_index, attrs);
1112     }
1113   else
1114     {
1115       charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
1116 				     hash_code);
1117       if (charset_table_used == charset_table_size)
1118 	{
1119 	  /* Ensure that charset IDs fit into 'int' as well as into the
1120 	     restriction imposed by fixnums.  Although the 'int' restriction
1121 	     could be removed, too much other code would need altering; for
1122 	     example, the IDs are stuffed into struct
1123 	     coding_system.charbuf[i] entries, which are 'int'.  */
1124 	  int old_size = charset_table_size;
1125 	  ptrdiff_t new_size = old_size;
1126 	  struct charset *new_table =
1127 	    xpalloc (0, &new_size, 1,
1128 		     min (INT_MAX, MOST_POSITIVE_FIXNUM),
1129                      sizeof *charset_table);
1130           memcpy (new_table, charset_table, old_size * sizeof *new_table);
1131           charset_table = new_table;
1132 	  charset_table_size = new_size;
1133 	  /* FIXME: This leaks memory, as the old charset_table becomes
1134 	     unreachable.  If the old charset table is charset_table_init
1135 	     then this leak is intentional; otherwise, it's unclear.
1136 	     If the latter memory leak is intentional, a
1137 	     comment should be added to explain this.  If not, the old
1138 	     charset_table should be freed, by passing it as the 1st argument
1139 	     to xpalloc and removing the memcpy.  */
1140 	}
1141       id = charset_table_used++;
1142       new_definition_p = 1;
1143     }
1144 
1145   ASET (attrs, charset_id, make_fixnum (id));
1146   charset.id = id;
1147   charset_table[id] = charset;
1148 
1149   if (charset.method == CHARSET_METHOD_MAP)
1150     {
1151       load_charset (&charset, 0);
1152       charset_table[id] = charset;
1153     }
1154 
1155   if (charset.iso_final >= 0)
1156     {
1157       ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
1158 			 charset.iso_final) = id;
1159       if (new_definition_p)
1160 	Viso_2022_charset_list = nconc2 (Viso_2022_charset_list, list1i (id));
1161       if (ISO_CHARSET_TABLE (1, 0, 'J') == id)
1162 	charset_jisx0201_roman = id;
1163       else if (ISO_CHARSET_TABLE (2, 0, '@') == id)
1164 	charset_jisx0208_1978 = id;
1165       else if (ISO_CHARSET_TABLE (2, 0, 'B') == id)
1166 	charset_jisx0208 = id;
1167       else if (ISO_CHARSET_TABLE (2, 0, 'C') == id)
1168 	charset_ksc5601 = id;
1169     }
1170 
1171   if (charset.emacs_mule_id >= 0)
1172     {
1173       emacs_mule_charset[charset.emacs_mule_id] = id;
1174       if (charset.emacs_mule_id < 0xA0)
1175 	emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 1;
1176       else
1177 	emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 2;
1178       if (new_definition_p)
1179 	Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
1180 					   list1i (id));
1181     }
1182 
1183   if (new_definition_p)
1184     {
1185       Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
1186       if (charset.supplementary_p)
1187 	Vcharset_ordered_list = nconc2 (Vcharset_ordered_list, list1i (id));
1188       else
1189 	{
1190 	  Lisp_Object tail;
1191 
1192 	  for (tail = Vcharset_ordered_list; CONSP (tail); tail = XCDR (tail))
1193 	    {
1194 	      struct charset *cs = CHARSET_FROM_ID (XFIXNUM (XCAR (tail)));
1195 
1196 	      if (cs->supplementary_p)
1197 		break;
1198 	    }
1199 	  if (EQ (tail, Vcharset_ordered_list))
1200 	    Vcharset_ordered_list = Fcons (make_fixnum (id),
1201 					   Vcharset_ordered_list);
1202 	  else if (NILP (tail))
1203 	    Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1204 					    list1i (id));
1205 	  else
1206 	    {
1207 	      val = Fcons (XCAR (tail), XCDR (tail));
1208 	      XSETCDR (tail, val);
1209 	      XSETCAR (tail, make_fixnum (id));
1210 	    }
1211 	}
1212       charset_ordered_list_tick++;
1213     }
1214 
1215   return Qnil;
1216 }
1217 
1218 
1219 /* Same as Fdefine_charset_internal but arguments are more convenient
1220    to call from C (typically in syms_of_charset).  This can define a
1221    charset of `offset' method only.  Return the ID of the new
1222    charset.  */
1223 
1224 static int
define_charset_internal(Lisp_Object name,int dimension,const char * code_space_chars,unsigned min_code,unsigned max_code,int iso_final,int iso_revision,int emacs_mule_id,bool ascii_compatible,bool supplementary,int code_offset)1225 define_charset_internal (Lisp_Object name,
1226 			 int dimension,
1227 			 const char *code_space_chars,
1228 			 unsigned min_code, unsigned max_code,
1229 			 int iso_final, int iso_revision, int emacs_mule_id,
1230 			 bool ascii_compatible, bool supplementary,
1231 			 int code_offset)
1232 {
1233   const unsigned char *code_space = (const unsigned char *) code_space_chars;
1234   Lisp_Object args[charset_arg_max];
1235   Lisp_Object val;
1236   int i;
1237 
1238   args[charset_arg_name] = name;
1239   args[charset_arg_dimension] = make_fixnum (dimension);
1240   val = make_uninit_vector (8);
1241   for (i = 0; i < 8; i++)
1242     ASET (val, i, make_fixnum (code_space[i]));
1243   args[charset_arg_code_space] = val;
1244   args[charset_arg_min_code] = make_fixnum (min_code);
1245   args[charset_arg_max_code] = make_fixnum (max_code);
1246   args[charset_arg_iso_final]
1247     = (iso_final < 0 ? Qnil : make_fixnum (iso_final));
1248   args[charset_arg_iso_revision] = make_fixnum (iso_revision);
1249   args[charset_arg_emacs_mule_id]
1250     = (emacs_mule_id < 0 ? Qnil : make_fixnum (emacs_mule_id));
1251   args[charset_arg_ascii_compatible_p] = ascii_compatible ? Qt : Qnil;
1252   args[charset_arg_supplementary_p] = supplementary ? Qt : Qnil;
1253   args[charset_arg_invalid_code] = Qnil;
1254   args[charset_arg_code_offset] = make_fixnum (code_offset);
1255   args[charset_arg_map] = Qnil;
1256   args[charset_arg_subset] = Qnil;
1257   args[charset_arg_superset] = Qnil;
1258   args[charset_arg_unify_map] = Qnil;
1259 
1260   args[charset_arg_plist] =
1261      list (QCname,
1262 	   args[charset_arg_name],
1263 	   intern_c_string (":dimension"),
1264 	   args[charset_arg_dimension],
1265 	   intern_c_string (":code-space"),
1266 	   args[charset_arg_code_space],
1267 	   intern_c_string (":iso-final-char"),
1268 	   args[charset_arg_iso_final],
1269 	   intern_c_string (":emacs-mule-id"),
1270 	   args[charset_arg_emacs_mule_id],
1271 	   QCascii_compatible_p,
1272 	   args[charset_arg_ascii_compatible_p],
1273 	   intern_c_string (":code-offset"),
1274 	   args[charset_arg_code_offset]);
1275   Fdefine_charset_internal (charset_arg_max, args);
1276 
1277   return XFIXNUM (CHARSET_SYMBOL_ID (name));
1278 }
1279 
1280 
1281 DEFUN ("define-charset-alias", Fdefine_charset_alias,
1282        Sdefine_charset_alias, 2, 2, 0,
1283        doc: /* Define ALIAS as an alias for charset CHARSET.  */)
1284   (Lisp_Object alias, Lisp_Object charset)
1285 {
1286   Lisp_Object attr;
1287 
1288   CHECK_CHARSET_GET_ATTR (charset, attr);
1289   Fputhash (alias, attr, Vcharset_hash_table);
1290   Vcharset_list = Fcons (alias, Vcharset_list);
1291   return Qnil;
1292 }
1293 
1294 
1295 DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
1296        doc: /* Return the property list of CHARSET.  */)
1297   (Lisp_Object charset)
1298 {
1299   Lisp_Object attrs;
1300 
1301   CHECK_CHARSET_GET_ATTR (charset, attrs);
1302   return CHARSET_ATTR_PLIST (attrs);
1303 }
1304 
1305 
1306 DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
1307        doc: /* Set CHARSET's property list to PLIST.  */)
1308   (Lisp_Object charset, Lisp_Object plist)
1309 {
1310   Lisp_Object attrs;
1311 
1312   CHECK_CHARSET_GET_ATTR (charset, attrs);
1313   ASET (attrs, charset_plist, plist);
1314   return plist;
1315 }
1316 
1317 
1318 DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 3, 0,
1319        doc: /* Unify characters of CHARSET with Unicode.
1320 This means reading the relevant file and installing the table defined
1321 by CHARSET's `:unify-map' property.
1322 
1323 Optional second arg UNIFY-MAP is a file name string or a vector.  It has
1324 the same meaning as the `:unify-map' attribute in the function
1325 `define-charset' (which see).
1326 
1327 Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET.  */)
1328   (Lisp_Object charset, Lisp_Object unify_map, Lisp_Object deunify)
1329 {
1330   int id;
1331   struct charset *cs;
1332 
1333   CHECK_CHARSET_GET_ID (charset, id);
1334   cs = CHARSET_FROM_ID (id);
1335   if (NILP (deunify)
1336       ? CHARSET_UNIFIED_P (cs) && ! NILP (CHARSET_DEUNIFIER (cs))
1337       : ! CHARSET_UNIFIED_P (cs))
1338     return Qnil;
1339 
1340   CHARSET_UNIFIED_P (cs) = 0;
1341   if (NILP (deunify))
1342     {
1343       if (CHARSET_METHOD (cs) != CHARSET_METHOD_OFFSET
1344 	  || CHARSET_CODE_OFFSET (cs) < 0x110000)
1345 	error ("Can't unify charset: %s", SDATA (SYMBOL_NAME (charset)));
1346       if (NILP (unify_map))
1347 	unify_map = CHARSET_UNIFY_MAP (cs);
1348       else
1349 	{
1350 	  if (! STRINGP (unify_map) && ! VECTORP (unify_map))
1351 	    signal_error ("Bad unify-map", unify_map);
1352 	  set_charset_attr (cs, charset_unify_map, unify_map);
1353 	}
1354       if (NILP (Vchar_unify_table))
1355 	Vchar_unify_table = Fmake_char_table (Qnil, Qnil);
1356       char_table_set_range (Vchar_unify_table,
1357 			    cs->min_char, cs->max_char, charset);
1358       CHARSET_UNIFIED_P (cs) = 1;
1359     }
1360   else if (CHAR_TABLE_P (Vchar_unify_table))
1361     {
1362       unsigned min_code = CHARSET_MIN_CODE (cs);
1363       unsigned max_code = CHARSET_MAX_CODE (cs);
1364       int min_char = DECODE_CHAR (cs, min_code);
1365       int max_char = DECODE_CHAR (cs, max_code);
1366 
1367       char_table_set_range (Vchar_unify_table, min_char, max_char, Qnil);
1368     }
1369 
1370   return Qnil;
1371 }
1372 
1373 /* Check that DIMENSION, CHARS, and FINAL_CHAR specify a valid ISO charset.
1374    Return true if it's a 96-character set, false if 94.  */
1375 
1376 static bool
check_iso_charset_parameter(Lisp_Object dimension,Lisp_Object chars,Lisp_Object final_char)1377 check_iso_charset_parameter (Lisp_Object dimension, Lisp_Object chars,
1378 			     Lisp_Object final_char)
1379 {
1380   CHECK_FIXNUM (dimension);
1381   CHECK_FIXNUM (chars);
1382   CHECK_CHARACTER (final_char);
1383 
1384   if (! (1 <= XFIXNUM (dimension) && XFIXNUM (dimension) <= 3))
1385     error ("Invalid DIMENSION %"pI"d, it should be 1, 2, or 3",
1386 	   XFIXNUM (dimension));
1387 
1388   bool chars_flag = XFIXNUM (chars) == 96;
1389   if (! (chars_flag || XFIXNUM (chars) == 94))
1390     error ("Invalid CHARS %"pI"d, it should be 94 or 96", XFIXNUM (chars));
1391 
1392   int final_ch = XFIXNAT (final_char);
1393   if (! ('0' <= final_ch && final_ch <= '~'))
1394     error ("Invalid FINAL-CHAR `%c', it should be `0'..`~'", final_ch);
1395 
1396   return chars_flag;
1397 }
1398 
1399 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
1400        Sget_unused_iso_final_char, 2, 2, 0,
1401        doc: /*
1402 Return an unused ISO final char for a charset of DIMENSION and CHARS.
1403 DIMENSION is the number of bytes to represent a character: 1 or 2.
1404 CHARS is the number of characters in a dimension: 94 or 96.
1405 
1406 This final char is for private use, thus the range is `0' (48) .. `?' (63).
1407 If there's no unused final char for the specified kind of charset,
1408 return nil.  */)
1409   (Lisp_Object dimension, Lisp_Object chars)
1410 {
1411   bool chars_flag = check_iso_charset_parameter (dimension, chars,
1412 						 make_fixnum ('0'));
1413   for (int final_char = '0'; final_char <= '?'; final_char++)
1414     if (ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag, final_char) < 0)
1415       return make_fixnum (final_char);
1416   return Qnil;
1417 }
1418 
1419 
1420 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
1421        4, 4, 0,
1422        doc: /* Declare an equivalent charset for ISO-2022 decoding.
1423 
1424 On decoding by an ISO-2022 base coding system, when a charset
1425 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
1426 if CHARSET is designated instead.  */)
1427   (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char, Lisp_Object charset)
1428 {
1429   int id;
1430 
1431   CHECK_CHARSET_GET_ID (charset, id);
1432   bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
1433   ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag, XFIXNAT (final_char)) = id;
1434   return Qnil;
1435 }
1436 
1437 
1438 /* Return information about charsets in the text at PTR of NBYTES
1439    bytes, which are NCHARS characters.  The value is:
1440 
1441 	0: Each character is represented by one byte.  This is always
1442 	   true for a unibyte string.  For a multibyte string, true if
1443 	   it contains only ASCII characters.
1444 
1445 	1: No charsets other than ascii, control-1, and latin-1 are
1446 	   found.
1447 
1448 	2: Otherwise.
1449 */
1450 
1451 int
string_xstring_p(Lisp_Object string)1452 string_xstring_p (Lisp_Object string)
1453 {
1454   const unsigned char *p = SDATA (string);
1455   const unsigned char *endp = p + SBYTES (string);
1456 
1457   if (SCHARS (string) == SBYTES (string))
1458     return 0;
1459 
1460   while (p < endp)
1461     {
1462       int c = string_char_advance (&p);
1463 
1464       if (c >= 0x100)
1465 	return 2;
1466     }
1467   return 1;
1468 }
1469 
1470 
1471 /* Find charsets in the string at PTR of NCHARS and NBYTES.
1472 
1473    CHARSETS is a vector.  If Nth element is non-nil, it means the
1474    charset whose id is N is already found.
1475 
1476    It may lookup a translation table TABLE if supplied.  */
1477 
1478 static void
find_charsets_in_text(const unsigned char * ptr,ptrdiff_t nchars,ptrdiff_t nbytes,Lisp_Object charsets,Lisp_Object table,bool multibyte)1479 find_charsets_in_text (const unsigned char *ptr, ptrdiff_t nchars,
1480 		       ptrdiff_t nbytes, Lisp_Object charsets,
1481 		       Lisp_Object table, bool multibyte)
1482 {
1483   const unsigned char *pend = ptr + nbytes;
1484 
1485   if (nchars == nbytes)
1486     {
1487       if (multibyte)
1488 	ASET (charsets, charset_ascii, Qt);
1489       else
1490 	while (ptr < pend)
1491 	  {
1492 	    int c = *ptr++;
1493 
1494 	    if (!NILP (table))
1495 	      c = translate_char (table, c);
1496 	    if (ASCII_CHAR_P (c))
1497 	      ASET (charsets, charset_ascii, Qt);
1498 	    else
1499 	      ASET (charsets, charset_eight_bit, Qt);
1500 	  }
1501     }
1502   else
1503     {
1504       while (ptr < pend)
1505 	{
1506 	  int c = string_char_advance (&ptr);
1507 	  struct charset *charset;
1508 
1509 	  if (!NILP (table))
1510 	    c = translate_char (table, c);
1511 	  charset = CHAR_CHARSET (c);
1512 	  ASET (charsets, CHARSET_ID (charset), Qt);
1513 	}
1514     }
1515 }
1516 
1517 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
1518        2, 3, 0,
1519        doc: /* Return a list of charsets in the region between BEG and END.
1520 BEG and END are buffer positions.
1521 Optional arg TABLE if non-nil is a translation table to look up.
1522 
1523 If the current buffer is unibyte, the returned list may contain
1524 only `ascii', `eight-bit-control', and `eight-bit-graphic'.  */)
1525   (Lisp_Object beg, Lisp_Object end, Lisp_Object table)
1526 {
1527   Lisp_Object charsets;
1528   ptrdiff_t from, from_byte, to, stop, stop_byte;
1529   int i;
1530   Lisp_Object val;
1531   bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1532 
1533   validate_region (&beg, &end);
1534   from = XFIXNAT (beg);
1535   stop = to = XFIXNAT (end);
1536 
1537   if (from < GPT && GPT < to)
1538     {
1539       stop = GPT;
1540       stop_byte = GPT_BYTE;
1541     }
1542   else
1543     stop_byte = CHAR_TO_BYTE (stop);
1544 
1545   from_byte = CHAR_TO_BYTE (from);
1546 
1547   charsets = make_nil_vector (charset_table_used);
1548   while (1)
1549     {
1550       find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
1551 			     stop_byte - from_byte, charsets, table,
1552 			     multibyte);
1553       if (stop < to)
1554 	{
1555 	  from = stop, from_byte = stop_byte;
1556 	  stop = to, stop_byte = CHAR_TO_BYTE (stop);
1557 	}
1558       else
1559 	break;
1560     }
1561 
1562   val = Qnil;
1563   for (i = charset_table_used - 1; i >= 0; i--)
1564     if (!NILP (AREF (charsets, i)))
1565       val = Fcons (CHARSET_NAME (charset_table + i), val);
1566   return val;
1567 }
1568 
1569 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1570        1, 2, 0,
1571        doc: /* Return a list of charsets in STR.
1572 Optional arg TABLE if non-nil is a translation table to look up.
1573 
1574 If STR is unibyte, the returned list may contain
1575 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1576   (Lisp_Object str, Lisp_Object table)
1577 {
1578   CHECK_STRING (str);
1579 
1580   Lisp_Object charsets = make_nil_vector (charset_table_used);
1581   find_charsets_in_text (SDATA (str), SCHARS (str), SBYTES (str),
1582 			 charsets, table,
1583 			 STRING_MULTIBYTE (str));
1584   Lisp_Object val = Qnil;
1585   for (int i = charset_table_used - 1; i >= 0; i--)
1586     if (!NILP (AREF (charsets, i)))
1587       val = Fcons (CHARSET_NAME (charset_table + i), val);
1588   return val;
1589 }
1590 
1591 
1592 
1593 /* Return a unified character code for C (>= 0x110000).  VAL is a
1594    value of Vchar_unify_table for C; i.e. it is nil, an integer, or a
1595    charset symbol.  */
1596 static int
maybe_unify_char(int c,Lisp_Object val)1597 maybe_unify_char (int c, Lisp_Object val)
1598 {
1599   struct charset *charset;
1600 
1601   if (FIXNUMP (val))
1602     return XFIXNAT (val);
1603   if (NILP (val))
1604     return c;
1605 
1606   CHECK_CHARSET_GET_CHARSET (val, charset);
1607 #ifdef REL_ALLOC
1608   /* The call to load_charset below can allocate memory, which screws
1609      callers of this function through STRING_CHAR_* macros that hold C
1610      pointers to buffer text, if REL_ALLOC is used.  */
1611   r_alloc_inhibit_buffer_relocation (1);
1612 #endif
1613   load_charset (charset, 1);
1614   if (! inhibit_load_charset_map)
1615     {
1616       val = CHAR_TABLE_REF (Vchar_unify_table, c);
1617       if (! NILP (val))
1618 	c = XFIXNAT (val);
1619     }
1620   else
1621     {
1622       int code_index = c - CHARSET_CODE_OFFSET (charset);
1623       int unified = GET_TEMP_CHARSET_WORK_DECODER (code_index);
1624 
1625       if (unified > 0)
1626 	c = unified;
1627     }
1628 #ifdef REL_ALLOC
1629   r_alloc_inhibit_buffer_relocation (0);
1630 #endif
1631   return c;
1632 }
1633 
1634 
1635 /* Return a character corresponding to the code-point CODE of
1636    CHARSET.  */
1637 
1638 int
decode_char(struct charset * charset,unsigned int code)1639 decode_char (struct charset *charset, unsigned int code)
1640 {
1641   int c, char_index;
1642   enum charset_method method = CHARSET_METHOD (charset);
1643 
1644   if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
1645     return -1;
1646 
1647   if (method == CHARSET_METHOD_SUBSET)
1648     {
1649       Lisp_Object subset_info;
1650 
1651       subset_info = CHARSET_SUBSET (charset);
1652       charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
1653       code -= XFIXNUM (AREF (subset_info, 3));
1654       if (code < XFIXNAT (AREF (subset_info, 1))
1655 	  || code > XFIXNAT (AREF (subset_info, 2)))
1656 	c = -1;
1657       else
1658 	c = DECODE_CHAR (charset, code);
1659     }
1660   else if (method == CHARSET_METHOD_SUPERSET)
1661     {
1662       Lisp_Object parents;
1663 
1664       parents = CHARSET_SUPERSET (charset);
1665       c = -1;
1666       for (; CONSP (parents); parents = XCDR (parents))
1667 	{
1668 	  int id = XFIXNUM (XCAR (XCAR (parents)));
1669 	  int code_offset = XFIXNUM (XCDR (XCAR (parents)));
1670 	  unsigned this_code = code - code_offset;
1671 
1672 	  charset = CHARSET_FROM_ID (id);
1673 	  if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1674 	    break;
1675 	}
1676     }
1677   else
1678     {
1679       char_index = CODE_POINT_TO_INDEX (charset, code);
1680       if (char_index < 0)
1681 	return -1;
1682 
1683       if (method == CHARSET_METHOD_MAP)
1684 	{
1685 	  Lisp_Object decoder;
1686 
1687 	  decoder = CHARSET_DECODER (charset);
1688 	  if (! VECTORP (decoder))
1689 	    {
1690 	      load_charset (charset, 1);
1691 	      decoder = CHARSET_DECODER (charset);
1692 	    }
1693 	  if (VECTORP (decoder))
1694 	    c = XFIXNUM (AREF (decoder, char_index));
1695 	  else
1696 	    c = GET_TEMP_CHARSET_WORK_DECODER (char_index);
1697 	}
1698       else			/* method == CHARSET_METHOD_OFFSET */
1699 	{
1700 	  c = char_index + CHARSET_CODE_OFFSET (charset);
1701 	  if (CHARSET_UNIFIED_P (charset)
1702 	      && MAX_UNICODE_CHAR < c && c <= MAX_5_BYTE_CHAR)
1703 	    {
1704 	      /* Unify C with a Unicode character if possible.  */
1705 	      Lisp_Object val = CHAR_TABLE_REF (Vchar_unify_table, c);
1706 	      c = maybe_unify_char (c, val);
1707 	    }
1708 	}
1709     }
1710 
1711   return c;
1712 }
1713 
1714 /* Variable used temporarily by the macro ENCODE_CHAR.  */
1715 Lisp_Object charset_work;
1716 
1717 /* Return a code-point of C in CHARSET.  If C doesn't belong to
1718    CHARSET, return CHARSET_INVALID_CODE (CHARSET).  If STRICT is true,
1719    use CHARSET's strict_max_char instead of max_char.  */
1720 
1721 unsigned
encode_char(struct charset * charset,int c)1722 encode_char (struct charset *charset, int c)
1723 {
1724   unsigned code;
1725   enum charset_method method = CHARSET_METHOD (charset);
1726 
1727   if (CHARSET_UNIFIED_P (charset))
1728     {
1729       Lisp_Object deunifier;
1730       int code_index = -1;
1731 
1732       deunifier = CHARSET_DEUNIFIER (charset);
1733       if (! CHAR_TABLE_P (deunifier))
1734 	{
1735 	  load_charset (charset, 2);
1736 	  deunifier = CHARSET_DEUNIFIER (charset);
1737 	}
1738       if (CHAR_TABLE_P (deunifier))
1739 	{
1740 	  Lisp_Object deunified = CHAR_TABLE_REF (deunifier, c);
1741 
1742 	  if (FIXNUMP (deunified))
1743 	    code_index = XFIXNUM (deunified);
1744 	}
1745       else
1746 	{
1747 	  code_index = GET_TEMP_CHARSET_WORK_ENCODER (c);
1748 	}
1749       if (code_index >= 0)
1750 	c = CHARSET_CODE_OFFSET (charset) + code_index;
1751     }
1752 
1753   if (method == CHARSET_METHOD_SUBSET)
1754     {
1755       Lisp_Object subset_info;
1756       struct charset *this_charset;
1757 
1758       subset_info = CHARSET_SUBSET (charset);
1759       this_charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
1760       code = ENCODE_CHAR (this_charset, c);
1761       if (code == CHARSET_INVALID_CODE (this_charset)
1762 	  || code < XFIXNAT (AREF (subset_info, 1))
1763 	  || code > XFIXNAT (AREF (subset_info, 2)))
1764 	return CHARSET_INVALID_CODE (charset);
1765       code += XFIXNUM (AREF (subset_info, 3));
1766       return code;
1767     }
1768 
1769   if (method == CHARSET_METHOD_SUPERSET)
1770     {
1771       Lisp_Object parents;
1772 
1773       parents = CHARSET_SUPERSET (charset);
1774       for (; CONSP (parents); parents = XCDR (parents))
1775 	{
1776 	  int id = XFIXNUM (XCAR (XCAR (parents)));
1777 	  int code_offset = XFIXNUM (XCDR (XCAR (parents)));
1778 	  struct charset *this_charset = CHARSET_FROM_ID (id);
1779 
1780 	  code = ENCODE_CHAR (this_charset, c);
1781 	  if (code != CHARSET_INVALID_CODE (this_charset))
1782 	    return code + code_offset;
1783 	}
1784       return CHARSET_INVALID_CODE (charset);
1785     }
1786 
1787   if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1788       || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1789     return CHARSET_INVALID_CODE (charset);
1790 
1791   if (method == CHARSET_METHOD_MAP)
1792     {
1793       Lisp_Object encoder;
1794       Lisp_Object val;
1795 
1796       encoder = CHARSET_ENCODER (charset);
1797       if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1798 	{
1799 	  load_charset (charset, 2);
1800 	  encoder = CHARSET_ENCODER (charset);
1801 	}
1802       if (CHAR_TABLE_P (encoder))
1803 	{
1804 	  val = CHAR_TABLE_REF (encoder, c);
1805 	  if (NILP (val))
1806 	    return CHARSET_INVALID_CODE (charset);
1807 	  code = XFIXNUM (val);
1808 	  if (! CHARSET_COMPACT_CODES_P (charset))
1809 	    code = INDEX_TO_CODE_POINT (charset, code);
1810 	}
1811       else
1812 	{
1813 	  code = GET_TEMP_CHARSET_WORK_ENCODER (c);
1814 	  code = INDEX_TO_CODE_POINT (charset, code);
1815 	}
1816     }
1817   else				/* method == CHARSET_METHOD_OFFSET */
1818     {
1819       unsigned code_index = c - CHARSET_CODE_OFFSET (charset);
1820 
1821       code = INDEX_TO_CODE_POINT (charset, code_index);
1822     }
1823 
1824   return code;
1825 }
1826 
1827 
1828 DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 2, 0,
1829        doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1830 Return nil if CODE-POINT is not valid in CHARSET.
1831 
1832 CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE),
1833 although this usage is obsolescent.  */)
1834   (Lisp_Object charset, Lisp_Object code_point)
1835 {
1836   int c, id;
1837   unsigned code;
1838   struct charset *charsetp;
1839 
1840   CHECK_CHARSET_GET_ID (charset, id);
1841   code = cons_to_unsigned (code_point, UINT_MAX);
1842   charsetp = CHARSET_FROM_ID (id);
1843   c = DECODE_CHAR (charsetp, code);
1844   return (c >= 0 ? make_fixnum (c) : Qnil);
1845 }
1846 
1847 
1848 DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 2, 0,
1849        doc: /* Encode the character CH into a code-point of CHARSET.
1850 Return the encoded code-point, a fixnum if its value is small enough,
1851 otherwise a bignum.
1852 Return nil if CHARSET doesn't support CH.  */)
1853   (Lisp_Object ch, Lisp_Object charset)
1854 {
1855   int c, id;
1856   unsigned code;
1857   struct charset *charsetp;
1858 
1859   CHECK_CHARSET_GET_ID (charset, id);
1860   CHECK_CHARACTER (ch);
1861   c = XFIXNAT (ch);
1862   charsetp = CHARSET_FROM_ID (id);
1863   code = ENCODE_CHAR (charsetp, c);
1864   if (code == CHARSET_INVALID_CODE (charsetp))
1865     return Qnil;
1866   /* There are much fewer codepoints in the world than we have positive
1867      fixnums, so it could be argued that we never really need a bignum,
1868      e.g. Unicode codepoints only need 21bit, and China's GB-10830
1869      can fit in 22bit.  Yet we encode GB-10830's chars in a sparse way
1870      (we just take the 4byte sequences as a 32bit int), so some
1871      GB-10830 chars (such as 0x81308130 in etc/charsets/gb108304.map) end
1872      up represented as bignums if EMACS_INT is 32 bits.  */
1873   return INT_TO_INTEGER (code);
1874 }
1875 
1876 
1877 DEFUN ("make-char", Fmake_char, Smake_char, 1, 5, 0,
1878        doc:
1879        /* Return a character of CHARSET whose position codes are CODEn.
1880 
1881 CODE1 through CODE4 are optional, but if you don't supply sufficient
1882 position codes, it is assumed that the minimum code in each dimension
1883 is specified.  */)
1884   (Lisp_Object charset, Lisp_Object code1, Lisp_Object code2, Lisp_Object code3, Lisp_Object code4)
1885 {
1886   int id, dimension;
1887   struct charset *charsetp;
1888   unsigned code;
1889   int c;
1890 
1891   CHECK_CHARSET_GET_ID (charset, id);
1892   charsetp = CHARSET_FROM_ID (id);
1893 
1894   dimension = CHARSET_DIMENSION (charsetp);
1895   if (NILP (code1))
1896     code = (CHARSET_ASCII_COMPATIBLE_P (charsetp)
1897 	    ? 0 : CHARSET_MIN_CODE (charsetp));
1898   else
1899     {
1900       CHECK_FIXNAT (code1);
1901       if (XFIXNAT (code1) >= 0x100)
1902 	args_out_of_range (make_fixnum (0xFF), code1);
1903       code = XFIXNAT (code1);
1904 
1905       if (dimension > 1)
1906 	{
1907 	  code <<= 8;
1908 	  if (NILP (code2))
1909 	    code |= charsetp->code_space[(dimension - 2) * 4];
1910 	  else
1911 	    {
1912 	      CHECK_FIXNAT (code2);
1913 	      if (XFIXNAT (code2) >= 0x100)
1914 		args_out_of_range (make_fixnum (0xFF), code2);
1915 	      code |= XFIXNAT (code2);
1916 	    }
1917 
1918 	  if (dimension > 2)
1919 	    {
1920 	      code <<= 8;
1921 	      if (NILP (code3))
1922 		code |= charsetp->code_space[(dimension - 3) * 4];
1923 	      else
1924 		{
1925 		  CHECK_FIXNAT (code3);
1926 		  if (XFIXNAT (code3) >= 0x100)
1927 		    args_out_of_range (make_fixnum (0xFF), code3);
1928 		  code |= XFIXNAT (code3);
1929 		}
1930 
1931 	      if (dimension > 3)
1932 		{
1933 		  code <<= 8;
1934 		  if (NILP (code4))
1935 		    code |= charsetp->code_space[0];
1936 		  else
1937 		    {
1938 		      CHECK_FIXNAT (code4);
1939 		      if (XFIXNAT (code4) >= 0x100)
1940 			args_out_of_range (make_fixnum (0xFF), code4);
1941 		      code |= XFIXNAT (code4);
1942 		    }
1943 		}
1944 	    }
1945 	}
1946     }
1947 
1948   if (CHARSET_ISO_FINAL (charsetp) >= 0)
1949     code &= 0x7F7F7F7F;
1950   c = DECODE_CHAR (charsetp, code);
1951   if (c < 0)
1952     error ("Invalid code(s)");
1953   return make_fixnum (c);
1954 }
1955 
1956 
1957 /* Return the first charset in CHARSET_LIST that contains C.
1958    CHARSET_LIST is a list of charset IDs.  If it is nil, use
1959    Vcharset_ordered_list.  */
1960 
1961 struct charset *
char_charset(int c,Lisp_Object charset_list,unsigned int * code_return)1962 char_charset (int c, Lisp_Object charset_list, unsigned int *code_return)
1963 {
1964   bool maybe_null = 0;
1965 
1966   if (NILP (charset_list))
1967     charset_list = Vcharset_ordered_list;
1968   else
1969     maybe_null = 1;
1970 
1971   while (CONSP (charset_list))
1972     {
1973       struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
1974       unsigned code = ENCODE_CHAR (charset, c);
1975 
1976       if (code != CHARSET_INVALID_CODE (charset))
1977 	{
1978 	  if (code_return)
1979 	    *code_return = code;
1980 	  return charset;
1981 	}
1982       charset_list = XCDR (charset_list);
1983       if (! maybe_null
1984 	  && c <= MAX_UNICODE_CHAR
1985 	  && EQ (charset_list, Vcharset_non_preferred_head))
1986 	return CHARSET_FROM_ID (charset_unicode);
1987     }
1988   return (maybe_null ? NULL
1989 	  : c <= MAX_5_BYTE_CHAR ? CHARSET_FROM_ID (charset_emacs)
1990 	  : CHARSET_FROM_ID (charset_eight_bit));
1991 }
1992 
1993 
1994 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1995        doc:
1996        /*Return list of charset and one to four position-codes of CH.
1997 The charset is decided by the current priority order of charsets.
1998 A position-code is a byte value of each dimension of the code-point of
1999 CH in the charset.  */)
2000   (Lisp_Object ch)
2001 {
2002   struct charset *charset;
2003   int c, dimension;
2004   unsigned code;
2005   Lisp_Object val;
2006 
2007   CHECK_CHARACTER (ch);
2008   c = XFIXNAT (ch);
2009   charset = CHAR_CHARSET (c);
2010   if (! charset)
2011     emacs_abort ();
2012   code = ENCODE_CHAR (charset, c);
2013   if (code == CHARSET_INVALID_CODE (charset))
2014     emacs_abort ();
2015   dimension = CHARSET_DIMENSION (charset);
2016   for (val = Qnil; dimension > 0; dimension--)
2017     {
2018       val = Fcons (make_fixnum (code & 0xFF), val);
2019       code >>= 8;
2020     }
2021   return Fcons (CHARSET_NAME (charset), val);
2022 }
2023 
2024 
2025 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 2, 0,
2026        doc: /* Return the charset of highest priority that contains CH.
2027 ASCII characters are an exception: for them, this function always
2028 returns `ascii'.
2029 If optional 2nd arg RESTRICTION is non-nil, it is a list of charsets
2030 from which to find the charset.  It may also be a coding system.  In
2031 that case, find the charset from what supported by that coding system.  */)
2032   (Lisp_Object ch, Lisp_Object restriction)
2033 {
2034   struct charset *charset;
2035 
2036   CHECK_CHARACTER (ch);
2037   if (NILP (restriction))
2038     charset = CHAR_CHARSET (XFIXNUM (ch));
2039   else
2040     {
2041       if (CONSP (restriction))
2042 	{
2043 	  int c = XFIXNAT (ch);
2044 
2045 	  for (; CONSP (restriction); restriction = XCDR (restriction))
2046 	    {
2047 	      struct charset *rcharset;
2048 
2049 	      CHECK_CHARSET_GET_CHARSET (XCAR (restriction), rcharset);
2050 	      if (ENCODE_CHAR (rcharset, c) != CHARSET_INVALID_CODE (rcharset))
2051 		return XCAR (restriction);
2052 	    }
2053 	  return Qnil;
2054 	}
2055       restriction = coding_system_charset_list (restriction);
2056       charset = char_charset (XFIXNUM (ch), restriction, NULL);
2057       if (! charset)
2058 	return Qnil;
2059     }
2060   return (CHARSET_NAME (charset));
2061 }
2062 
2063 
2064 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
2065        doc: /*
2066 Return charset of a character in the current buffer at position POS.
2067 If POS is nil, it defaults to the current point.
2068 If POS is out of range, the value is nil.  */)
2069   (Lisp_Object pos)
2070 {
2071   Lisp_Object ch;
2072   struct charset *charset;
2073 
2074   ch = Fchar_after (pos);
2075   if (! FIXNUMP (ch))
2076     return ch;
2077   charset = CHAR_CHARSET (XFIXNUM (ch));
2078   return (CHARSET_NAME (charset));
2079 }
2080 
2081 
2082 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2083        doc: /*
2084 Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
2085 
2086 ISO 2022's designation sequence (escape sequence) distinguishes charsets
2087 by their DIMENSION, CHARS, and FINAL-CHAR,
2088 whereas Emacs distinguishes them by charset symbol.
2089 See the documentation of the function `charset-info' for the meanings of
2090 DIMENSION, CHARS, and FINAL-CHAR.  */)
2091   (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
2092 {
2093   bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
2094   int id = ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag,
2095 			      XFIXNAT (final_char));
2096   return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
2097 }
2098 
2099 
2100 DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
2101        0, 0, 0,
2102        doc: /*
2103 Internal use only.
2104 Clear temporary charset mapping tables.
2105 It should be called only from temacs invoked for dumping.  */)
2106   (void)
2107 {
2108   if (temp_charset_work)
2109     {
2110       xfree (temp_charset_work);
2111       temp_charset_work = NULL;
2112     }
2113 
2114   if (CHAR_TABLE_P (Vchar_unify_table))
2115     Foptimize_char_table (Vchar_unify_table, Qnil);
2116 
2117   return Qnil;
2118 }
2119 
2120 DEFUN ("charset-priority-list", Fcharset_priority_list,
2121        Scharset_priority_list, 0, 1, 0,
2122        doc: /* Return the list of charsets ordered by priority.
2123 HIGHESTP non-nil means just return the highest priority one.  */)
2124   (Lisp_Object highestp)
2125 {
2126   Lisp_Object val = Qnil, list = Vcharset_ordered_list;
2127 
2128   if (!NILP (highestp))
2129     return CHARSET_NAME (CHARSET_FROM_ID (XFIXNUM (Fcar (list))));
2130 
2131   while (!NILP (list))
2132     {
2133       val = Fcons (CHARSET_NAME (CHARSET_FROM_ID (XFIXNUM (XCAR (list)))), val);
2134       list = XCDR (list);
2135     }
2136   return Fnreverse (val);
2137 }
2138 
2139 DEFUN ("set-charset-priority", Fset_charset_priority, Sset_charset_priority,
2140        1, MANY, 0,
2141        doc: /* Assign higher priority to the charsets given as arguments.
2142 usage: (set-charset-priority &rest charsets)  */)
2143   (ptrdiff_t nargs, Lisp_Object *args)
2144 {
2145   Lisp_Object new_head, old_list;
2146   Lisp_Object list_2022, list_emacs_mule;
2147   ptrdiff_t i;
2148   int id;
2149 
2150   old_list = Fcopy_sequence (Vcharset_ordered_list);
2151   new_head = Qnil;
2152   for (i = 0; i < nargs; i++)
2153     {
2154       CHECK_CHARSET_GET_ID (args[i], id);
2155       if (! NILP (Fmemq (make_fixnum (id), old_list)))
2156 	{
2157 	  old_list = Fdelq (make_fixnum (id), old_list);
2158 	  new_head = Fcons (make_fixnum (id), new_head);
2159 	}
2160     }
2161   Vcharset_non_preferred_head = old_list;
2162   Vcharset_ordered_list = nconc2 (Fnreverse (new_head), old_list);
2163 
2164   charset_ordered_list_tick++;
2165 
2166   charset_unibyte = -1;
2167   for (old_list = Vcharset_ordered_list, list_2022 = list_emacs_mule = Qnil;
2168        CONSP (old_list); old_list = XCDR (old_list))
2169     {
2170       if (! NILP (Fmemq (XCAR (old_list), Viso_2022_charset_list)))
2171 	list_2022 = Fcons (XCAR (old_list), list_2022);
2172       if (! NILP (Fmemq (XCAR (old_list), Vemacs_mule_charset_list)))
2173 	list_emacs_mule = Fcons (XCAR (old_list), list_emacs_mule);
2174       if (charset_unibyte < 0)
2175 	{
2176 	  struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (old_list)));
2177 
2178 	  if (CHARSET_DIMENSION (charset) == 1
2179 	      && CHARSET_ASCII_COMPATIBLE_P (charset)
2180 	      && CHARSET_MAX_CHAR (charset) >= 0x80)
2181 	    charset_unibyte = CHARSET_ID (charset);
2182 	}
2183     }
2184   Viso_2022_charset_list = Fnreverse (list_2022);
2185   Vemacs_mule_charset_list = Fnreverse (list_emacs_mule);
2186   if (charset_unibyte < 0)
2187     charset_unibyte = charset_iso_8859_1;
2188 
2189   return Qnil;
2190 }
2191 
2192 DEFUN ("charset-id-internal", Fcharset_id_internal, Scharset_id_internal,
2193        0, 1, 0,
2194        doc: /* Internal use only.
2195 Return charset identification number of CHARSET.  */)
2196   (Lisp_Object charset)
2197 {
2198   int id;
2199 
2200   CHECK_CHARSET_GET_ID (charset, id);
2201   return make_fixnum (id);
2202 }
2203 
2204 struct charset_sort_data
2205 {
2206   Lisp_Object charset;
2207   int id;
2208   ptrdiff_t priority;
2209 };
2210 
2211 static int
charset_compare(const void * d1,const void * d2)2212 charset_compare (const void *d1, const void *d2)
2213 {
2214   const struct charset_sort_data *data1 = d1, *data2 = d2;
2215   if (data1->priority != data2->priority)
2216     return data1->priority < data2->priority ? -1 : 1;
2217   return 0;
2218 }
2219 
2220 DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
2221        doc: /* Sort charset list CHARSETS by a priority of each charset.
2222 Return the sorted list.  CHARSETS is modified by side effects.
2223 See also `charset-priority-list' and `set-charset-priority'.  */)
2224      (Lisp_Object charsets)
2225 {
2226   ptrdiff_t n = list_length (charsets), i, j;
2227   int done;
2228   Lisp_Object tail, elt, attrs;
2229   struct charset_sort_data *sort_data;
2230   int id, min_id = INT_MAX, max_id = INT_MIN;
2231   USE_SAFE_ALLOCA;
2232 
2233   if (n == 0)
2234     return Qnil;
2235   SAFE_NALLOCA (sort_data, 1, n);
2236   for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
2237     {
2238       elt = XCAR (tail);
2239       CHECK_CHARSET_GET_ATTR (elt, attrs);
2240       sort_data[i].charset = elt;
2241       sort_data[i].id = id = XFIXNUM (CHARSET_ATTR_ID (attrs));
2242       if (id < min_id)
2243 	min_id = id;
2244       if (id > max_id)
2245 	max_id = id;
2246     }
2247   for (done = 0, tail = Vcharset_ordered_list, i = 0;
2248        done < n && CONSP (tail); tail = XCDR (tail), i++)
2249     {
2250       elt = XCAR (tail);
2251       id = XFIXNAT (elt);
2252       if (id >= min_id && id <= max_id)
2253 	for (j = 0; j < n; j++)
2254 	  if (sort_data[j].id == id)
2255 	    {
2256 	      sort_data[j].priority = i;
2257 	      done++;
2258 	    }
2259     }
2260   qsort (sort_data, n, sizeof *sort_data, charset_compare);
2261   for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
2262     XSETCAR (tail, sort_data[i].charset);
2263   SAFE_FREE ();
2264   return charsets;
2265 }
2266 
2267 
2268 void
init_charset(void)2269 init_charset (void)
2270 {
2271   Lisp_Object tempdir;
2272   tempdir = Fexpand_file_name (build_string ("charsets"), Vdata_directory);
2273   if (! file_accessible_directory_p (tempdir))
2274     {
2275       /* This used to be non-fatal (dir_warning), but it should not
2276          happen, and if it does sooner or later it will cause some
2277          obscure problem (eg bug#6401), so better exit.  */
2278       fprintf (stderr,
2279 	       ("Error: %s: %s\n"
2280 		"Emacs will not function correctly "
2281 		"without the character map files.\n"
2282 		"%s"
2283 		"Please check your installation!\n"),
2284 	       SDATA (tempdir), strerror (errno),
2285 	       (egetenv ("EMACSDATA")
2286 		? ("The EMACSDATA environment variable is set.  "
2287 		   "Maybe it has the wrong value?\n")
2288 		: ""));
2289       exit (1);
2290     }
2291 
2292   Vcharset_map_path = list1 (tempdir);
2293 }
2294 
2295 
2296 void
init_charset_once(void)2297 init_charset_once (void)
2298 {
2299   int i, j, k;
2300 
2301   for (i = 0; i < ISO_MAX_DIMENSION; i++)
2302     for (j = 0; j < ISO_MAX_CHARS; j++)
2303       for (k = 0; k < ISO_MAX_FINAL; k++)
2304         iso_charset_table[i][j][k] = -1;
2305 
2306   PDUMPER_REMEMBER_SCALAR (iso_charset_table);
2307 
2308   for (i = 0; i < 256; i++)
2309     emacs_mule_charset[i] = -1;
2310 
2311   PDUMPER_REMEMBER_SCALAR (emacs_mule_charset);
2312 
2313   charset_jisx0201_roman = -1;
2314   PDUMPER_REMEMBER_SCALAR (charset_jisx0201_roman);
2315 
2316   charset_jisx0208_1978 = -1;
2317   PDUMPER_REMEMBER_SCALAR (charset_jisx0208_1978);
2318 
2319   charset_jisx0208 = -1;
2320   PDUMPER_REMEMBER_SCALAR (charset_jisx0208);
2321 
2322   charset_ksc5601 = -1;
2323   PDUMPER_REMEMBER_SCALAR (charset_ksc5601);
2324 }
2325 
2326 /* Allocate an initial charset table that is large enough to handle
2327    Emacs while it is bootstrapping.  As of September 2011, the size
2328    needs to be at least 166; make it a bit bigger to allow for future
2329    expansion.
2330 
2331    Don't make the value so small that the table is reallocated during
2332    bootstrapping, as glibc malloc calls larger than just under 64 KiB
2333    during an initial bootstrap wreak havoc after dumping; see the
2334    M_MMAP_THRESHOLD value in alloc.c, plus there is an extra overhead
2335    internal to glibc malloc and perhaps to Emacs malloc debugging.  */
2336 static struct charset charset_table_init[180];
2337 
2338 void
syms_of_charset(void)2339 syms_of_charset (void)
2340 {
2341   DEFSYM (Qcharsetp, "charsetp");
2342 
2343   /* Special charset symbols.  */
2344   DEFSYM (Qascii, "ascii");
2345   DEFSYM (Qunicode, "unicode");
2346   DEFSYM (Qemacs, "emacs");
2347   DEFSYM (Qeight_bit, "eight-bit");
2348   DEFSYM (Qiso_8859_1, "iso-8859-1");
2349 
2350   staticpro (&Vcharset_ordered_list);
2351   Vcharset_ordered_list = Qnil;
2352 
2353   staticpro (&Viso_2022_charset_list);
2354   Viso_2022_charset_list = Qnil;
2355 
2356   staticpro (&Vemacs_mule_charset_list);
2357   Vemacs_mule_charset_list = Qnil;
2358 
2359   staticpro (&Vcharset_hash_table);
2360   Vcharset_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
2361 
2362   charset_table = charset_table_init;
2363   charset_table_size = ARRAYELTS (charset_table_init);
2364   PDUMPER_REMEMBER_SCALAR (charset_table_size);
2365   charset_table_used = 0;
2366   PDUMPER_REMEMBER_SCALAR (charset_table_used);
2367 
2368   defsubr (&Scharsetp);
2369   defsubr (&Smap_charset_chars);
2370   defsubr (&Sdefine_charset_internal);
2371   defsubr (&Sdefine_charset_alias);
2372   defsubr (&Scharset_plist);
2373   defsubr (&Sset_charset_plist);
2374   defsubr (&Sunify_charset);
2375   defsubr (&Sget_unused_iso_final_char);
2376   defsubr (&Sdeclare_equiv_charset);
2377   defsubr (&Sfind_charset_region);
2378   defsubr (&Sfind_charset_string);
2379   defsubr (&Sdecode_char);
2380   defsubr (&Sencode_char);
2381   defsubr (&Ssplit_char);
2382   defsubr (&Smake_char);
2383   defsubr (&Schar_charset);
2384   defsubr (&Scharset_after);
2385   defsubr (&Siso_charset);
2386   defsubr (&Sclear_charset_maps);
2387   defsubr (&Scharset_priority_list);
2388   defsubr (&Sset_charset_priority);
2389   defsubr (&Scharset_id_internal);
2390   defsubr (&Ssort_charsets);
2391 
2392   DEFVAR_LISP ("charset-map-path", Vcharset_map_path,
2393 	       doc: /* List of directories to search for charset map files.  */);
2394   Vcharset_map_path = Qnil;
2395 
2396   DEFVAR_BOOL ("inhibit-load-charset-map", inhibit_load_charset_map,
2397 	       doc: /* Inhibit loading of charset maps.  Used when dumping Emacs.  */);
2398   inhibit_load_charset_map = 0;
2399 
2400   DEFVAR_LISP ("charset-list", Vcharset_list,
2401 	       doc: /* List of all charsets ever defined.  */);
2402   Vcharset_list = Qnil;
2403 
2404   DEFVAR_LISP ("current-iso639-language", Vcurrent_iso639_language,
2405 	       doc: /* ISO639 language mnemonic symbol for the current language environment.
2406 If the current language environment is for multiple languages (e.g. "Latin-1"),
2407 the value may be a list of mnemonics.  */);
2408   Vcurrent_iso639_language = Qnil;
2409 
2410   charset_ascii
2411     = define_charset_internal (Qascii, 1, "\x00\x7F\0\0\0\0\0",
2412                                0, 127, 'B', -1, 0, 1, 0, 0);
2413   PDUMPER_REMEMBER_SCALAR (charset_ascii);
2414 
2415   charset_iso_8859_1
2416     = define_charset_internal (Qiso_8859_1, 1, "\x00\xFF\0\0\0\0\0",
2417                                0, 255, -1, -1, -1, 1, 0, 0);
2418   PDUMPER_REMEMBER_SCALAR (charset_iso_8859_1);
2419 
2420   charset_unicode
2421     = define_charset_internal (Qunicode, 3, "\x00\xFF\x00\xFF\x00\x10\0",
2422                                0, MAX_UNICODE_CHAR, -1, 0, -1, 1, 0, 0);
2423   PDUMPER_REMEMBER_SCALAR (charset_unicode);
2424 
2425   charset_emacs
2426     = define_charset_internal (Qemacs, 3, "\x00\xFF\x00\xFF\x00\x3F\0",
2427                                0, MAX_5_BYTE_CHAR, -1, 0, -1, 1, 1, 0);
2428   PDUMPER_REMEMBER_SCALAR (charset_emacs);
2429 
2430   charset_eight_bit
2431     = define_charset_internal (Qeight_bit, 1, "\x80\xFF\0\0\0\0\0",
2432 			       128, 255, -1, 0, -1, 0, 1,
2433                                MAX_5_BYTE_CHAR + 1);
2434   PDUMPER_REMEMBER_SCALAR (charset_eight_bit);
2435 
2436   charset_unibyte = charset_iso_8859_1;
2437   PDUMPER_REMEMBER_SCALAR (charset_unibyte);
2438 }
2439