1 /* Basic character set support.
2
3 Copyright (C) 2001-2021 Free Software Foundation, Inc.
4
5 Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
6 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H14PRO021
9
10 Copyright (C) 2003, 2004
11 National Institute of Advanced Industrial Science and Technology (AIST)
12 Registration Number H13PRO009
13
14 This file is part of GNU Emacs.
15
16 GNU Emacs is free software: you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation, either version 3 of the License, or (at
19 your option) any later version.
20
21 GNU Emacs is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License
27 along with GNU Emacs. If not, see <https://www.gnu.org/licenses/>. */
28
29 #include <config.h>
30
31 #include <errno.h>
32 #include <stdlib.h>
33 #include <unistd.h>
34 #include <limits.h>
35 #include <sys/types.h>
36 #include "lisp.h"
37 #include "character.h"
38 #include "charset.h"
39 #include "coding.h"
40 #include "buffer.h"
41 #include "sysstdio.h"
42 #include "pdumper.h"
43
44 /*** GENERAL NOTES on CODED CHARACTER SETS (CHARSETS) ***
45
46 A coded character set ("charset" hereafter) is a meaningful
47 collection (i.e. language, culture, functionality, etc.) of
48 characters. Emacs handles multiple charsets at once. In Emacs Lisp
49 code, a charset is represented by a symbol. In C code, a charset is
50 represented by its ID number or by a pointer to a struct charset.
51
52 The actual information about each charset is stored in two places.
53 Lispy information is stored in the hash table Vcharset_hash_table as
54 a vector (charset attributes). The other information is stored in
55 charset_table as a struct charset.
56
57 */
58
59 /* Hash table that contains attributes of each charset. Keys are
60 charset symbols, and values are vectors of charset attributes. */
61 Lisp_Object Vcharset_hash_table;
62
63 /* Table of struct charset. */
64 struct charset *charset_table;
65 int charset_table_size;
66 int charset_table_used;
67
68 /* Special charsets corresponding to symbols. */
69 int charset_ascii;
70 int charset_eight_bit;
71 static int charset_iso_8859_1;
72 int charset_unicode;
73 static int charset_emacs;
74
75 /* The other special charsets. */
76 int charset_jisx0201_roman;
77 int charset_jisx0208_1978;
78 int charset_jisx0208;
79 int charset_ksc5601;
80
81 /* Charset of unibyte characters. */
82 int charset_unibyte;
83
84 /* List of charsets ordered by the priority. */
85 Lisp_Object Vcharset_ordered_list;
86
87 /* Sub-list of Vcharset_ordered_list that contains all non-preferred
88 charsets. */
89 Lisp_Object Vcharset_non_preferred_head;
90
91 /* Incremented every time we change the priority of charsets.
92 Wraps around. */
93 EMACS_UINT charset_ordered_list_tick;
94
95 /* List of iso-2022 charsets. */
96 Lisp_Object Viso_2022_charset_list;
97
98 /* List of emacs-mule charsets. */
99 Lisp_Object Vemacs_mule_charset_list;
100
101 int emacs_mule_charset[256];
102
103 /* Mapping table from ISO2022's charset (specified by DIMENSION,
104 CHARS, and FINAL-CHAR) to Emacs' charset. */
105 int iso_charset_table[ISO_MAX_DIMENSION][ISO_MAX_CHARS][ISO_MAX_FINAL];
106
107 #define CODE_POINT_TO_INDEX(charset, code) \
108 ((charset)->code_linear_p \
109 ? (int) ((code) - (charset)->min_code) \
110 : (((charset)->code_space_mask[(code) >> 24] & 0x8) \
111 && ((charset)->code_space_mask[((code) >> 16) & 0xFF] & 0x4) \
112 && ((charset)->code_space_mask[((code) >> 8) & 0xFF] & 0x2) \
113 && ((charset)->code_space_mask[(code) & 0xFF] & 0x1)) \
114 ? (int) (((((code) >> 24) - (charset)->code_space[12]) \
115 * (charset)->code_space[11]) \
116 + (((((code) >> 16) & 0xFF) - (charset)->code_space[8]) \
117 * (charset)->code_space[7]) \
118 + (((((code) >> 8) & 0xFF) - (charset)->code_space[4]) \
119 * (charset)->code_space[3]) \
120 + (((code) & 0xFF) - (charset)->code_space[0]) \
121 - ((charset)->char_index_offset)) \
122 : -1)
123
124
125 /* Return the code-point for the character index IDX in CHARSET.
126 IDX should be an unsigned int variable in a valid range (which is
127 always in nonnegative int range too). IDX contains garbage afterwards. */
128
129 #define INDEX_TO_CODE_POINT(charset, idx) \
130 ((charset)->code_linear_p \
131 ? (idx) + (charset)->min_code \
132 : (idx += (charset)->char_index_offset, \
133 (((charset)->code_space[0] + (idx) % (charset)->code_space[2]) \
134 | (((charset)->code_space[4] \
135 + ((idx) / (charset)->code_space[3] % (charset)->code_space[6])) \
136 << 8) \
137 | (((charset)->code_space[8] \
138 + ((idx) / (charset)->code_space[7] % (charset)->code_space[10])) \
139 << 16) \
140 | (((charset)->code_space[12] + ((idx) / (charset)->code_space[11])) \
141 << 24))))
142
143 /* Structure to hold mapping tables for a charset. Used by temacs
144 invoked for dumping. */
145
146 static struct
147 {
148 /* The current charset for which the following tables are setup. */
149 struct charset *current;
150
151 /* 1 iff the following table is used for encoder. */
152 short for_encoder;
153
154 /* When the following table is used for encoding, minimum and
155 maximum character of the current charset. */
156 int min_char, max_char;
157
158 /* A Unicode character corresponding to the code index 0 (i.e. the
159 minimum code-point) of the current charset, or -1 if the code
160 index 0 is not a Unicode character. This is checked when
161 table.encoder[CHAR] is zero. */
162 int zero_index_char;
163
164 union {
165 /* Table mapping code-indices (not code-points) of the current
166 charset to Unicode characters. If decoder[CHAR] is -1, CHAR
167 doesn't belong to the current charset. */
168 int decoder[0x10000];
169 /* Table mapping Unicode characters to code-indices of the current
170 charset. The first 0x10000 elements are for BMP (0..0xFFFF),
171 and the last 0x10000 are for SMP (0x10000..0x1FFFF) or SIP
172 (0x20000..0x2FFFF). Note that there is no charset map that
173 uses both SMP and SIP. */
174 unsigned short encoder[0x20000];
175 } table;
176 } *temp_charset_work;
177
178 #define SET_TEMP_CHARSET_WORK_ENCODER(C, CODE) \
179 do { \
180 if ((CODE) == 0) \
181 temp_charset_work->zero_index_char = (C); \
182 else if ((C) < 0x20000) \
183 temp_charset_work->table.encoder[(C)] = (CODE); \
184 else \
185 temp_charset_work->table.encoder[(C) - 0x10000] = (CODE); \
186 } while (0)
187
188 #define GET_TEMP_CHARSET_WORK_ENCODER(C) \
189 ((C) == temp_charset_work->zero_index_char ? 0 \
190 : (C) < 0x20000 ? (temp_charset_work->table.encoder[(C)] \
191 ? (int) temp_charset_work->table.encoder[(C)] : -1) \
192 : temp_charset_work->table.encoder[(C) - 0x10000] \
193 ? temp_charset_work->table.encoder[(C) - 0x10000] : -1)
194
195 #define SET_TEMP_CHARSET_WORK_DECODER(C, CODE) \
196 (temp_charset_work->table.decoder[(CODE)] = (C))
197
198 #define GET_TEMP_CHARSET_WORK_DECODER(CODE) \
199 (temp_charset_work->table.decoder[(CODE)])
200
201
202 /* Set to 1 to warn that a charset map is loaded and thus a buffer
203 text and a string data may be relocated. */
204 bool charset_map_loaded;
205
206 struct charset_map_entries
207 {
208 struct {
209 unsigned from, to;
210 int c;
211 } entry[0x10000];
212 struct charset_map_entries *next;
213 };
214
215 /* Load the mapping information of CHARSET from ENTRIES for
216 initializing (CONTROL_FLAG == 0), decoding (CONTROL_FLAG == 1), and
217 encoding (CONTROL_FLAG == 2).
218
219 If CONTROL_FLAG is 0, setup CHARSET->min_char, CHARSET->max_char,
220 and CHARSET->fast_map.
221
222 If CONTROL_FLAG is 1, setup the following tables according to
223 CHARSET->method and inhibit_load_charset_map.
224
225 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
226 ----------------------+--------------------+---------------------------
227 CHARSET_METHOD_MAP | CHARSET->decoder | temp_charset_work->decoder
228 ----------------------+--------------------+---------------------------
229 CHARSET_METHOD_OFFSET | Vchar_unify_table | temp_charset_work->decoder
230
231 If CONTROL_FLAG is 2, setup the following tables.
232
233 CHARSET->method | inhibit_lcm == 0 | inhibit_lcm == 1
234 ----------------------+--------------------+---------------------------
235 CHARSET_METHOD_MAP | CHARSET->encoder | temp_charset_work->encoder
236 ----------------------+--------------------+--------------------------
237 CHARSET_METHOD_OFFSET | CHARSET->deunifier | temp_charset_work->encoder
238 */
239
240 static void
load_charset_map(struct charset * charset,struct charset_map_entries * entries,int n_entries,int control_flag)241 load_charset_map (struct charset *charset, struct charset_map_entries *entries, int n_entries, int control_flag)
242 {
243 Lisp_Object vec UNINIT;
244 Lisp_Object table UNINIT;
245 unsigned max_code = CHARSET_MAX_CODE (charset);
246 bool ascii_compatible_p = charset->ascii_compatible_p;
247 int min_char, max_char, nonascii_min_char;
248 int i;
249 unsigned char *fast_map = charset->fast_map;
250
251 if (n_entries <= 0)
252 return;
253
254 if (control_flag)
255 {
256 if (! inhibit_load_charset_map)
257 {
258 if (control_flag == 1)
259 {
260 if (charset->method == CHARSET_METHOD_MAP)
261 {
262 int n = CODE_POINT_TO_INDEX (charset, max_code) + 1;
263
264 vec = make_vector (n, make_fixnum (-1));
265 set_charset_attr (charset, charset_decoder, vec);
266 }
267 else
268 {
269 char_table_set_range (Vchar_unify_table,
270 charset->min_char, charset->max_char,
271 Qnil);
272 }
273 }
274 else
275 {
276 table = Fmake_char_table (Qnil, Qnil);
277 set_charset_attr (charset,
278 (charset->method == CHARSET_METHOD_MAP
279 ? charset_encoder : charset_deunifier),
280 table);
281 }
282 }
283 else
284 {
285 if (! temp_charset_work)
286 temp_charset_work = xmalloc (sizeof *temp_charset_work);
287 if (control_flag == 1)
288 {
289 memset (temp_charset_work->table.decoder, -1,
290 sizeof (int) * 0x10000);
291 }
292 else
293 {
294 memset (temp_charset_work->table.encoder, 0,
295 sizeof (unsigned short) * 0x20000);
296 temp_charset_work->zero_index_char = -1;
297 }
298 temp_charset_work->current = charset;
299 temp_charset_work->for_encoder = (control_flag == 2);
300 control_flag += 2;
301 }
302 charset_map_loaded = 1;
303 }
304
305 min_char = max_char = entries->entry[0].c;
306 nonascii_min_char = MAX_CHAR;
307 for (i = 0; i < n_entries; i++)
308 {
309 unsigned from, to;
310 int from_index, to_index, lim_index;
311 int from_c, to_c;
312 int idx = i % 0x10000;
313
314 if (i > 0 && idx == 0)
315 entries = entries->next;
316 from = entries->entry[idx].from;
317 to = entries->entry[idx].to;
318 from_c = entries->entry[idx].c;
319 from_index = CODE_POINT_TO_INDEX (charset, from);
320 if (from == to)
321 {
322 to_index = from_index;
323 to_c = from_c;
324 }
325 else
326 {
327 to_index = CODE_POINT_TO_INDEX (charset, to);
328 to_c = from_c + (to_index - from_index);
329 }
330 if (from_index < 0 || to_index < 0)
331 continue;
332 lim_index = to_index + 1;
333
334 if (to_c > max_char)
335 max_char = to_c;
336 else if (from_c < min_char)
337 min_char = from_c;
338
339 if (control_flag == 1)
340 {
341 if (charset->method == CHARSET_METHOD_MAP)
342 for (; from_index < lim_index; from_index++, from_c++)
343 ASET (vec, from_index, make_fixnum (from_c));
344 else
345 for (; from_index < lim_index; from_index++, from_c++)
346 CHAR_TABLE_SET (Vchar_unify_table,
347 CHARSET_CODE_OFFSET (charset) + from_index,
348 make_fixnum (from_c));
349 }
350 else if (control_flag == 2)
351 {
352 if (charset->method == CHARSET_METHOD_MAP
353 && CHARSET_COMPACT_CODES_P (charset))
354 for (; from_index < lim_index; from_index++, from_c++)
355 {
356 unsigned code = from_index;
357 code = INDEX_TO_CODE_POINT (charset, code);
358
359 if (NILP (CHAR_TABLE_REF (table, from_c)))
360 CHAR_TABLE_SET (table, from_c, make_fixnum (code));
361 }
362 else
363 for (; from_index < lim_index; from_index++, from_c++)
364 {
365 if (NILP (CHAR_TABLE_REF (table, from_c)))
366 CHAR_TABLE_SET (table, from_c, make_fixnum (from_index));
367 }
368 }
369 else if (control_flag == 3)
370 for (; from_index < lim_index; from_index++, from_c++)
371 SET_TEMP_CHARSET_WORK_DECODER (from_c, from_index);
372 else if (control_flag == 4)
373 for (; from_index < lim_index; from_index++, from_c++)
374 SET_TEMP_CHARSET_WORK_ENCODER (from_c, from_index);
375 else /* control_flag == 0 */
376 {
377 if (ascii_compatible_p)
378 {
379 if (! ASCII_CHAR_P (from_c))
380 {
381 if (from_c < nonascii_min_char)
382 nonascii_min_char = from_c;
383 }
384 else if (! ASCII_CHAR_P (to_c))
385 {
386 nonascii_min_char = 0x80;
387 }
388 }
389
390 for (; from_c <= to_c; from_c++)
391 CHARSET_FAST_MAP_SET (from_c, fast_map);
392 }
393 }
394
395 if (control_flag == 0)
396 {
397 CHARSET_MIN_CHAR (charset) = (ascii_compatible_p
398 ? nonascii_min_char : min_char);
399 CHARSET_MAX_CHAR (charset) = max_char;
400 }
401 else if (control_flag == 4)
402 {
403 temp_charset_work->min_char = min_char;
404 temp_charset_work->max_char = max_char;
405 }
406 }
407
408
409 /* Read a hexadecimal number (preceded by "0x") from the file FP while
410 paying attention to comment character '#'. LOOKAHEAD is the
411 lookahead byte if it is nonnegative. Store into *TERMINATOR the
412 input byte after the number, or EOF if an end-of-file or input
413 error occurred. Set *OVERFLOW if the number overflows. */
414
415 static unsigned
read_hex(FILE * fp,int lookahead,int * terminator,bool * overflow)416 read_hex (FILE *fp, int lookahead, int *terminator, bool *overflow)
417 {
418 int c = lookahead < 0 ? getc (fp) : lookahead;
419
420 while (true)
421 {
422 if (c == '#')
423 do
424 c = getc (fp);
425 while (0 <= c && c != '\n');
426 else if (c == '0')
427 {
428 c = getc (fp);
429 if (c < 0 || c == 'x')
430 break;
431 }
432 if (c < 0)
433 break;
434 c = getc (fp);
435 }
436
437 unsigned n = 0;
438 bool v = false;
439
440 if (0 <= c)
441 while (true)
442 {
443 c = getc (fp);
444 int digit = char_hexdigit (c);
445 if (digit < 0)
446 break;
447 v |= INT_LEFT_SHIFT_OVERFLOW (n, 4);
448 n = (n << 4) + digit;
449 }
450
451 *terminator = c;
452 *overflow |= v;
453 return n;
454 }
455
456 /* Return a mapping vector for CHARSET loaded from MAPFILE.
457 Each line of MAPFILE has this form
458 0xAAAA 0xCCCC
459 where 0xAAAA is a code-point and 0xCCCC is the corresponding
460 character code, or this form
461 0xAAAA-0xBBBB 0xCCCC
462 where 0xAAAA and 0xBBBB are code-points specifying a range, and
463 0xCCCC is the first character code of the range.
464
465 The returned vector has this form:
466 [ CODE1 CHAR1 CODE2 CHAR2 .... ]
467 where CODE1 is a code-point or a cons of code-points specifying a
468 range.
469
470 Note that this function uses `openp' to open MAPFILE but ignores
471 `file-name-handler-alist' to avoid running any Lisp code. */
472
473 static void
load_charset_map_from_file(struct charset * charset,Lisp_Object mapfile,int control_flag)474 load_charset_map_from_file (struct charset *charset, Lisp_Object mapfile,
475 int control_flag)
476 {
477 unsigned min_code = CHARSET_MIN_CODE (charset);
478 unsigned max_code = CHARSET_MAX_CODE (charset);
479 int fd;
480 FILE *fp;
481 struct charset_map_entries *head, *entries;
482 int n_entries;
483 AUTO_STRING (map, ".map");
484 AUTO_STRING (txt, ".txt");
485 AUTO_LIST2 (suffixes, map, txt);
486 ptrdiff_t count = SPECPDL_INDEX ();
487 record_unwind_protect_nothing ();
488 specbind (Qfile_name_handler_alist, Qnil);
489 fd = openp (Vcharset_map_path, mapfile, suffixes, NULL, Qnil, false, false);
490 fp = fd < 0 ? 0 : fdopen (fd, "r");
491 if (!fp)
492 {
493 int open_errno = errno;
494 emacs_close (fd);
495 report_file_errno ("Loading charset map", mapfile, open_errno);
496 }
497 set_unwind_protect_ptr (count, fclose_unwind, fp);
498 unbind_to (count + 1, Qnil);
499
500 /* Use record_xmalloc, as `charset_map_entries' is
501 large (larger than MAX_ALLOCA). */
502 head = record_xmalloc (sizeof *head);
503 entries = head;
504 memset (entries, 0, sizeof (struct charset_map_entries));
505
506 n_entries = 0;
507 int ch = -1;
508 while (true)
509 {
510 bool overflow = false;
511 unsigned from = read_hex (fp, ch, &ch, &overflow), to;
512 if (ch < 0)
513 break;
514 if (ch == '-')
515 {
516 to = read_hex (fp, -1, &ch, &overflow);
517 if (ch < 0)
518 break;
519 }
520 else
521 {
522 to = from;
523 ch = -1;
524 }
525 unsigned c = read_hex (fp, ch, &ch, &overflow);
526 if (ch < 0)
527 break;
528
529 if (overflow)
530 continue;
531 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
532 continue;
533
534 if (n_entries == 0x10000)
535 {
536 entries->next = record_xmalloc (sizeof *entries->next);
537 entries = entries->next;
538 memset (entries, 0, sizeof (struct charset_map_entries));
539 n_entries = 0;
540 }
541 int idx = n_entries;
542 entries->entry[idx].from = from;
543 entries->entry[idx].to = to;
544 entries->entry[idx].c = c;
545 n_entries++;
546 }
547 fclose (fp);
548 clear_unwind_protect (count);
549
550 load_charset_map (charset, head, n_entries, control_flag);
551 unbind_to (count, Qnil);
552 }
553
554 static void
load_charset_map_from_vector(struct charset * charset,Lisp_Object vec,int control_flag)555 load_charset_map_from_vector (struct charset *charset, Lisp_Object vec, int control_flag)
556 {
557 unsigned min_code = CHARSET_MIN_CODE (charset);
558 unsigned max_code = CHARSET_MAX_CODE (charset);
559 struct charset_map_entries *head, *entries;
560 int n_entries;
561 int len = ASIZE (vec);
562 int i;
563 USE_SAFE_ALLOCA;
564
565 if (len % 2 == 1)
566 {
567 add_to_log ("Failure in loading charset map: %V", vec);
568 return;
569 }
570
571 /* Use SAFE_ALLOCA instead of alloca, as `charset_map_entries' is
572 large (larger than MAX_ALLOCA). */
573 head = SAFE_ALLOCA (sizeof *head);
574 entries = head;
575 memset (entries, 0, sizeof (struct charset_map_entries));
576
577 n_entries = 0;
578 for (i = 0; i < len; i += 2)
579 {
580 Lisp_Object val, val2;
581 unsigned from, to;
582 EMACS_INT c;
583 int idx;
584
585 val = AREF (vec, i);
586 if (CONSP (val))
587 {
588 val2 = XCDR (val);
589 val = XCAR (val);
590 from = XFIXNAT (val);
591 to = XFIXNAT (val2);
592 }
593 else
594 from = to = XFIXNAT (val);
595 val = AREF (vec, i + 1);
596 CHECK_FIXNAT (val);
597 c = XFIXNAT (val);
598
599 if (from < min_code || to > max_code || from > to || c > MAX_CHAR)
600 continue;
601
602 if (n_entries > 0 && (n_entries % 0x10000) == 0)
603 {
604 entries->next = SAFE_ALLOCA (sizeof *entries->next);
605 entries = entries->next;
606 memset (entries, 0, sizeof (struct charset_map_entries));
607 }
608 idx = n_entries % 0x10000;
609 entries->entry[idx].from = from;
610 entries->entry[idx].to = to;
611 entries->entry[idx].c = c;
612 n_entries++;
613 }
614
615 load_charset_map (charset, head, n_entries, control_flag);
616 SAFE_FREE ();
617 }
618
619
620 /* Load a mapping table for CHARSET. CONTROL-FLAG tells what kind of
621 map it is (see the comment of load_charset_map for the detail). */
622
623 static void
load_charset(struct charset * charset,int control_flag)624 load_charset (struct charset *charset, int control_flag)
625 {
626 Lisp_Object map;
627
628 if (inhibit_load_charset_map
629 && temp_charset_work
630 && charset == temp_charset_work->current
631 && ((control_flag == 2) == temp_charset_work->for_encoder))
632 return;
633
634 if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
635 map = CHARSET_MAP (charset);
636 else
637 {
638 if (! CHARSET_UNIFIED_P (charset))
639 emacs_abort ();
640 map = CHARSET_UNIFY_MAP (charset);
641 }
642 if (STRINGP (map))
643 load_charset_map_from_file (charset, map, control_flag);
644 else
645 load_charset_map_from_vector (charset, map, control_flag);
646 }
647
648
649 DEFUN ("charsetp", Fcharsetp, Scharsetp, 1, 1, 0,
650 doc: /* Return non-nil if and only if OBJECT is a charset.*/)
651 (Lisp_Object object)
652 {
653 return (CHARSETP (object) ? Qt : Qnil);
654 }
655
656
657 static void
map_charset_for_dump(void (* c_function)(Lisp_Object,Lisp_Object),Lisp_Object function,Lisp_Object arg,unsigned int from,unsigned int to)658 map_charset_for_dump (void (*c_function) (Lisp_Object, Lisp_Object),
659 Lisp_Object function, Lisp_Object arg,
660 unsigned int from, unsigned int to)
661 {
662 int from_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, from);
663 int to_idx = CODE_POINT_TO_INDEX (temp_charset_work->current, to);
664 Lisp_Object range = Fcons (Qnil, Qnil);
665 int c, stop;
666
667 c = temp_charset_work->min_char;
668 stop = (temp_charset_work->max_char < 0x20000
669 ? temp_charset_work->max_char : 0xFFFF);
670
671 while (1)
672 {
673 int idx = GET_TEMP_CHARSET_WORK_ENCODER (c);
674
675 if (idx >= from_idx && idx <= to_idx)
676 {
677 if (NILP (XCAR (range)))
678 XSETCAR (range, make_fixnum (c));
679 }
680 else if (! NILP (XCAR (range)))
681 {
682 XSETCDR (range, make_fixnum (c - 1));
683 if (c_function)
684 (*c_function) (arg, range);
685 else
686 call2 (function, range, arg);
687 XSETCAR (range, Qnil);
688 }
689 if (c == stop)
690 {
691 if (c == temp_charset_work->max_char)
692 {
693 if (! NILP (XCAR (range)))
694 {
695 XSETCDR (range, make_fixnum (c));
696 if (c_function)
697 (*c_function) (arg, range);
698 else
699 call2 (function, range, arg);
700 }
701 break;
702 }
703 c = 0x1FFFF;
704 stop = temp_charset_work->max_char;
705 }
706 c++;
707 }
708 }
709
710 void
map_charset_chars(void (* c_function)(Lisp_Object,Lisp_Object),Lisp_Object function,Lisp_Object arg,struct charset * charset,unsigned from,unsigned to)711 map_charset_chars (void (*c_function)(Lisp_Object, Lisp_Object), Lisp_Object function,
712 Lisp_Object arg, struct charset *charset, unsigned from, unsigned to)
713 {
714 Lisp_Object range;
715 bool partial = (from > CHARSET_MIN_CODE (charset)
716 || to < CHARSET_MAX_CODE (charset));
717
718 if (CHARSET_METHOD (charset) == CHARSET_METHOD_OFFSET)
719 {
720 int from_idx = CODE_POINT_TO_INDEX (charset, from);
721 int to_idx = CODE_POINT_TO_INDEX (charset, to);
722 int from_c = from_idx + CHARSET_CODE_OFFSET (charset);
723 int to_c = to_idx + CHARSET_CODE_OFFSET (charset);
724
725 if (CHARSET_UNIFIED_P (charset))
726 {
727 if (! CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
728 load_charset (charset, 2);
729 if (CHAR_TABLE_P (CHARSET_DEUNIFIER (charset)))
730 map_char_table_for_charset (c_function, function,
731 CHARSET_DEUNIFIER (charset), arg,
732 partial ? charset : NULL, from, to);
733 else
734 map_charset_for_dump (c_function, function, arg, from, to);
735 }
736
737 range = Fcons (make_fixnum (from_c), make_fixnum (to_c));
738 if (NILP (function))
739 (*c_function) (arg, range);
740 else
741 call2 (function, range, arg);
742 }
743 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_MAP)
744 {
745 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
746 load_charset (charset, 2);
747 if (CHAR_TABLE_P (CHARSET_ENCODER (charset)))
748 map_char_table_for_charset (c_function, function,
749 CHARSET_ENCODER (charset), arg,
750 partial ? charset : NULL, from, to);
751 else
752 map_charset_for_dump (c_function, function, arg, from, to);
753 }
754 else if (CHARSET_METHOD (charset) == CHARSET_METHOD_SUBSET)
755 {
756 Lisp_Object subset_info;
757 int offset;
758
759 subset_info = CHARSET_SUBSET (charset);
760 charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
761 offset = XFIXNUM (AREF (subset_info, 3));
762 from -= offset;
763 if (from < XFIXNAT (AREF (subset_info, 1)))
764 from = XFIXNAT (AREF (subset_info, 1));
765 to -= offset;
766 if (to > XFIXNAT (AREF (subset_info, 2)))
767 to = XFIXNAT (AREF (subset_info, 2));
768 map_charset_chars (c_function, function, arg, charset, from, to);
769 }
770 else /* i.e. CHARSET_METHOD_SUPERSET */
771 {
772 Lisp_Object parents;
773
774 for (parents = CHARSET_SUPERSET (charset); CONSP (parents);
775 parents = XCDR (parents))
776 {
777 int offset;
778 unsigned this_from, this_to;
779
780 charset = CHARSET_FROM_ID (XFIXNAT (XCAR (XCAR (parents))));
781 offset = XFIXNUM (XCDR (XCAR (parents)));
782 this_from = from > offset ? from - offset : 0;
783 this_to = to > offset ? to - offset : 0;
784 if (this_from < CHARSET_MIN_CODE (charset))
785 this_from = CHARSET_MIN_CODE (charset);
786 if (this_to > CHARSET_MAX_CODE (charset))
787 this_to = CHARSET_MAX_CODE (charset);
788 map_charset_chars (c_function, function, arg, charset,
789 this_from, this_to);
790 }
791 }
792 }
793
794 DEFUN ("map-charset-chars", Fmap_charset_chars, Smap_charset_chars, 2, 5, 0,
795 doc: /* Call FUNCTION for all characters in CHARSET.
796 FUNCTION is called with an argument RANGE and the optional 3rd
797 argument ARG.
798
799 RANGE is a cons (FROM . TO), where FROM and TO indicate a range of
800 characters contained in CHARSET.
801
802 The optional 4th and 5th arguments FROM-CODE and TO-CODE specify the
803 range of code points (in CHARSET) of target characters. Note that
804 these are not character codes, but code points in CHARSET; for the
805 difference see `decode-char' and `list-charset-chars'. */)
806 (Lisp_Object function, Lisp_Object charset, Lisp_Object arg, Lisp_Object from_code, Lisp_Object to_code)
807 {
808 struct charset *cs;
809 unsigned from, to;
810
811 CHECK_CHARSET_GET_CHARSET (charset, cs);
812 if (NILP (from_code))
813 from = CHARSET_MIN_CODE (cs);
814 else
815 {
816 from = XFIXNUM (from_code);
817 if (from < CHARSET_MIN_CODE (cs))
818 from = CHARSET_MIN_CODE (cs);
819 }
820 if (NILP (to_code))
821 to = CHARSET_MAX_CODE (cs);
822 else
823 {
824 to = XFIXNUM (to_code);
825 if (to > CHARSET_MAX_CODE (cs))
826 to = CHARSET_MAX_CODE (cs);
827 }
828 map_charset_chars (NULL, function, arg, cs, from, to);
829 return Qnil;
830 }
831
832
833 /* Define a charset according to the arguments. The Nth argument is
834 the Nth attribute of the charset (the last attribute `charset-id'
835 is not included). See the docstring of `define-charset' for the
836 detail. */
837
838 DEFUN ("define-charset-internal", Fdefine_charset_internal,
839 Sdefine_charset_internal, charset_arg_max, MANY, 0,
840 doc: /* For internal use only.
841 usage: (define-charset-internal ...) */)
842 (ptrdiff_t nargs, Lisp_Object *args)
843 {
844 /* Charset attr vector. */
845 Lisp_Object attrs;
846 Lisp_Object val;
847 Lisp_Object hash_code;
848 struct Lisp_Hash_Table *hash_table = XHASH_TABLE (Vcharset_hash_table);
849 int i, j;
850 struct charset charset;
851 int id;
852 int dimension;
853 bool new_definition_p;
854 int nchars;
855
856 memset (&charset, 0, sizeof (charset));
857
858 if (nargs != charset_arg_max)
859 Fsignal (Qwrong_number_of_arguments,
860 Fcons (intern ("define-charset-internal"),
861 make_fixnum (nargs)));
862
863 attrs = make_nil_vector (charset_attr_max);
864
865 CHECK_SYMBOL (args[charset_arg_name]);
866 ASET (attrs, charset_name, args[charset_arg_name]);
867
868 val = args[charset_arg_code_space];
869 for (i = 0, dimension = 0, nchars = 1; ; i++)
870 {
871 Lisp_Object min_byte_obj = Faref (val, make_fixnum (i * 2));
872 Lisp_Object max_byte_obj = Faref (val, make_fixnum (i * 2 + 1));
873 int min_byte = check_integer_range (min_byte_obj, 0, 255);
874 int max_byte = check_integer_range (max_byte_obj, min_byte, 255);
875 charset.code_space[i * 4] = min_byte;
876 charset.code_space[i * 4 + 1] = max_byte;
877 charset.code_space[i * 4 + 2] = max_byte - min_byte + 1;
878 if (max_byte > 0)
879 dimension = i + 1;
880 if (i == 3)
881 break;
882 nchars *= charset.code_space[i * 4 + 2];
883 charset.code_space[i * 4 + 3] = nchars;
884 }
885
886 val = args[charset_arg_dimension];
887 charset.dimension
888 = !NILP (val) ? check_integer_range (val, 1, 4) : dimension;
889
890 charset.code_linear_p
891 = (charset.dimension == 1
892 || (charset.code_space[2] == 256
893 && (charset.dimension == 2
894 || (charset.code_space[6] == 256
895 && (charset.dimension == 3
896 || charset.code_space[10] == 256)))));
897
898 if (! charset.code_linear_p)
899 {
900 charset.code_space_mask = xzalloc (256);
901 for (i = 0; i < 4; i++)
902 for (j = charset.code_space[i * 4]; j <= charset.code_space[i * 4 + 1];
903 j++)
904 charset.code_space_mask[j] |= (1 << i);
905 }
906
907 charset.iso_chars_96 = charset.code_space[2] == 96;
908
909 charset.min_code = (charset.code_space[0]
910 | (charset.code_space[4] << 8)
911 | (charset.code_space[8] << 16)
912 | ((unsigned) charset.code_space[12] << 24));
913 charset.max_code = (charset.code_space[1]
914 | (charset.code_space[5] << 8)
915 | (charset.code_space[9] << 16)
916 | ((unsigned) charset.code_space[13] << 24));
917 charset.char_index_offset = 0;
918
919 val = args[charset_arg_min_code];
920 if (! NILP (val))
921 {
922 unsigned code = cons_to_unsigned (val, UINT_MAX);
923
924 if (code < charset.min_code
925 || code > charset.max_code)
926 args_out_of_range_3 (INT_TO_INTEGER (charset.min_code),
927 INT_TO_INTEGER (charset.max_code), val);
928 charset.char_index_offset = CODE_POINT_TO_INDEX (&charset, code);
929 charset.min_code = code;
930 }
931
932 val = args[charset_arg_max_code];
933 if (! NILP (val))
934 {
935 unsigned code = cons_to_unsigned (val, UINT_MAX);
936
937 if (code < charset.min_code
938 || code > charset.max_code)
939 args_out_of_range_3 (INT_TO_INTEGER (charset.min_code),
940 INT_TO_INTEGER (charset.max_code), val);
941 charset.max_code = code;
942 }
943
944 charset.compact_codes_p = charset.max_code < 0x10000;
945
946 val = args[charset_arg_invalid_code];
947 if (NILP (val))
948 {
949 if (charset.min_code > 0)
950 charset.invalid_code = 0;
951 else
952 {
953 if (charset.max_code < UINT_MAX)
954 charset.invalid_code = charset.max_code + 1;
955 else
956 error ("Attribute :invalid-code must be specified");
957 }
958 }
959 else
960 charset.invalid_code = cons_to_unsigned (val, UINT_MAX);
961
962 val = args[charset_arg_iso_final];
963 if (NILP (val))
964 charset.iso_final = -1;
965 else
966 {
967 CHECK_FIXNUM (val);
968 if (XFIXNUM (val) < '0' || XFIXNUM (val) > 127)
969 error ("Invalid iso-final-char: %"pI"d", XFIXNUM (val));
970 charset.iso_final = XFIXNUM (val);
971 }
972
973 val = args[charset_arg_iso_revision];
974 charset.iso_revision = !NILP (val) ? check_integer_range (val, -1, 63) : -1;
975
976 val = args[charset_arg_emacs_mule_id];
977 if (NILP (val))
978 charset.emacs_mule_id = -1;
979 else
980 {
981 CHECK_FIXNAT (val);
982 if ((XFIXNUM (val) > 0 && XFIXNUM (val) <= 128) || XFIXNUM (val) >= 256)
983 error ("Invalid emacs-mule-id: %"pI"d", XFIXNUM (val));
984 charset.emacs_mule_id = XFIXNUM (val);
985 }
986
987 charset.ascii_compatible_p = ! NILP (args[charset_arg_ascii_compatible_p]);
988
989 charset.supplementary_p = ! NILP (args[charset_arg_supplementary_p]);
990
991 charset.unified_p = 0;
992
993 memset (charset.fast_map, 0, sizeof (charset.fast_map));
994
995 if (! NILP (args[charset_arg_code_offset]))
996 {
997 val = args[charset_arg_code_offset];
998 CHECK_CHARACTER (val);
999
1000 charset.method = CHARSET_METHOD_OFFSET;
1001 charset.code_offset = XFIXNUM (val);
1002
1003 i = CODE_POINT_TO_INDEX (&charset, charset.max_code);
1004 if (MAX_CHAR - charset.code_offset < i)
1005 error ("Unsupported max char: %d", charset.max_char);
1006 charset.max_char = i + charset.code_offset;
1007 i = CODE_POINT_TO_INDEX (&charset, charset.min_code);
1008 charset.min_char = i + charset.code_offset;
1009
1010 i = (charset.min_char >> 7) << 7;
1011 for (; i < 0x10000 && i <= charset.max_char; i += 128)
1012 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1013 i = (i >> 12) << 12;
1014 for (; i <= charset.max_char; i += 0x1000)
1015 CHARSET_FAST_MAP_SET (i, charset.fast_map);
1016 if (charset.code_offset == 0 && charset.max_char >= 0x80)
1017 charset.ascii_compatible_p = 1;
1018 }
1019 else if (! NILP (args[charset_arg_map]))
1020 {
1021 val = args[charset_arg_map];
1022 ASET (attrs, charset_map, val);
1023 charset.method = CHARSET_METHOD_MAP;
1024 }
1025 else if (! NILP (args[charset_arg_subset]))
1026 {
1027 Lisp_Object parent;
1028 Lisp_Object parent_min_code, parent_max_code, parent_code_offset;
1029 struct charset *parent_charset;
1030
1031 val = args[charset_arg_subset];
1032 parent = Fcar (val);
1033 CHECK_CHARSET_GET_CHARSET (parent, parent_charset);
1034 parent_min_code = Fnth (make_fixnum (1), val);
1035 CHECK_FIXNAT (parent_min_code);
1036 parent_max_code = Fnth (make_fixnum (2), val);
1037 CHECK_FIXNAT (parent_max_code);
1038 parent_code_offset = Fnth (make_fixnum (3), val);
1039 CHECK_FIXNUM (parent_code_offset);
1040 ASET (attrs, charset_subset,
1041 CALLN (Fvector, make_fixnum (parent_charset->id),
1042 parent_min_code, parent_max_code, parent_code_offset));
1043
1044 charset.method = CHARSET_METHOD_SUBSET;
1045 /* Here, we just copy the parent's fast_map. It's not accurate,
1046 but at least it works for quickly detecting which character
1047 DOESN'T belong to this charset. */
1048 memcpy (charset.fast_map, parent_charset->fast_map,
1049 sizeof charset.fast_map);
1050
1051 /* We also copy these for parents. */
1052 charset.min_char = parent_charset->min_char;
1053 charset.max_char = parent_charset->max_char;
1054 }
1055 else if (! NILP (args[charset_arg_superset]))
1056 {
1057 val = args[charset_arg_superset];
1058 charset.method = CHARSET_METHOD_SUPERSET;
1059 val = Fcopy_sequence (val);
1060 ASET (attrs, charset_superset, val);
1061
1062 charset.min_char = MAX_CHAR;
1063 charset.max_char = 0;
1064 for (; ! NILP (val); val = Fcdr (val))
1065 {
1066 Lisp_Object elt, car_part, cdr_part;
1067 int this_id, offset;
1068 struct charset *this_charset;
1069
1070 elt = Fcar (val);
1071 if (CONSP (elt))
1072 {
1073 car_part = XCAR (elt);
1074 cdr_part = XCDR (elt);
1075 CHECK_CHARSET_GET_ID (car_part, this_id);
1076 offset = check_integer_range (cdr_part, INT_MIN, INT_MAX);
1077 }
1078 else
1079 {
1080 CHECK_CHARSET_GET_ID (elt, this_id);
1081 offset = 0;
1082 }
1083 XSETCAR (val, Fcons (make_fixnum (this_id), make_fixnum (offset)));
1084
1085 this_charset = CHARSET_FROM_ID (this_id);
1086 if (charset.min_char > this_charset->min_char)
1087 charset.min_char = this_charset->min_char;
1088 if (charset.max_char < this_charset->max_char)
1089 charset.max_char = this_charset->max_char;
1090 for (i = 0; i < 190; i++)
1091 charset.fast_map[i] |= this_charset->fast_map[i];
1092 }
1093 }
1094 else
1095 error ("None of :code-offset, :map, :parents are specified");
1096
1097 val = args[charset_arg_unify_map];
1098 if (! NILP (val) && !STRINGP (val))
1099 CHECK_VECTOR (val);
1100 ASET (attrs, charset_unify_map, val);
1101
1102 CHECK_LIST (args[charset_arg_plist]);
1103 ASET (attrs, charset_plist, args[charset_arg_plist]);
1104
1105 charset.hash_index = hash_lookup (hash_table, args[charset_arg_name],
1106 &hash_code);
1107 if (charset.hash_index >= 0)
1108 {
1109 new_definition_p = 0;
1110 id = XFIXNAT (CHARSET_SYMBOL_ID (args[charset_arg_name]));
1111 set_hash_value_slot (hash_table, charset.hash_index, attrs);
1112 }
1113 else
1114 {
1115 charset.hash_index = hash_put (hash_table, args[charset_arg_name], attrs,
1116 hash_code);
1117 if (charset_table_used == charset_table_size)
1118 {
1119 /* Ensure that charset IDs fit into 'int' as well as into the
1120 restriction imposed by fixnums. Although the 'int' restriction
1121 could be removed, too much other code would need altering; for
1122 example, the IDs are stuffed into struct
1123 coding_system.charbuf[i] entries, which are 'int'. */
1124 int old_size = charset_table_size;
1125 ptrdiff_t new_size = old_size;
1126 struct charset *new_table =
1127 xpalloc (0, &new_size, 1,
1128 min (INT_MAX, MOST_POSITIVE_FIXNUM),
1129 sizeof *charset_table);
1130 memcpy (new_table, charset_table, old_size * sizeof *new_table);
1131 charset_table = new_table;
1132 charset_table_size = new_size;
1133 /* FIXME: This leaks memory, as the old charset_table becomes
1134 unreachable. If the old charset table is charset_table_init
1135 then this leak is intentional; otherwise, it's unclear.
1136 If the latter memory leak is intentional, a
1137 comment should be added to explain this. If not, the old
1138 charset_table should be freed, by passing it as the 1st argument
1139 to xpalloc and removing the memcpy. */
1140 }
1141 id = charset_table_used++;
1142 new_definition_p = 1;
1143 }
1144
1145 ASET (attrs, charset_id, make_fixnum (id));
1146 charset.id = id;
1147 charset_table[id] = charset;
1148
1149 if (charset.method == CHARSET_METHOD_MAP)
1150 {
1151 load_charset (&charset, 0);
1152 charset_table[id] = charset;
1153 }
1154
1155 if (charset.iso_final >= 0)
1156 {
1157 ISO_CHARSET_TABLE (charset.dimension, charset.iso_chars_96,
1158 charset.iso_final) = id;
1159 if (new_definition_p)
1160 Viso_2022_charset_list = nconc2 (Viso_2022_charset_list, list1i (id));
1161 if (ISO_CHARSET_TABLE (1, 0, 'J') == id)
1162 charset_jisx0201_roman = id;
1163 else if (ISO_CHARSET_TABLE (2, 0, '@') == id)
1164 charset_jisx0208_1978 = id;
1165 else if (ISO_CHARSET_TABLE (2, 0, 'B') == id)
1166 charset_jisx0208 = id;
1167 else if (ISO_CHARSET_TABLE (2, 0, 'C') == id)
1168 charset_ksc5601 = id;
1169 }
1170
1171 if (charset.emacs_mule_id >= 0)
1172 {
1173 emacs_mule_charset[charset.emacs_mule_id] = id;
1174 if (charset.emacs_mule_id < 0xA0)
1175 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 1;
1176 else
1177 emacs_mule_bytes[charset.emacs_mule_id] = charset.dimension + 2;
1178 if (new_definition_p)
1179 Vemacs_mule_charset_list = nconc2 (Vemacs_mule_charset_list,
1180 list1i (id));
1181 }
1182
1183 if (new_definition_p)
1184 {
1185 Vcharset_list = Fcons (args[charset_arg_name], Vcharset_list);
1186 if (charset.supplementary_p)
1187 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list, list1i (id));
1188 else
1189 {
1190 Lisp_Object tail;
1191
1192 for (tail = Vcharset_ordered_list; CONSP (tail); tail = XCDR (tail))
1193 {
1194 struct charset *cs = CHARSET_FROM_ID (XFIXNUM (XCAR (tail)));
1195
1196 if (cs->supplementary_p)
1197 break;
1198 }
1199 if (EQ (tail, Vcharset_ordered_list))
1200 Vcharset_ordered_list = Fcons (make_fixnum (id),
1201 Vcharset_ordered_list);
1202 else if (NILP (tail))
1203 Vcharset_ordered_list = nconc2 (Vcharset_ordered_list,
1204 list1i (id));
1205 else
1206 {
1207 val = Fcons (XCAR (tail), XCDR (tail));
1208 XSETCDR (tail, val);
1209 XSETCAR (tail, make_fixnum (id));
1210 }
1211 }
1212 charset_ordered_list_tick++;
1213 }
1214
1215 return Qnil;
1216 }
1217
1218
1219 /* Same as Fdefine_charset_internal but arguments are more convenient
1220 to call from C (typically in syms_of_charset). This can define a
1221 charset of `offset' method only. Return the ID of the new
1222 charset. */
1223
1224 static int
define_charset_internal(Lisp_Object name,int dimension,const char * code_space_chars,unsigned min_code,unsigned max_code,int iso_final,int iso_revision,int emacs_mule_id,bool ascii_compatible,bool supplementary,int code_offset)1225 define_charset_internal (Lisp_Object name,
1226 int dimension,
1227 const char *code_space_chars,
1228 unsigned min_code, unsigned max_code,
1229 int iso_final, int iso_revision, int emacs_mule_id,
1230 bool ascii_compatible, bool supplementary,
1231 int code_offset)
1232 {
1233 const unsigned char *code_space = (const unsigned char *) code_space_chars;
1234 Lisp_Object args[charset_arg_max];
1235 Lisp_Object val;
1236 int i;
1237
1238 args[charset_arg_name] = name;
1239 args[charset_arg_dimension] = make_fixnum (dimension);
1240 val = make_uninit_vector (8);
1241 for (i = 0; i < 8; i++)
1242 ASET (val, i, make_fixnum (code_space[i]));
1243 args[charset_arg_code_space] = val;
1244 args[charset_arg_min_code] = make_fixnum (min_code);
1245 args[charset_arg_max_code] = make_fixnum (max_code);
1246 args[charset_arg_iso_final]
1247 = (iso_final < 0 ? Qnil : make_fixnum (iso_final));
1248 args[charset_arg_iso_revision] = make_fixnum (iso_revision);
1249 args[charset_arg_emacs_mule_id]
1250 = (emacs_mule_id < 0 ? Qnil : make_fixnum (emacs_mule_id));
1251 args[charset_arg_ascii_compatible_p] = ascii_compatible ? Qt : Qnil;
1252 args[charset_arg_supplementary_p] = supplementary ? Qt : Qnil;
1253 args[charset_arg_invalid_code] = Qnil;
1254 args[charset_arg_code_offset] = make_fixnum (code_offset);
1255 args[charset_arg_map] = Qnil;
1256 args[charset_arg_subset] = Qnil;
1257 args[charset_arg_superset] = Qnil;
1258 args[charset_arg_unify_map] = Qnil;
1259
1260 args[charset_arg_plist] =
1261 list (QCname,
1262 args[charset_arg_name],
1263 intern_c_string (":dimension"),
1264 args[charset_arg_dimension],
1265 intern_c_string (":code-space"),
1266 args[charset_arg_code_space],
1267 intern_c_string (":iso-final-char"),
1268 args[charset_arg_iso_final],
1269 intern_c_string (":emacs-mule-id"),
1270 args[charset_arg_emacs_mule_id],
1271 QCascii_compatible_p,
1272 args[charset_arg_ascii_compatible_p],
1273 intern_c_string (":code-offset"),
1274 args[charset_arg_code_offset]);
1275 Fdefine_charset_internal (charset_arg_max, args);
1276
1277 return XFIXNUM (CHARSET_SYMBOL_ID (name));
1278 }
1279
1280
1281 DEFUN ("define-charset-alias", Fdefine_charset_alias,
1282 Sdefine_charset_alias, 2, 2, 0,
1283 doc: /* Define ALIAS as an alias for charset CHARSET. */)
1284 (Lisp_Object alias, Lisp_Object charset)
1285 {
1286 Lisp_Object attr;
1287
1288 CHECK_CHARSET_GET_ATTR (charset, attr);
1289 Fputhash (alias, attr, Vcharset_hash_table);
1290 Vcharset_list = Fcons (alias, Vcharset_list);
1291 return Qnil;
1292 }
1293
1294
1295 DEFUN ("charset-plist", Fcharset_plist, Scharset_plist, 1, 1, 0,
1296 doc: /* Return the property list of CHARSET. */)
1297 (Lisp_Object charset)
1298 {
1299 Lisp_Object attrs;
1300
1301 CHECK_CHARSET_GET_ATTR (charset, attrs);
1302 return CHARSET_ATTR_PLIST (attrs);
1303 }
1304
1305
1306 DEFUN ("set-charset-plist", Fset_charset_plist, Sset_charset_plist, 2, 2, 0,
1307 doc: /* Set CHARSET's property list to PLIST. */)
1308 (Lisp_Object charset, Lisp_Object plist)
1309 {
1310 Lisp_Object attrs;
1311
1312 CHECK_CHARSET_GET_ATTR (charset, attrs);
1313 ASET (attrs, charset_plist, plist);
1314 return plist;
1315 }
1316
1317
1318 DEFUN ("unify-charset", Funify_charset, Sunify_charset, 1, 3, 0,
1319 doc: /* Unify characters of CHARSET with Unicode.
1320 This means reading the relevant file and installing the table defined
1321 by CHARSET's `:unify-map' property.
1322
1323 Optional second arg UNIFY-MAP is a file name string or a vector. It has
1324 the same meaning as the `:unify-map' attribute in the function
1325 `define-charset' (which see).
1326
1327 Optional third argument DEUNIFY, if non-nil, means to de-unify CHARSET. */)
1328 (Lisp_Object charset, Lisp_Object unify_map, Lisp_Object deunify)
1329 {
1330 int id;
1331 struct charset *cs;
1332
1333 CHECK_CHARSET_GET_ID (charset, id);
1334 cs = CHARSET_FROM_ID (id);
1335 if (NILP (deunify)
1336 ? CHARSET_UNIFIED_P (cs) && ! NILP (CHARSET_DEUNIFIER (cs))
1337 : ! CHARSET_UNIFIED_P (cs))
1338 return Qnil;
1339
1340 CHARSET_UNIFIED_P (cs) = 0;
1341 if (NILP (deunify))
1342 {
1343 if (CHARSET_METHOD (cs) != CHARSET_METHOD_OFFSET
1344 || CHARSET_CODE_OFFSET (cs) < 0x110000)
1345 error ("Can't unify charset: %s", SDATA (SYMBOL_NAME (charset)));
1346 if (NILP (unify_map))
1347 unify_map = CHARSET_UNIFY_MAP (cs);
1348 else
1349 {
1350 if (! STRINGP (unify_map) && ! VECTORP (unify_map))
1351 signal_error ("Bad unify-map", unify_map);
1352 set_charset_attr (cs, charset_unify_map, unify_map);
1353 }
1354 if (NILP (Vchar_unify_table))
1355 Vchar_unify_table = Fmake_char_table (Qnil, Qnil);
1356 char_table_set_range (Vchar_unify_table,
1357 cs->min_char, cs->max_char, charset);
1358 CHARSET_UNIFIED_P (cs) = 1;
1359 }
1360 else if (CHAR_TABLE_P (Vchar_unify_table))
1361 {
1362 unsigned min_code = CHARSET_MIN_CODE (cs);
1363 unsigned max_code = CHARSET_MAX_CODE (cs);
1364 int min_char = DECODE_CHAR (cs, min_code);
1365 int max_char = DECODE_CHAR (cs, max_code);
1366
1367 char_table_set_range (Vchar_unify_table, min_char, max_char, Qnil);
1368 }
1369
1370 return Qnil;
1371 }
1372
1373 /* Check that DIMENSION, CHARS, and FINAL_CHAR specify a valid ISO charset.
1374 Return true if it's a 96-character set, false if 94. */
1375
1376 static bool
check_iso_charset_parameter(Lisp_Object dimension,Lisp_Object chars,Lisp_Object final_char)1377 check_iso_charset_parameter (Lisp_Object dimension, Lisp_Object chars,
1378 Lisp_Object final_char)
1379 {
1380 CHECK_FIXNUM (dimension);
1381 CHECK_FIXNUM (chars);
1382 CHECK_CHARACTER (final_char);
1383
1384 if (! (1 <= XFIXNUM (dimension) && XFIXNUM (dimension) <= 3))
1385 error ("Invalid DIMENSION %"pI"d, it should be 1, 2, or 3",
1386 XFIXNUM (dimension));
1387
1388 bool chars_flag = XFIXNUM (chars) == 96;
1389 if (! (chars_flag || XFIXNUM (chars) == 94))
1390 error ("Invalid CHARS %"pI"d, it should be 94 or 96", XFIXNUM (chars));
1391
1392 int final_ch = XFIXNAT (final_char);
1393 if (! ('0' <= final_ch && final_ch <= '~'))
1394 error ("Invalid FINAL-CHAR `%c', it should be `0'..`~'", final_ch);
1395
1396 return chars_flag;
1397 }
1398
1399 DEFUN ("get-unused-iso-final-char", Fget_unused_iso_final_char,
1400 Sget_unused_iso_final_char, 2, 2, 0,
1401 doc: /*
1402 Return an unused ISO final char for a charset of DIMENSION and CHARS.
1403 DIMENSION is the number of bytes to represent a character: 1 or 2.
1404 CHARS is the number of characters in a dimension: 94 or 96.
1405
1406 This final char is for private use, thus the range is `0' (48) .. `?' (63).
1407 If there's no unused final char for the specified kind of charset,
1408 return nil. */)
1409 (Lisp_Object dimension, Lisp_Object chars)
1410 {
1411 bool chars_flag = check_iso_charset_parameter (dimension, chars,
1412 make_fixnum ('0'));
1413 for (int final_char = '0'; final_char <= '?'; final_char++)
1414 if (ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag, final_char) < 0)
1415 return make_fixnum (final_char);
1416 return Qnil;
1417 }
1418
1419
1420 DEFUN ("declare-equiv-charset", Fdeclare_equiv_charset, Sdeclare_equiv_charset,
1421 4, 4, 0,
1422 doc: /* Declare an equivalent charset for ISO-2022 decoding.
1423
1424 On decoding by an ISO-2022 base coding system, when a charset
1425 specified by DIMENSION, CHARS, and FINAL-CHAR is designated, behave as
1426 if CHARSET is designated instead. */)
1427 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char, Lisp_Object charset)
1428 {
1429 int id;
1430
1431 CHECK_CHARSET_GET_ID (charset, id);
1432 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
1433 ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag, XFIXNAT (final_char)) = id;
1434 return Qnil;
1435 }
1436
1437
1438 /* Return information about charsets in the text at PTR of NBYTES
1439 bytes, which are NCHARS characters. The value is:
1440
1441 0: Each character is represented by one byte. This is always
1442 true for a unibyte string. For a multibyte string, true if
1443 it contains only ASCII characters.
1444
1445 1: No charsets other than ascii, control-1, and latin-1 are
1446 found.
1447
1448 2: Otherwise.
1449 */
1450
1451 int
string_xstring_p(Lisp_Object string)1452 string_xstring_p (Lisp_Object string)
1453 {
1454 const unsigned char *p = SDATA (string);
1455 const unsigned char *endp = p + SBYTES (string);
1456
1457 if (SCHARS (string) == SBYTES (string))
1458 return 0;
1459
1460 while (p < endp)
1461 {
1462 int c = string_char_advance (&p);
1463
1464 if (c >= 0x100)
1465 return 2;
1466 }
1467 return 1;
1468 }
1469
1470
1471 /* Find charsets in the string at PTR of NCHARS and NBYTES.
1472
1473 CHARSETS is a vector. If Nth element is non-nil, it means the
1474 charset whose id is N is already found.
1475
1476 It may lookup a translation table TABLE if supplied. */
1477
1478 static void
find_charsets_in_text(const unsigned char * ptr,ptrdiff_t nchars,ptrdiff_t nbytes,Lisp_Object charsets,Lisp_Object table,bool multibyte)1479 find_charsets_in_text (const unsigned char *ptr, ptrdiff_t nchars,
1480 ptrdiff_t nbytes, Lisp_Object charsets,
1481 Lisp_Object table, bool multibyte)
1482 {
1483 const unsigned char *pend = ptr + nbytes;
1484
1485 if (nchars == nbytes)
1486 {
1487 if (multibyte)
1488 ASET (charsets, charset_ascii, Qt);
1489 else
1490 while (ptr < pend)
1491 {
1492 int c = *ptr++;
1493
1494 if (!NILP (table))
1495 c = translate_char (table, c);
1496 if (ASCII_CHAR_P (c))
1497 ASET (charsets, charset_ascii, Qt);
1498 else
1499 ASET (charsets, charset_eight_bit, Qt);
1500 }
1501 }
1502 else
1503 {
1504 while (ptr < pend)
1505 {
1506 int c = string_char_advance (&ptr);
1507 struct charset *charset;
1508
1509 if (!NILP (table))
1510 c = translate_char (table, c);
1511 charset = CHAR_CHARSET (c);
1512 ASET (charsets, CHARSET_ID (charset), Qt);
1513 }
1514 }
1515 }
1516
1517 DEFUN ("find-charset-region", Ffind_charset_region, Sfind_charset_region,
1518 2, 3, 0,
1519 doc: /* Return a list of charsets in the region between BEG and END.
1520 BEG and END are buffer positions.
1521 Optional arg TABLE if non-nil is a translation table to look up.
1522
1523 If the current buffer is unibyte, the returned list may contain
1524 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1525 (Lisp_Object beg, Lisp_Object end, Lisp_Object table)
1526 {
1527 Lisp_Object charsets;
1528 ptrdiff_t from, from_byte, to, stop, stop_byte;
1529 int i;
1530 Lisp_Object val;
1531 bool multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
1532
1533 validate_region (&beg, &end);
1534 from = XFIXNAT (beg);
1535 stop = to = XFIXNAT (end);
1536
1537 if (from < GPT && GPT < to)
1538 {
1539 stop = GPT;
1540 stop_byte = GPT_BYTE;
1541 }
1542 else
1543 stop_byte = CHAR_TO_BYTE (stop);
1544
1545 from_byte = CHAR_TO_BYTE (from);
1546
1547 charsets = make_nil_vector (charset_table_used);
1548 while (1)
1549 {
1550 find_charsets_in_text (BYTE_POS_ADDR (from_byte), stop - from,
1551 stop_byte - from_byte, charsets, table,
1552 multibyte);
1553 if (stop < to)
1554 {
1555 from = stop, from_byte = stop_byte;
1556 stop = to, stop_byte = CHAR_TO_BYTE (stop);
1557 }
1558 else
1559 break;
1560 }
1561
1562 val = Qnil;
1563 for (i = charset_table_used - 1; i >= 0; i--)
1564 if (!NILP (AREF (charsets, i)))
1565 val = Fcons (CHARSET_NAME (charset_table + i), val);
1566 return val;
1567 }
1568
1569 DEFUN ("find-charset-string", Ffind_charset_string, Sfind_charset_string,
1570 1, 2, 0,
1571 doc: /* Return a list of charsets in STR.
1572 Optional arg TABLE if non-nil is a translation table to look up.
1573
1574 If STR is unibyte, the returned list may contain
1575 only `ascii', `eight-bit-control', and `eight-bit-graphic'. */)
1576 (Lisp_Object str, Lisp_Object table)
1577 {
1578 CHECK_STRING (str);
1579
1580 Lisp_Object charsets = make_nil_vector (charset_table_used);
1581 find_charsets_in_text (SDATA (str), SCHARS (str), SBYTES (str),
1582 charsets, table,
1583 STRING_MULTIBYTE (str));
1584 Lisp_Object val = Qnil;
1585 for (int i = charset_table_used - 1; i >= 0; i--)
1586 if (!NILP (AREF (charsets, i)))
1587 val = Fcons (CHARSET_NAME (charset_table + i), val);
1588 return val;
1589 }
1590
1591
1592
1593 /* Return a unified character code for C (>= 0x110000). VAL is a
1594 value of Vchar_unify_table for C; i.e. it is nil, an integer, or a
1595 charset symbol. */
1596 static int
maybe_unify_char(int c,Lisp_Object val)1597 maybe_unify_char (int c, Lisp_Object val)
1598 {
1599 struct charset *charset;
1600
1601 if (FIXNUMP (val))
1602 return XFIXNAT (val);
1603 if (NILP (val))
1604 return c;
1605
1606 CHECK_CHARSET_GET_CHARSET (val, charset);
1607 #ifdef REL_ALLOC
1608 /* The call to load_charset below can allocate memory, which screws
1609 callers of this function through STRING_CHAR_* macros that hold C
1610 pointers to buffer text, if REL_ALLOC is used. */
1611 r_alloc_inhibit_buffer_relocation (1);
1612 #endif
1613 load_charset (charset, 1);
1614 if (! inhibit_load_charset_map)
1615 {
1616 val = CHAR_TABLE_REF (Vchar_unify_table, c);
1617 if (! NILP (val))
1618 c = XFIXNAT (val);
1619 }
1620 else
1621 {
1622 int code_index = c - CHARSET_CODE_OFFSET (charset);
1623 int unified = GET_TEMP_CHARSET_WORK_DECODER (code_index);
1624
1625 if (unified > 0)
1626 c = unified;
1627 }
1628 #ifdef REL_ALLOC
1629 r_alloc_inhibit_buffer_relocation (0);
1630 #endif
1631 return c;
1632 }
1633
1634
1635 /* Return a character corresponding to the code-point CODE of
1636 CHARSET. */
1637
1638 int
decode_char(struct charset * charset,unsigned int code)1639 decode_char (struct charset *charset, unsigned int code)
1640 {
1641 int c, char_index;
1642 enum charset_method method = CHARSET_METHOD (charset);
1643
1644 if (code < CHARSET_MIN_CODE (charset) || code > CHARSET_MAX_CODE (charset))
1645 return -1;
1646
1647 if (method == CHARSET_METHOD_SUBSET)
1648 {
1649 Lisp_Object subset_info;
1650
1651 subset_info = CHARSET_SUBSET (charset);
1652 charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
1653 code -= XFIXNUM (AREF (subset_info, 3));
1654 if (code < XFIXNAT (AREF (subset_info, 1))
1655 || code > XFIXNAT (AREF (subset_info, 2)))
1656 c = -1;
1657 else
1658 c = DECODE_CHAR (charset, code);
1659 }
1660 else if (method == CHARSET_METHOD_SUPERSET)
1661 {
1662 Lisp_Object parents;
1663
1664 parents = CHARSET_SUPERSET (charset);
1665 c = -1;
1666 for (; CONSP (parents); parents = XCDR (parents))
1667 {
1668 int id = XFIXNUM (XCAR (XCAR (parents)));
1669 int code_offset = XFIXNUM (XCDR (XCAR (parents)));
1670 unsigned this_code = code - code_offset;
1671
1672 charset = CHARSET_FROM_ID (id);
1673 if ((c = DECODE_CHAR (charset, this_code)) >= 0)
1674 break;
1675 }
1676 }
1677 else
1678 {
1679 char_index = CODE_POINT_TO_INDEX (charset, code);
1680 if (char_index < 0)
1681 return -1;
1682
1683 if (method == CHARSET_METHOD_MAP)
1684 {
1685 Lisp_Object decoder;
1686
1687 decoder = CHARSET_DECODER (charset);
1688 if (! VECTORP (decoder))
1689 {
1690 load_charset (charset, 1);
1691 decoder = CHARSET_DECODER (charset);
1692 }
1693 if (VECTORP (decoder))
1694 c = XFIXNUM (AREF (decoder, char_index));
1695 else
1696 c = GET_TEMP_CHARSET_WORK_DECODER (char_index);
1697 }
1698 else /* method == CHARSET_METHOD_OFFSET */
1699 {
1700 c = char_index + CHARSET_CODE_OFFSET (charset);
1701 if (CHARSET_UNIFIED_P (charset)
1702 && MAX_UNICODE_CHAR < c && c <= MAX_5_BYTE_CHAR)
1703 {
1704 /* Unify C with a Unicode character if possible. */
1705 Lisp_Object val = CHAR_TABLE_REF (Vchar_unify_table, c);
1706 c = maybe_unify_char (c, val);
1707 }
1708 }
1709 }
1710
1711 return c;
1712 }
1713
1714 /* Variable used temporarily by the macro ENCODE_CHAR. */
1715 Lisp_Object charset_work;
1716
1717 /* Return a code-point of C in CHARSET. If C doesn't belong to
1718 CHARSET, return CHARSET_INVALID_CODE (CHARSET). If STRICT is true,
1719 use CHARSET's strict_max_char instead of max_char. */
1720
1721 unsigned
encode_char(struct charset * charset,int c)1722 encode_char (struct charset *charset, int c)
1723 {
1724 unsigned code;
1725 enum charset_method method = CHARSET_METHOD (charset);
1726
1727 if (CHARSET_UNIFIED_P (charset))
1728 {
1729 Lisp_Object deunifier;
1730 int code_index = -1;
1731
1732 deunifier = CHARSET_DEUNIFIER (charset);
1733 if (! CHAR_TABLE_P (deunifier))
1734 {
1735 load_charset (charset, 2);
1736 deunifier = CHARSET_DEUNIFIER (charset);
1737 }
1738 if (CHAR_TABLE_P (deunifier))
1739 {
1740 Lisp_Object deunified = CHAR_TABLE_REF (deunifier, c);
1741
1742 if (FIXNUMP (deunified))
1743 code_index = XFIXNUM (deunified);
1744 }
1745 else
1746 {
1747 code_index = GET_TEMP_CHARSET_WORK_ENCODER (c);
1748 }
1749 if (code_index >= 0)
1750 c = CHARSET_CODE_OFFSET (charset) + code_index;
1751 }
1752
1753 if (method == CHARSET_METHOD_SUBSET)
1754 {
1755 Lisp_Object subset_info;
1756 struct charset *this_charset;
1757
1758 subset_info = CHARSET_SUBSET (charset);
1759 this_charset = CHARSET_FROM_ID (XFIXNAT (AREF (subset_info, 0)));
1760 code = ENCODE_CHAR (this_charset, c);
1761 if (code == CHARSET_INVALID_CODE (this_charset)
1762 || code < XFIXNAT (AREF (subset_info, 1))
1763 || code > XFIXNAT (AREF (subset_info, 2)))
1764 return CHARSET_INVALID_CODE (charset);
1765 code += XFIXNUM (AREF (subset_info, 3));
1766 return code;
1767 }
1768
1769 if (method == CHARSET_METHOD_SUPERSET)
1770 {
1771 Lisp_Object parents;
1772
1773 parents = CHARSET_SUPERSET (charset);
1774 for (; CONSP (parents); parents = XCDR (parents))
1775 {
1776 int id = XFIXNUM (XCAR (XCAR (parents)));
1777 int code_offset = XFIXNUM (XCDR (XCAR (parents)));
1778 struct charset *this_charset = CHARSET_FROM_ID (id);
1779
1780 code = ENCODE_CHAR (this_charset, c);
1781 if (code != CHARSET_INVALID_CODE (this_charset))
1782 return code + code_offset;
1783 }
1784 return CHARSET_INVALID_CODE (charset);
1785 }
1786
1787 if (! CHARSET_FAST_MAP_REF ((c), charset->fast_map)
1788 || c < CHARSET_MIN_CHAR (charset) || c > CHARSET_MAX_CHAR (charset))
1789 return CHARSET_INVALID_CODE (charset);
1790
1791 if (method == CHARSET_METHOD_MAP)
1792 {
1793 Lisp_Object encoder;
1794 Lisp_Object val;
1795
1796 encoder = CHARSET_ENCODER (charset);
1797 if (! CHAR_TABLE_P (CHARSET_ENCODER (charset)))
1798 {
1799 load_charset (charset, 2);
1800 encoder = CHARSET_ENCODER (charset);
1801 }
1802 if (CHAR_TABLE_P (encoder))
1803 {
1804 val = CHAR_TABLE_REF (encoder, c);
1805 if (NILP (val))
1806 return CHARSET_INVALID_CODE (charset);
1807 code = XFIXNUM (val);
1808 if (! CHARSET_COMPACT_CODES_P (charset))
1809 code = INDEX_TO_CODE_POINT (charset, code);
1810 }
1811 else
1812 {
1813 code = GET_TEMP_CHARSET_WORK_ENCODER (c);
1814 code = INDEX_TO_CODE_POINT (charset, code);
1815 }
1816 }
1817 else /* method == CHARSET_METHOD_OFFSET */
1818 {
1819 unsigned code_index = c - CHARSET_CODE_OFFSET (charset);
1820
1821 code = INDEX_TO_CODE_POINT (charset, code_index);
1822 }
1823
1824 return code;
1825 }
1826
1827
1828 DEFUN ("decode-char", Fdecode_char, Sdecode_char, 2, 2, 0,
1829 doc: /* Decode the pair of CHARSET and CODE-POINT into a character.
1830 Return nil if CODE-POINT is not valid in CHARSET.
1831
1832 CODE-POINT may be a cons (HIGHER-16-BIT-VALUE . LOWER-16-BIT-VALUE),
1833 although this usage is obsolescent. */)
1834 (Lisp_Object charset, Lisp_Object code_point)
1835 {
1836 int c, id;
1837 unsigned code;
1838 struct charset *charsetp;
1839
1840 CHECK_CHARSET_GET_ID (charset, id);
1841 code = cons_to_unsigned (code_point, UINT_MAX);
1842 charsetp = CHARSET_FROM_ID (id);
1843 c = DECODE_CHAR (charsetp, code);
1844 return (c >= 0 ? make_fixnum (c) : Qnil);
1845 }
1846
1847
1848 DEFUN ("encode-char", Fencode_char, Sencode_char, 2, 2, 0,
1849 doc: /* Encode the character CH into a code-point of CHARSET.
1850 Return the encoded code-point, a fixnum if its value is small enough,
1851 otherwise a bignum.
1852 Return nil if CHARSET doesn't support CH. */)
1853 (Lisp_Object ch, Lisp_Object charset)
1854 {
1855 int c, id;
1856 unsigned code;
1857 struct charset *charsetp;
1858
1859 CHECK_CHARSET_GET_ID (charset, id);
1860 CHECK_CHARACTER (ch);
1861 c = XFIXNAT (ch);
1862 charsetp = CHARSET_FROM_ID (id);
1863 code = ENCODE_CHAR (charsetp, c);
1864 if (code == CHARSET_INVALID_CODE (charsetp))
1865 return Qnil;
1866 /* There are much fewer codepoints in the world than we have positive
1867 fixnums, so it could be argued that we never really need a bignum,
1868 e.g. Unicode codepoints only need 21bit, and China's GB-10830
1869 can fit in 22bit. Yet we encode GB-10830's chars in a sparse way
1870 (we just take the 4byte sequences as a 32bit int), so some
1871 GB-10830 chars (such as 0x81308130 in etc/charsets/gb108304.map) end
1872 up represented as bignums if EMACS_INT is 32 bits. */
1873 return INT_TO_INTEGER (code);
1874 }
1875
1876
1877 DEFUN ("make-char", Fmake_char, Smake_char, 1, 5, 0,
1878 doc:
1879 /* Return a character of CHARSET whose position codes are CODEn.
1880
1881 CODE1 through CODE4 are optional, but if you don't supply sufficient
1882 position codes, it is assumed that the minimum code in each dimension
1883 is specified. */)
1884 (Lisp_Object charset, Lisp_Object code1, Lisp_Object code2, Lisp_Object code3, Lisp_Object code4)
1885 {
1886 int id, dimension;
1887 struct charset *charsetp;
1888 unsigned code;
1889 int c;
1890
1891 CHECK_CHARSET_GET_ID (charset, id);
1892 charsetp = CHARSET_FROM_ID (id);
1893
1894 dimension = CHARSET_DIMENSION (charsetp);
1895 if (NILP (code1))
1896 code = (CHARSET_ASCII_COMPATIBLE_P (charsetp)
1897 ? 0 : CHARSET_MIN_CODE (charsetp));
1898 else
1899 {
1900 CHECK_FIXNAT (code1);
1901 if (XFIXNAT (code1) >= 0x100)
1902 args_out_of_range (make_fixnum (0xFF), code1);
1903 code = XFIXNAT (code1);
1904
1905 if (dimension > 1)
1906 {
1907 code <<= 8;
1908 if (NILP (code2))
1909 code |= charsetp->code_space[(dimension - 2) * 4];
1910 else
1911 {
1912 CHECK_FIXNAT (code2);
1913 if (XFIXNAT (code2) >= 0x100)
1914 args_out_of_range (make_fixnum (0xFF), code2);
1915 code |= XFIXNAT (code2);
1916 }
1917
1918 if (dimension > 2)
1919 {
1920 code <<= 8;
1921 if (NILP (code3))
1922 code |= charsetp->code_space[(dimension - 3) * 4];
1923 else
1924 {
1925 CHECK_FIXNAT (code3);
1926 if (XFIXNAT (code3) >= 0x100)
1927 args_out_of_range (make_fixnum (0xFF), code3);
1928 code |= XFIXNAT (code3);
1929 }
1930
1931 if (dimension > 3)
1932 {
1933 code <<= 8;
1934 if (NILP (code4))
1935 code |= charsetp->code_space[0];
1936 else
1937 {
1938 CHECK_FIXNAT (code4);
1939 if (XFIXNAT (code4) >= 0x100)
1940 args_out_of_range (make_fixnum (0xFF), code4);
1941 code |= XFIXNAT (code4);
1942 }
1943 }
1944 }
1945 }
1946 }
1947
1948 if (CHARSET_ISO_FINAL (charsetp) >= 0)
1949 code &= 0x7F7F7F7F;
1950 c = DECODE_CHAR (charsetp, code);
1951 if (c < 0)
1952 error ("Invalid code(s)");
1953 return make_fixnum (c);
1954 }
1955
1956
1957 /* Return the first charset in CHARSET_LIST that contains C.
1958 CHARSET_LIST is a list of charset IDs. If it is nil, use
1959 Vcharset_ordered_list. */
1960
1961 struct charset *
char_charset(int c,Lisp_Object charset_list,unsigned int * code_return)1962 char_charset (int c, Lisp_Object charset_list, unsigned int *code_return)
1963 {
1964 bool maybe_null = 0;
1965
1966 if (NILP (charset_list))
1967 charset_list = Vcharset_ordered_list;
1968 else
1969 maybe_null = 1;
1970
1971 while (CONSP (charset_list))
1972 {
1973 struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (charset_list)));
1974 unsigned code = ENCODE_CHAR (charset, c);
1975
1976 if (code != CHARSET_INVALID_CODE (charset))
1977 {
1978 if (code_return)
1979 *code_return = code;
1980 return charset;
1981 }
1982 charset_list = XCDR (charset_list);
1983 if (! maybe_null
1984 && c <= MAX_UNICODE_CHAR
1985 && EQ (charset_list, Vcharset_non_preferred_head))
1986 return CHARSET_FROM_ID (charset_unicode);
1987 }
1988 return (maybe_null ? NULL
1989 : c <= MAX_5_BYTE_CHAR ? CHARSET_FROM_ID (charset_emacs)
1990 : CHARSET_FROM_ID (charset_eight_bit));
1991 }
1992
1993
1994 DEFUN ("split-char", Fsplit_char, Ssplit_char, 1, 1, 0,
1995 doc:
1996 /*Return list of charset and one to four position-codes of CH.
1997 The charset is decided by the current priority order of charsets.
1998 A position-code is a byte value of each dimension of the code-point of
1999 CH in the charset. */)
2000 (Lisp_Object ch)
2001 {
2002 struct charset *charset;
2003 int c, dimension;
2004 unsigned code;
2005 Lisp_Object val;
2006
2007 CHECK_CHARACTER (ch);
2008 c = XFIXNAT (ch);
2009 charset = CHAR_CHARSET (c);
2010 if (! charset)
2011 emacs_abort ();
2012 code = ENCODE_CHAR (charset, c);
2013 if (code == CHARSET_INVALID_CODE (charset))
2014 emacs_abort ();
2015 dimension = CHARSET_DIMENSION (charset);
2016 for (val = Qnil; dimension > 0; dimension--)
2017 {
2018 val = Fcons (make_fixnum (code & 0xFF), val);
2019 code >>= 8;
2020 }
2021 return Fcons (CHARSET_NAME (charset), val);
2022 }
2023
2024
2025 DEFUN ("char-charset", Fchar_charset, Schar_charset, 1, 2, 0,
2026 doc: /* Return the charset of highest priority that contains CH.
2027 ASCII characters are an exception: for them, this function always
2028 returns `ascii'.
2029 If optional 2nd arg RESTRICTION is non-nil, it is a list of charsets
2030 from which to find the charset. It may also be a coding system. In
2031 that case, find the charset from what supported by that coding system. */)
2032 (Lisp_Object ch, Lisp_Object restriction)
2033 {
2034 struct charset *charset;
2035
2036 CHECK_CHARACTER (ch);
2037 if (NILP (restriction))
2038 charset = CHAR_CHARSET (XFIXNUM (ch));
2039 else
2040 {
2041 if (CONSP (restriction))
2042 {
2043 int c = XFIXNAT (ch);
2044
2045 for (; CONSP (restriction); restriction = XCDR (restriction))
2046 {
2047 struct charset *rcharset;
2048
2049 CHECK_CHARSET_GET_CHARSET (XCAR (restriction), rcharset);
2050 if (ENCODE_CHAR (rcharset, c) != CHARSET_INVALID_CODE (rcharset))
2051 return XCAR (restriction);
2052 }
2053 return Qnil;
2054 }
2055 restriction = coding_system_charset_list (restriction);
2056 charset = char_charset (XFIXNUM (ch), restriction, NULL);
2057 if (! charset)
2058 return Qnil;
2059 }
2060 return (CHARSET_NAME (charset));
2061 }
2062
2063
2064 DEFUN ("charset-after", Fcharset_after, Scharset_after, 0, 1, 0,
2065 doc: /*
2066 Return charset of a character in the current buffer at position POS.
2067 If POS is nil, it defaults to the current point.
2068 If POS is out of range, the value is nil. */)
2069 (Lisp_Object pos)
2070 {
2071 Lisp_Object ch;
2072 struct charset *charset;
2073
2074 ch = Fchar_after (pos);
2075 if (! FIXNUMP (ch))
2076 return ch;
2077 charset = CHAR_CHARSET (XFIXNUM (ch));
2078 return (CHARSET_NAME (charset));
2079 }
2080
2081
2082 DEFUN ("iso-charset", Fiso_charset, Siso_charset, 3, 3, 0,
2083 doc: /*
2084 Return charset of ISO's specification DIMENSION, CHARS, and FINAL-CHAR.
2085
2086 ISO 2022's designation sequence (escape sequence) distinguishes charsets
2087 by their DIMENSION, CHARS, and FINAL-CHAR,
2088 whereas Emacs distinguishes them by charset symbol.
2089 See the documentation of the function `charset-info' for the meanings of
2090 DIMENSION, CHARS, and FINAL-CHAR. */)
2091 (Lisp_Object dimension, Lisp_Object chars, Lisp_Object final_char)
2092 {
2093 bool chars_flag = check_iso_charset_parameter (dimension, chars, final_char);
2094 int id = ISO_CHARSET_TABLE (XFIXNUM (dimension), chars_flag,
2095 XFIXNAT (final_char));
2096 return (id >= 0 ? CHARSET_NAME (CHARSET_FROM_ID (id)) : Qnil);
2097 }
2098
2099
2100 DEFUN ("clear-charset-maps", Fclear_charset_maps, Sclear_charset_maps,
2101 0, 0, 0,
2102 doc: /*
2103 Internal use only.
2104 Clear temporary charset mapping tables.
2105 It should be called only from temacs invoked for dumping. */)
2106 (void)
2107 {
2108 if (temp_charset_work)
2109 {
2110 xfree (temp_charset_work);
2111 temp_charset_work = NULL;
2112 }
2113
2114 if (CHAR_TABLE_P (Vchar_unify_table))
2115 Foptimize_char_table (Vchar_unify_table, Qnil);
2116
2117 return Qnil;
2118 }
2119
2120 DEFUN ("charset-priority-list", Fcharset_priority_list,
2121 Scharset_priority_list, 0, 1, 0,
2122 doc: /* Return the list of charsets ordered by priority.
2123 HIGHESTP non-nil means just return the highest priority one. */)
2124 (Lisp_Object highestp)
2125 {
2126 Lisp_Object val = Qnil, list = Vcharset_ordered_list;
2127
2128 if (!NILP (highestp))
2129 return CHARSET_NAME (CHARSET_FROM_ID (XFIXNUM (Fcar (list))));
2130
2131 while (!NILP (list))
2132 {
2133 val = Fcons (CHARSET_NAME (CHARSET_FROM_ID (XFIXNUM (XCAR (list)))), val);
2134 list = XCDR (list);
2135 }
2136 return Fnreverse (val);
2137 }
2138
2139 DEFUN ("set-charset-priority", Fset_charset_priority, Sset_charset_priority,
2140 1, MANY, 0,
2141 doc: /* Assign higher priority to the charsets given as arguments.
2142 usage: (set-charset-priority &rest charsets) */)
2143 (ptrdiff_t nargs, Lisp_Object *args)
2144 {
2145 Lisp_Object new_head, old_list;
2146 Lisp_Object list_2022, list_emacs_mule;
2147 ptrdiff_t i;
2148 int id;
2149
2150 old_list = Fcopy_sequence (Vcharset_ordered_list);
2151 new_head = Qnil;
2152 for (i = 0; i < nargs; i++)
2153 {
2154 CHECK_CHARSET_GET_ID (args[i], id);
2155 if (! NILP (Fmemq (make_fixnum (id), old_list)))
2156 {
2157 old_list = Fdelq (make_fixnum (id), old_list);
2158 new_head = Fcons (make_fixnum (id), new_head);
2159 }
2160 }
2161 Vcharset_non_preferred_head = old_list;
2162 Vcharset_ordered_list = nconc2 (Fnreverse (new_head), old_list);
2163
2164 charset_ordered_list_tick++;
2165
2166 charset_unibyte = -1;
2167 for (old_list = Vcharset_ordered_list, list_2022 = list_emacs_mule = Qnil;
2168 CONSP (old_list); old_list = XCDR (old_list))
2169 {
2170 if (! NILP (Fmemq (XCAR (old_list), Viso_2022_charset_list)))
2171 list_2022 = Fcons (XCAR (old_list), list_2022);
2172 if (! NILP (Fmemq (XCAR (old_list), Vemacs_mule_charset_list)))
2173 list_emacs_mule = Fcons (XCAR (old_list), list_emacs_mule);
2174 if (charset_unibyte < 0)
2175 {
2176 struct charset *charset = CHARSET_FROM_ID (XFIXNUM (XCAR (old_list)));
2177
2178 if (CHARSET_DIMENSION (charset) == 1
2179 && CHARSET_ASCII_COMPATIBLE_P (charset)
2180 && CHARSET_MAX_CHAR (charset) >= 0x80)
2181 charset_unibyte = CHARSET_ID (charset);
2182 }
2183 }
2184 Viso_2022_charset_list = Fnreverse (list_2022);
2185 Vemacs_mule_charset_list = Fnreverse (list_emacs_mule);
2186 if (charset_unibyte < 0)
2187 charset_unibyte = charset_iso_8859_1;
2188
2189 return Qnil;
2190 }
2191
2192 DEFUN ("charset-id-internal", Fcharset_id_internal, Scharset_id_internal,
2193 0, 1, 0,
2194 doc: /* Internal use only.
2195 Return charset identification number of CHARSET. */)
2196 (Lisp_Object charset)
2197 {
2198 int id;
2199
2200 CHECK_CHARSET_GET_ID (charset, id);
2201 return make_fixnum (id);
2202 }
2203
2204 struct charset_sort_data
2205 {
2206 Lisp_Object charset;
2207 int id;
2208 ptrdiff_t priority;
2209 };
2210
2211 static int
charset_compare(const void * d1,const void * d2)2212 charset_compare (const void *d1, const void *d2)
2213 {
2214 const struct charset_sort_data *data1 = d1, *data2 = d2;
2215 if (data1->priority != data2->priority)
2216 return data1->priority < data2->priority ? -1 : 1;
2217 return 0;
2218 }
2219
2220 DEFUN ("sort-charsets", Fsort_charsets, Ssort_charsets, 1, 1, 0,
2221 doc: /* Sort charset list CHARSETS by a priority of each charset.
2222 Return the sorted list. CHARSETS is modified by side effects.
2223 See also `charset-priority-list' and `set-charset-priority'. */)
2224 (Lisp_Object charsets)
2225 {
2226 ptrdiff_t n = list_length (charsets), i, j;
2227 int done;
2228 Lisp_Object tail, elt, attrs;
2229 struct charset_sort_data *sort_data;
2230 int id, min_id = INT_MAX, max_id = INT_MIN;
2231 USE_SAFE_ALLOCA;
2232
2233 if (n == 0)
2234 return Qnil;
2235 SAFE_NALLOCA (sort_data, 1, n);
2236 for (tail = charsets, i = 0; CONSP (tail); tail = XCDR (tail), i++)
2237 {
2238 elt = XCAR (tail);
2239 CHECK_CHARSET_GET_ATTR (elt, attrs);
2240 sort_data[i].charset = elt;
2241 sort_data[i].id = id = XFIXNUM (CHARSET_ATTR_ID (attrs));
2242 if (id < min_id)
2243 min_id = id;
2244 if (id > max_id)
2245 max_id = id;
2246 }
2247 for (done = 0, tail = Vcharset_ordered_list, i = 0;
2248 done < n && CONSP (tail); tail = XCDR (tail), i++)
2249 {
2250 elt = XCAR (tail);
2251 id = XFIXNAT (elt);
2252 if (id >= min_id && id <= max_id)
2253 for (j = 0; j < n; j++)
2254 if (sort_data[j].id == id)
2255 {
2256 sort_data[j].priority = i;
2257 done++;
2258 }
2259 }
2260 qsort (sort_data, n, sizeof *sort_data, charset_compare);
2261 for (i = 0, tail = charsets; CONSP (tail); tail = XCDR (tail), i++)
2262 XSETCAR (tail, sort_data[i].charset);
2263 SAFE_FREE ();
2264 return charsets;
2265 }
2266
2267
2268 void
init_charset(void)2269 init_charset (void)
2270 {
2271 Lisp_Object tempdir;
2272 tempdir = Fexpand_file_name (build_string ("charsets"), Vdata_directory);
2273 if (! file_accessible_directory_p (tempdir))
2274 {
2275 /* This used to be non-fatal (dir_warning), but it should not
2276 happen, and if it does sooner or later it will cause some
2277 obscure problem (eg bug#6401), so better exit. */
2278 fprintf (stderr,
2279 ("Error: %s: %s\n"
2280 "Emacs will not function correctly "
2281 "without the character map files.\n"
2282 "%s"
2283 "Please check your installation!\n"),
2284 SDATA (tempdir), strerror (errno),
2285 (egetenv ("EMACSDATA")
2286 ? ("The EMACSDATA environment variable is set. "
2287 "Maybe it has the wrong value?\n")
2288 : ""));
2289 exit (1);
2290 }
2291
2292 Vcharset_map_path = list1 (tempdir);
2293 }
2294
2295
2296 void
init_charset_once(void)2297 init_charset_once (void)
2298 {
2299 int i, j, k;
2300
2301 for (i = 0; i < ISO_MAX_DIMENSION; i++)
2302 for (j = 0; j < ISO_MAX_CHARS; j++)
2303 for (k = 0; k < ISO_MAX_FINAL; k++)
2304 iso_charset_table[i][j][k] = -1;
2305
2306 PDUMPER_REMEMBER_SCALAR (iso_charset_table);
2307
2308 for (i = 0; i < 256; i++)
2309 emacs_mule_charset[i] = -1;
2310
2311 PDUMPER_REMEMBER_SCALAR (emacs_mule_charset);
2312
2313 charset_jisx0201_roman = -1;
2314 PDUMPER_REMEMBER_SCALAR (charset_jisx0201_roman);
2315
2316 charset_jisx0208_1978 = -1;
2317 PDUMPER_REMEMBER_SCALAR (charset_jisx0208_1978);
2318
2319 charset_jisx0208 = -1;
2320 PDUMPER_REMEMBER_SCALAR (charset_jisx0208);
2321
2322 charset_ksc5601 = -1;
2323 PDUMPER_REMEMBER_SCALAR (charset_ksc5601);
2324 }
2325
2326 /* Allocate an initial charset table that is large enough to handle
2327 Emacs while it is bootstrapping. As of September 2011, the size
2328 needs to be at least 166; make it a bit bigger to allow for future
2329 expansion.
2330
2331 Don't make the value so small that the table is reallocated during
2332 bootstrapping, as glibc malloc calls larger than just under 64 KiB
2333 during an initial bootstrap wreak havoc after dumping; see the
2334 M_MMAP_THRESHOLD value in alloc.c, plus there is an extra overhead
2335 internal to glibc malloc and perhaps to Emacs malloc debugging. */
2336 static struct charset charset_table_init[180];
2337
2338 void
syms_of_charset(void)2339 syms_of_charset (void)
2340 {
2341 DEFSYM (Qcharsetp, "charsetp");
2342
2343 /* Special charset symbols. */
2344 DEFSYM (Qascii, "ascii");
2345 DEFSYM (Qunicode, "unicode");
2346 DEFSYM (Qemacs, "emacs");
2347 DEFSYM (Qeight_bit, "eight-bit");
2348 DEFSYM (Qiso_8859_1, "iso-8859-1");
2349
2350 staticpro (&Vcharset_ordered_list);
2351 Vcharset_ordered_list = Qnil;
2352
2353 staticpro (&Viso_2022_charset_list);
2354 Viso_2022_charset_list = Qnil;
2355
2356 staticpro (&Vemacs_mule_charset_list);
2357 Vemacs_mule_charset_list = Qnil;
2358
2359 staticpro (&Vcharset_hash_table);
2360 Vcharset_hash_table = CALLN (Fmake_hash_table, QCtest, Qeq);
2361
2362 charset_table = charset_table_init;
2363 charset_table_size = ARRAYELTS (charset_table_init);
2364 PDUMPER_REMEMBER_SCALAR (charset_table_size);
2365 charset_table_used = 0;
2366 PDUMPER_REMEMBER_SCALAR (charset_table_used);
2367
2368 defsubr (&Scharsetp);
2369 defsubr (&Smap_charset_chars);
2370 defsubr (&Sdefine_charset_internal);
2371 defsubr (&Sdefine_charset_alias);
2372 defsubr (&Scharset_plist);
2373 defsubr (&Sset_charset_plist);
2374 defsubr (&Sunify_charset);
2375 defsubr (&Sget_unused_iso_final_char);
2376 defsubr (&Sdeclare_equiv_charset);
2377 defsubr (&Sfind_charset_region);
2378 defsubr (&Sfind_charset_string);
2379 defsubr (&Sdecode_char);
2380 defsubr (&Sencode_char);
2381 defsubr (&Ssplit_char);
2382 defsubr (&Smake_char);
2383 defsubr (&Schar_charset);
2384 defsubr (&Scharset_after);
2385 defsubr (&Siso_charset);
2386 defsubr (&Sclear_charset_maps);
2387 defsubr (&Scharset_priority_list);
2388 defsubr (&Sset_charset_priority);
2389 defsubr (&Scharset_id_internal);
2390 defsubr (&Ssort_charsets);
2391
2392 DEFVAR_LISP ("charset-map-path", Vcharset_map_path,
2393 doc: /* List of directories to search for charset map files. */);
2394 Vcharset_map_path = Qnil;
2395
2396 DEFVAR_BOOL ("inhibit-load-charset-map", inhibit_load_charset_map,
2397 doc: /* Inhibit loading of charset maps. Used when dumping Emacs. */);
2398 inhibit_load_charset_map = 0;
2399
2400 DEFVAR_LISP ("charset-list", Vcharset_list,
2401 doc: /* List of all charsets ever defined. */);
2402 Vcharset_list = Qnil;
2403
2404 DEFVAR_LISP ("current-iso639-language", Vcurrent_iso639_language,
2405 doc: /* ISO639 language mnemonic symbol for the current language environment.
2406 If the current language environment is for multiple languages (e.g. "Latin-1"),
2407 the value may be a list of mnemonics. */);
2408 Vcurrent_iso639_language = Qnil;
2409
2410 charset_ascii
2411 = define_charset_internal (Qascii, 1, "\x00\x7F\0\0\0\0\0",
2412 0, 127, 'B', -1, 0, 1, 0, 0);
2413 PDUMPER_REMEMBER_SCALAR (charset_ascii);
2414
2415 charset_iso_8859_1
2416 = define_charset_internal (Qiso_8859_1, 1, "\x00\xFF\0\0\0\0\0",
2417 0, 255, -1, -1, -1, 1, 0, 0);
2418 PDUMPER_REMEMBER_SCALAR (charset_iso_8859_1);
2419
2420 charset_unicode
2421 = define_charset_internal (Qunicode, 3, "\x00\xFF\x00\xFF\x00\x10\0",
2422 0, MAX_UNICODE_CHAR, -1, 0, -1, 1, 0, 0);
2423 PDUMPER_REMEMBER_SCALAR (charset_unicode);
2424
2425 charset_emacs
2426 = define_charset_internal (Qemacs, 3, "\x00\xFF\x00\xFF\x00\x3F\0",
2427 0, MAX_5_BYTE_CHAR, -1, 0, -1, 1, 1, 0);
2428 PDUMPER_REMEMBER_SCALAR (charset_emacs);
2429
2430 charset_eight_bit
2431 = define_charset_internal (Qeight_bit, 1, "\x80\xFF\0\0\0\0\0",
2432 128, 255, -1, 0, -1, 0, 1,
2433 MAX_5_BYTE_CHAR + 1);
2434 PDUMPER_REMEMBER_SCALAR (charset_eight_bit);
2435
2436 charset_unibyte = charset_iso_8859_1;
2437 PDUMPER_REMEMBER_SCALAR (charset_unibyte);
2438 }
2439