1 /* charset.c -- charset module.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 National Institute of Advanced Industrial Science and Technology (AIST)
4 Registration Number H15PRO112
5
6 This file is part of the m17n library.
7
8 The m17n library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public License
10 as published by the Free Software Foundation; either version 2.1 of
11 the License, or (at your option) any later version.
12
13 The m17n library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the m17n library; if not, write to the Free
20 Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 Boston, MA 02110-1301 USA. */
22 /***en
23 @addtogroup m17nCharset
24 @brief Charset objects and API for them.
25
    The m17n library uses @e charset objects to represent coded
    character sets (CCSs).  The m17n library supports many predefined
28 coded character sets. Moreover, application programs can add
29 other charsets. A character can belong to multiple charsets.
30
31 The m17n library distinguishes the following three concepts:
32
33 @li A @e code-point is a number assigned by the CCS to each
34 character. Code-points may or may not be continuous. The type
35 @c unsigned is used to represent a code-point. An invalid
36 code-point is represented by the macro @c MCHAR_INVALID_CODE.
37
38 @li A @e character @e index is the canonical index of a character
39 in a CCS. The character that has the character index N occupies
40 the Nth position when all the characters in the current CCS are
41 sorted by their code-points. Character indices in a CCS are
42 continuous and start with 0.
43
44 @li A @e character @e code is the internal representation in the
45 m17n library of a character. A character code is a signed integer
46 of 21 bits or longer.
47
    Each charset object defines how characters are converted between
    code-points and character codes.  To @e decode means converting
    code-points to character codes and to @e encode means converting
    character codes to code-points.  */
52
53 /***ja
54 @addtogroup m17nCharset
55 @brief ʸ�����åȥ��֥������ȤȤ���˴ؤ��� API.
56
57 m17n �饤�֥��ϡ���沽ʸ������ (CCS) �� @e ʸ�����å�
58 �ȸƤ֥��֥������Ȥ�ɽ�����롣
59 m17n �饤�֥���¿������沽ʸ������餫����ݡ��Ȥ��Ƥ��뤷�����ץꥱ�������ץ���ब�ȼ���ʸ�����åȤ��ɲä��뤳�Ȥ��ǽ�Ǥ��롣
60 ��Ĥ�ʸ����ʣ����ʸ�����åȤ�°���Ƥ�褤��
61
62 m17n �饤�֥��ϡ��ʲ��γ�ǰ����̤��Ƥ���:
63
64 @li @e �����ɥݥ���� �Ȥϡ�CCS ��������θġ���ʸ�����Ф������������ͤǤ��롣
65 �����ɥݥ���Ȥ�Ϣ³���Ƥ���Ȥϸ¤�ʤ��������ɥݥ���Ȥ�
66 @c unsigned ���ˤ�ä�ɽ����롣̵���ʥ����ɥݥ���Ȥϥޥ���
67 @c MCHAR_INVALID_CODE ��ɽ����롣
68
69 @li @e ʸ������ǥå��� �Ȥϡ�CCS ��dz�ʸ���˳�����Ƥ������������줿����ǥå����Ǥ��롣
70 ʸ������ǥå����� N ��ʸ���ϡ�CCS �����ʸ�����ɥݥ���Ƚ���¤٤��Ȥ��� N ���ܤ˸����롣
71 CCS ���ʸ������ǥå�����Ϣ³���Ƥ��ꡢ0 ����Ϥޤ롣
72
73 @li @e ʸ�������� �Ȥϡ�m17n �饤�֥����ˤ�����ʸ��������ɽ���Ǥ��ꡢ21 �ӥåȰʾ��Ĺ�����������դ������Ǥ��롣
74
75 ��ʸ�����åȥ��֥������Ȥϡ�����ʸ�����åȤ�°����ʸ���Υ����ɥݥ���Ȥ�ʸ�������ɤȤδ֤��Ѵ����ꤹ�롣
76 �����ɥݥ���Ȥ���ʸ�������ɤؤ��Ѵ��� @e �ǥ�����
77 �ȸƤӡ�ʸ�������ɤ��饳���ɥݥ���Ȥؤ��Ѵ��� @e ������ �ȸƤ֡� */
78
79
80 /*=*/
81 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
82 /*** @addtogroup m17nInternal
83 @{ */
84
85 #include <config.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <string.h>
89 #include <limits.h>
90
91 #include "m17n.h"
92 #include "m17n-misc.h"
93 #include "internal.h"
94 #include "symbol.h"
95 #include "database.h"
96 #include "chartab.h"
97 #include "plist.h"
98 #include "charset.h"
99 #include "coding.h"
100
/* Boundary of the character-code area handed out to charsets that use
   the Munify method.  mcharset__init () sets it to MCHAR_MAX and
   make_charset () lowers it every time such a charset is made; each
   charset records the value it obtained in its unified_max member.  */
static int unified_max;

/** List of all charsets ever defined.  make_charset () appends to it
    and mcharset__fini () walks it to free every charset.  */

struct MCharsetList
{
  /* USED is the number of valid entries in CHARSETS (the bound that
     mcharset__fini () iterates to).  SIZE and INC are maintained by
     the MLIST_* macros -- presumably the allocated capacity and the
     growth step; confirm against their definition.  */
  int size, inc, used;
  /* Array of pointers to the charset objects.  */
  MCharset **charsets;
};

/* The one instance of the list above.  */
static struct MCharsetList charset_list;

/* Property list keyed by charset name whose values are the charset
   definitions read by mcharset__load_from_database ().
   mcharset__find () consults it to define a charset on demand.  */
static MPlist *charset_definition_list;
114
115 /** Make a charset object from the template of MCharset structure
116 CHARSET, and return a pointer to the new charset object.
117 CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are
118 not yet set. */
119
120 static MCharset *
make_charset(MCharset * charset)121 make_charset (MCharset *charset)
122 {
123 unsigned min_code, max_code;
124 int i, n;
125 int *range = charset->code_range;
126
127 if (charset->dimension < 1 || charset->dimension > 4)
128 MERROR (MERROR_CHARSET, NULL);
129 if ((charset->final_byte > 0 && charset->final_byte < '0')
130 || charset->final_byte > 127)
131 MERROR (MERROR_CHARSET, NULL);
132
133 for (i = 0, n = 1; i < 4; i++)
134 {
135 if (range[i * 4] > range[i * 4 + 1])
136 MERROR (MERROR_CHARSET, NULL);
137 range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
138 n *= range[i * 4 + 2];
139 range[i * 4 + 3] = n;
140 }
141
142 min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
143 if (charset->min_code == 0)
144 charset->min_code = min_code;
145 else if (charset->min_code < min_code)
146 MERROR (MERROR_CHARSET, NULL);
147 max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
148 if (charset->max_code == 0)
149 charset->max_code = max_code;
150 else if (charset->max_code > max_code)
151 MERROR (MERROR_CHARSET, NULL);
152
153 charset->code_range_min_code = min_code;
154 charset->fully_loaded = 0;
155 charset->simple = 0;
156
157 if (charset->method == Msubset)
158 {
159 MCharset *parent;
160
161 if (charset->nparents != 1)
162 MERROR (MERROR_CHARSET, NULL);
163 parent = charset->parents[0];
164 if (parent->method == Msuperset
165 || charset->min_code - charset->subset_offset < parent->min_code
166 || charset->max_code - charset->subset_offset > parent->max_code)
167 MERROR (MERROR_CHARSET, NULL);
168 }
169 else if (charset->method == Msuperset)
170 {
171 if (charset->nparents < 2)
172 MERROR (MERROR_CHARSET, NULL);
173 for (i = 0; i < charset->nparents; i++)
174 if (charset->min_code > charset->parents[i]->min_code
175 || charset->max_code < charset->parents[i]->max_code)
176 MERROR (MERROR_CHARSET, NULL);
177 }
178 else
179 {
180 charset->no_code_gap
181 = (charset->dimension == 1
182 || (range[2] == 256
183 && (charset->dimension == 2
184 || (range[6] == 256
185 && (charset->dimension == 3
186 || range[10] == 256)))));
187
188 if (! charset->no_code_gap)
189 {
190 int j;
191
192 memset (charset->code_range_mask, 0,
193 sizeof charset->code_range_mask);
194 for (i = 0; i < 4; i++)
195 for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
196 charset->code_range_mask[j] |= (1 << i);
197 }
198
199 if (charset->method == Moffset)
200 {
201 charset->max_char = charset->min_char + range[15] - 1;
202 if (charset->min_char < 0
203 || charset->max_char < 0 || charset->max_char > unified_max)
204 MERROR (MERROR_CHARSET, NULL);
205 charset->simple = charset->no_code_gap;
206 charset->fully_loaded = 1;
207 }
208 else if (charset->method == Munify)
209 {
210 /* The magic number 12 below is to align to the SUB_BITS_2
211 (defined in chartab.c) boundary in a char-table. */
212 unified_max -= ((range[15] >> 12) + 1) << 12;
213 charset->unified_max = unified_max;
214 }
215 else if (charset->method != Mmap)
216 MERROR (MERROR_CHARSET, NULL);
217 }
218
219 MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
220
221 if (charset->final_byte > 0)
222 {
223 MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
224 MERROR_CHARSET);
225 if (charset->revision <= 0)
226 {
227 int chars = range[2];
228
229 if (chars == 128) /* ASCII case */
230 chars = 94;
231 else if (chars == 256) /* ISO-8859-X case */
232 chars = 96;
233 MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
234 = charset;
235 }
236 }
237
238 return charset;
239 }
240
241 static int
load_charset_fully(MCharset * charset)242 load_charset_fully (MCharset *charset)
243 {
244 if (charset->method == Msubset)
245 {
246 MCharset *parent = charset->parents[0];
247
248 if (! parent->fully_loaded
249 && load_charset_fully (parent) < 0)
250 MERROR (MERROR_CHARSET, -1);
251 if (parent->method == Moffset)
252 {
253 unsigned code;
254
255 code = charset->min_code - charset->subset_offset;
256 charset->min_char = DECODE_CHAR (parent, code);
257 code = charset->max_code - charset->subset_offset;
258 charset->max_char = DECODE_CHAR (parent, code);
259 }
260 else
261 {
262 unsigned min_code = charset->min_code - charset->subset_offset;
263 unsigned max_code = charset->max_code - charset->subset_offset;
264 int min_char = DECODE_CHAR (parent, min_code);
265 int max_char = min_char;
266
267 for (++min_code; min_code <= max_code; min_code++)
268 {
269 int c = DECODE_CHAR (parent, min_code);
270
271 if (c >= 0)
272 {
273 if (c < min_char)
274 min_char = c;
275 else if (c > max_char)
276 max_char = c;
277 }
278 }
279 charset->min_char = min_char;
280 charset->max_char = max_char;
281 }
282 }
283 else if (charset->method == Msuperset)
284 {
285 int min_char = 0, max_char = 0;
286 int i;
287
288 for (i = 0; i < charset->nparents; i++)
289 {
290 MCharset *parent = charset->parents[i];
291
292 if (! parent->fully_loaded
293 && load_charset_fully (parent) < 0)
294 MERROR (MERROR_CHARSET, -1);
295 if (i == 0)
296 min_char = parent->min_char, max_char = parent->max_char;
297 else if (parent->min_char < min_char)
298 min_char = parent->min_char;
299 else if (parent->max_char > max_char)
300 max_char = parent->max_char;
301 }
302 charset->min_char = min_char;
303 charset->max_char = max_char;
304 }
305 else /* charset->method is Mmap or Munify */
306 {
307 MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
308 MPlist *plist;
309
310 if (! mdb || ! (plist = mdatabase_load (mdb)))
311 MERROR (MERROR_CHARSET, -1);
312 charset->decoder = mplist_value (plist);
313 charset->encoder = mplist_value (mplist_next (plist));
314 M17N_OBJECT_UNREF (plist);
315 mchartable_range (charset->encoder,
316 &charset->min_char, &charset->max_char);
317 if (charset->method == Mmap)
318 charset->simple = charset->no_code_gap;
319 else
320 charset->max_char = charset->unified_max + 1 + charset->code_range[15];
321 }
322
323 charset->fully_loaded = 1;
324 return 0;
325 }
326
327 /** Load a data of type @c charset from the file FD. */
328
329 static void *
load_charset(FILE * fp,MSymbol charset_name)330 load_charset (FILE *fp, MSymbol charset_name)
331 {
332 MCharset *charset = MCHARSET (charset_name);
333 int *decoder;
334 MCharTable *encoder;
335 int size;
336 int i, c;
337 int found = 0;
338 MPlist *plist;
339
340 if (! charset)
341 MERROR (MERROR_DB, NULL);
342 size = (charset->code_range[15]
343 - (charset->min_code - charset->code_range_min_code));
344 MTABLE_MALLOC (decoder, size, MERROR_DB);
345 for (i = 0; i < size; i++)
346 decoder[i] = -1;
347 encoder = mchartable (Minteger, (void *) MCHAR_INVALID_CODE);
348
349 while ((c = getc (fp)) != EOF)
350 {
351 unsigned code1, code2, c1, c2;
352 int idx1, idx2;
353 char buf[256];
354
355 ungetc (c, fp);
356 if (! fgets (buf, 256, fp))
357 break;
358 if (c != '#')
359 {
360 if (sscanf (buf, "0x%x-0x%x 0x%x", &code1, &code2, &c1) == 3)
361 {
362 idx1 = CODE_POINT_TO_INDEX (charset, code1);
363 if (idx1 >= size)
364 continue;
365 idx2 = CODE_POINT_TO_INDEX (charset, code2);
366 if (idx2 >= size)
367 idx2 = size - 1;
368 c2 = c1 + (idx2 - idx1);
369 }
370 else if (sscanf (buf, "0x%x 0x%x", &code1, &c1) == 2)
371 {
372 idx1 = idx2 = CODE_POINT_TO_INDEX (charset, code1);
373 if (idx1 >= size)
374 continue;
375 c2 = c1;
376 }
377 else
378 continue;
379 if (idx1 >= 0 && idx2 >= 0)
380 {
381 decoder[idx1] = c1;
382 mchartable_set (encoder, c1, (void *) code1);
383 for (idx1++, c1++; idx1 <= idx2; idx1++, c1++)
384 {
385 code1 = INDEX_TO_CODE_POINT (charset, idx1);
386 decoder[idx1] = c1;
387 mchartable_set (encoder, c1, (void *) code1);
388 }
389 found++;
390 }
391 }
392 }
393
394 if (! found)
395 {
396 free (decoder);
397 M17N_OBJECT_UNREF (encoder);
398 return NULL;
399 }
400 plist = mplist ();
401 mplist_add (plist, Mt, decoder);
402 mplist_add (plist, Mt, encoder);
403 return plist;
404 }
405
406
407 /* Internal API */
408
/* One-element cache of the last charset lookup: mcharset__find ()
   stores the looked-up name in the key and the charset object in the
   value of this plist.  mcharset__init () seeds it with the key Mt and
   a NULL value.  */
MPlist *mcharset__cache;

/* Predefined charsets.  mcharset__init () sets each pointer to the
   charset object of the corresponding predefined charset symbol.  */
MCharset *mcharset__ascii;
MCharset *mcharset__binary;
MCharset *mcharset__m17n;
MCharset *mcharset__unicode;

/* Table of the charsets that have a final byte.  make_charset ()
   appends such charsets to its list and, for those without a positive
   revision number, also stores them through MCHARSET_ISO_2022 ().  */
MCharsetISO2022Table mcharset__iso_2022_table;
418
/** Initialize charset handler.

    Resets the boundary of the unify area, installs load_charset () as
    the database loader for charset data, creates the lookup cache and
    the charset lists, interns every symbol used as a method name or
    as a charset parameter key, and defines the predefined charsets
    "ascii", "iso-8859-1", "unicode", "m17n" and "binary".

    Always returns 0; the results of the individual
    mchar_define_charset () calls are not checked here.  */

int
mcharset__init ()
{
  MPlist *param, *pl;

  unified_max = MCHAR_MAX;

  mdatabase__load_charset_func = load_charset;
  mcharset__cache = mplist ();
  /* Seed the cache with the key Mt and no charset.  */
  mplist_set (mcharset__cache, Mt, NULL);

  MLIST_INIT1 (&charset_list, charsets, 128);
  MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
  charset_definition_list = mplist ();

  memset (mcharset__iso_2022_table.classified, 0,
          sizeof (mcharset__iso_2022_table.classified));

  /* Symbols naming the charset methods.  */
  Mmethod = msymbol ("method");
  Moffset = msymbol ("offset");
  Mmap = msymbol ("map");
  Munify = msymbol ("unify");
  Msubset = msymbol ("subset");
  Msuperset = msymbol ("superset");

  /* Symbols used as parameter keys of a charset definition.  "mapfile",
     "parents" and "aliases" are created as managing keys.  */
  Mdimension = msymbol ("dimension");
  Mmin_range = msymbol ("min-range");
  Mmax_range = msymbol ("max-range");
  Mmin_code = msymbol ("min-code");
  Mmax_code = msymbol ("max-code");
  Mascii_compatible = msymbol ("ascii-compatible");
  Mfinal_byte = msymbol ("final-byte");
  Mrevision = msymbol ("revision");
  Mmin_char = msymbol ("min-char");
  Mmapfile = msymbol_as_managing_key ("mapfile");
  Mparents = msymbol_as_managing_key ("parents");
  Msubset_offset = msymbol ("subset-offset");
  Mdefine_coding = msymbol ("define-coding");
  Maliases = msymbol_as_managing_key ("aliases");

  param = mplist ();
  pl = param;
  /* Setup predefined charsets.  The first one is "ascii": offset
     method, code range 0..0x7F, ASCII compatible, final byte 'B',
     characters starting at 0.  */
  pl = mplist_add (pl, Mmethod, Moffset);
  pl = mplist_add (pl, Mmin_range, (void *) 0);
  pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
  pl = mplist_add (pl, Mascii_compatible, Mt);
  pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
  pl = mplist_add (pl, Mmin_char, (void *) 0);
  Mcharset_ascii = mchar_define_charset ("ascii", param);

  /* The same parameter list is reused for the remaining charsets; only
     the maximum range and, once, the final byte are changed with
     mplist_put () (presumably replacing the value stored under the
     existing key -- confirm against the plist API).  The final byte
     stays cleared from here on.  */
  mplist_put (param, Mmax_range, (void *) 0xFF);
  mplist_put (param, Mfinal_byte, NULL);
  Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);

  mplist_put (param, Mmax_range, (void *) 0x10FFFF);
  Mcharset_unicode = mchar_define_charset ("unicode", param);

  mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
  Mcharset_m17n = mchar_define_charset ("m17n", param);

  mplist_put (param, Mmax_range, (void *) 0xFF);
  Mcharset_binary = mchar_define_charset ("binary", param);

  M17N_OBJECT_UNREF (param);

  /* Keep direct pointers to the charset objects of the frequently
     used predefined charsets.  */
  mcharset__ascii = MCHARSET (Mcharset_ascii);
  mcharset__binary = MCHARSET (Mcharset_binary);
  mcharset__m17n = MCHARSET (Mcharset_m17n);
  mcharset__unicode = MCHARSET (Mcharset_unicode);

  return 0;
}
494
495 void
mcharset__fini(void)496 mcharset__fini (void)
497 {
498 int i;
499 MPlist *plist;
500
501 for (i = 0; i < charset_list.used; i++)
502 {
503 MCharset *charset = charset_list.charsets[i];
504
505 if (charset->decoder)
506 free (charset->decoder);
507 if (charset->encoder)
508 M17N_OBJECT_UNREF (charset->encoder);
509 free (charset);
510 }
511 M17N_OBJECT_UNREF (mcharset__cache);
512 MLIST_FREE1 (&charset_list, charsets);
513 MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
514 MPLIST_DO (plist, charset_definition_list)
515 M17N_OBJECT_UNREF (MPLIST_VAL (plist));
516 M17N_OBJECT_UNREF (charset_definition_list);
517 }
518
519
520 MCharset *
mcharset__find(MSymbol name)521 mcharset__find (MSymbol name)
522 {
523 MCharset *charset;
524
525 charset = msymbol_get (name, Mcharset);
526 if (! charset)
527 {
528 MPlist *param = mplist_get (charset_definition_list, name);
529
530 MPLIST_KEY (mcharset__cache) = Mt;
531 if (! param)
532 return NULL;
533 param = mplist__from_plist (param);
534 mchar_define_charset (MSYMBOL_NAME (name), param);
535 charset = msymbol_get (name, Mcharset);
536 M17N_OBJECT_UNREF (param);
537 }
538 MPLIST_KEY (mcharset__cache) = name;
539 MPLIST_VAL (mcharset__cache) = charset;
540 return charset;
541 }
542
543
544 /** Return the character corresponding to code-point CODE in CHARSET.
545 If CODE is invalid for CHARSET, return -1. */
546
547 int
mcharset__decode_char(MCharset * charset,unsigned code)548 mcharset__decode_char (MCharset *charset, unsigned code)
549 {
550 int idx;
551
552 if (code < 128 && charset->ascii_compatible)
553 return (int) code;
554 if (code < charset->min_code || code > charset->max_code)
555 return -1;
556
557 if (! charset->fully_loaded
558 && load_charset_fully (charset) < 0)
559 MERROR (MERROR_CHARSET, -1);
560
561 if (charset->method == Msubset)
562 {
563 MCharset *parent = charset->parents[0];
564
565 code -= charset->subset_offset;
566 return DECODE_CHAR (parent, code);
567 }
568
569 if (charset->method == Msuperset)
570 {
571 int i;
572
573 for (i = 0; i < charset->nparents; i++)
574 {
575 MCharset *parent = charset->parents[i];
576 int c = DECODE_CHAR (parent, code);
577
578 if (c >= 0)
579 return c;
580 }
581 return -1;
582 }
583
584 idx = CODE_POINT_TO_INDEX (charset, code);
585 if (idx < 0)
586 return -1;
587
588 if (charset->method == Mmap)
589 return charset->decoder[idx];
590
591 if (charset->method == Munify)
592 {
593 int c = charset->decoder[idx];
594
595 if (c < 0)
596 c = charset->unified_max + 1 + idx;
597 return c;
598 }
599
600 /* Now charset->method should be Moffset. */
601 return (charset->min_char + idx);
602 }
603
604
605 /** Return the code point of character C in CHARSET. If CHARSET does not
606 contain C, return MCHAR_INVALID_CODE. */
607
608 unsigned
mcharset__encode_char(MCharset * charset,int c)609 mcharset__encode_char (MCharset *charset, int c)
610 {
611 if (! charset->fully_loaded
612 && load_charset_fully (charset) < 0)
613 MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
614
615 if (charset->method == Msubset)
616 {
617 MCharset *parent = charset->parents[0];
618 unsigned code = ENCODE_CHAR (parent, c);
619
620 if (code == MCHAR_INVALID_CODE)
621 return code;
622 code += charset->subset_offset;
623 if (code >= charset->min_code && code <= charset->max_code)
624 return code;
625 return MCHAR_INVALID_CODE;
626 }
627
628 if (charset->method == Msuperset)
629 {
630 int i;
631
632 for (i = 0; i < charset->nparents; i++)
633 {
634 MCharset *parent = charset->parents[i];
635 unsigned code = ENCODE_CHAR (parent, c);
636
637 if (code != MCHAR_INVALID_CODE)
638 return code;
639 }
640 return MCHAR_INVALID_CODE;
641 }
642
643 if (c < charset->min_char || c > charset->max_char)
644 return MCHAR_INVALID_CODE;
645
646 if (charset->method == Mmap)
647 return (unsigned) mchartable_lookup (charset->encoder, c);
648
649 if (charset->method == Munify)
650 {
651 if (c > charset->unified_max)
652 {
653 c -= charset->unified_max - 1;
654 return INDEX_TO_CODE_POINT (charset, c);
655 }
656 return (unsigned) mchartable_lookup (charset->encoder, c);
657 }
658
659 /* Now charset->method should be Moffset */
660 c -= charset->min_char;
661 return INDEX_TO_CODE_POINT (charset, c);
662 }
663
664 int
mcharset__load_from_database()665 mcharset__load_from_database ()
666 {
667 MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
668 MPlist *def_list, *plist;
669 MPlist *definitions = charset_definition_list;
670 int mdebug_flag = MDEBUG_CHARSET;
671
672 if (! mdb)
673 return 0;
674 MDEBUG_PUSH_TIME ();
675 def_list = (MPlist *) mdatabase_load (mdb);
676 MDEBUG_PRINT_TIME ("CHARSET", (mdebug__output, " to load data."));
677 MDEBUG_POP_TIME ();
678 if (! def_list)
679 return -1;
680
681 MDEBUG_PUSH_TIME ();
682 MPLIST_DO (plist, def_list)
683 {
684 MPlist *pl, *p;
685 MSymbol name;
686
687 if (! MPLIST_PLIST_P (plist))
688 MERROR (MERROR_CHARSET, -1);
689 pl = MPLIST_PLIST (plist);
690 if (! MPLIST_SYMBOL_P (pl))
691 MERROR (MERROR_CHARSET, -1);
692 name = MPLIST_SYMBOL (pl);
693 pl = MPLIST_NEXT (pl);
694 definitions = mplist_add (definitions, name, pl);
695 M17N_OBJECT_REF (pl);
696 p = mplist__from_plist (pl);
697 mchar_define_charset (MSYMBOL_NAME (name), p);
698 M17N_OBJECT_UNREF (p);
699 }
700
701 M17N_OBJECT_UNREF (def_list);
702 MDEBUG_PRINT_TIME ("CHARSET", (mdebug__output, " to parse the loaded data."));
703 MDEBUG_POP_TIME ();
704 return 0;
705 }
706
707 /*** @} */
708 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
709
710
711 /* External API */
712
713 /*** @addtogroup m17nCharset */
714 /*** @{ */
715 /*=*/
716
717 #ifdef FOR_DOXYGEN
718 /***en
719 @brief Invalid code-point.
720
721 The macro #MCHAR_INVALID_CODE gives the invalid code-point. */
722
723 /***ja
724 @brief ̵���ʥ����ɥݥ����.
725
726 �ޥ��� #MCHAR_INVALID_CODE ��̵���ʥ����ɥݥ���Ȥ��� */
727
728 #define MCHAR_INVALID_CODE
729 #endif
730
731 /*=*/
732
733 /***en
734 @name Variables: Symbols representing a charset.
735
736 Each of the following symbols represents a predefined charset. */
737
738 /***ja
739 @name �ѿ�: ʸ�����åȤ�ɽ����������Ѥߥ���ܥ�.
740
741 �ʲ��γƥ���ܥ�ϡ�����Ѥ�ʸ�����åȤ�ɽ�����롣 */
742 /*=*/
743 /*** @{ */
744 /*=*/
745 /***en
746 @brief Symbol representing the charset ASCII.
747
748 The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
749 the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6). */
750 /***ja
751 @brief ASCII ʸ�����åȤ�ɽ�����륷��ܥ�.
752
753 ����ܥ� #Mcharset_ascii �� <tt>"ascii"</tt> �Ȥ���̾���������
754 ISO 646, USA Version X3.4-1968 (ISO-IR-6) ʸ�����åȤ�ɽ�����롣
755 */
756
757 MSymbol Mcharset_ascii;
758
759 /*=*/
760 /***en
761 @brief Symbol representing the charset ISO/IEC 8859/1.
762
763 The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
764 and represents the charset ISO/IEC 8859-1:1998. */
765 /***ja
766 @brief ISO/IEC 8859-1:1998 ʸ�����åȤ�ɽ�����륷��ܥ�.
767
768 ����ܥ� #Mcharset_iso_8859_1 �� <tt>"iso-8859-1"</tt>
769 �Ȥ���̾���������ISO/IEC 8859-1:1998 ʸ�����åȤ�ɽ�����롣
770 */
771
772 MSymbol Mcharset_iso_8859_1;
773
774 /***en
775 @brief Symbol representing the charset Unicode.
776
777 The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
778 represents the charset Unicode. */
779 /***ja
780 @brief Unicode ʸ�����åȤ�ɽ�����륷��ܥ�.
781
782 ����ܥ� #Mcharset_unicode �� <tt>"unicode"</tt>
783 �Ȥ���̾���������Unicode ʸ�����åȤ�ɽ�����롣 */
784
785 MSymbol Mcharset_unicode;
786
787 /*=*/
788 /***en
789 @brief Symbol representing the largest charset.
790
791 The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
792 represents the charset that contains all characters supported by
793 the m17n library. */
794 /***ja
795 @brief ��ʸ����ޤ�ʸ�����åȤ�ɽ�����륷��ܥ�.
796
797 ����ܥ� #Mcharset_m17n �� <tt>"m17n"</tt> �Ȥ���̾���������
798 m17n �饤�֥�꤬�������Ƥ�ʸ����ޤ�ʸ�����åȤ�ɽ�����롣 */
799
800 MSymbol Mcharset_m17n;
801
802 /*=*/
803 /***en
804 @brief Symbol representing the charset for ill-decoded characters.
805
806 The symbol #Mcharset_binary has name <tt>"binary"</tt> and
807 represents the fake charset which the decoding functions put to an
808 M-text as a text property when they encounter an invalid byte
809 (sequence).
810
811 See @ref m17nConv for more details. */
812
813 /***ja
814 @brief �������ǥ����ɤǤ��ʤ�ʸ����ʸ�����åȤ�ɽ�����륷��ܥ�.
815
816 ����ܥ� #Mcharset_binary �� <tt>"binary"</tt>
817 �Ȥ���̾������������� (fake) ʸ�����åȤ�ɽ�����롣
818 �ǥ����ɴؿ��ϡ�M-text �Υƥ����ȥץ�ѥƥ��Ȥ��ơ�̵���ʥХ��ȡʥ��������ˤ������������֤��ղä��롣
819
820 �ܺ٤� @ref m17nConv ���ȤΤ��ȡ� */
821
822 MSymbol Mcharset_binary;
823
824 /*=*/
825 /*** @} */
826 /*=*/
827
828 /***en
829 @name Variables: Parameter keys for mchar_define_charset ().
830
831 These are the predefined symbols to use as parameter keys for the
832 function mchar_define_charset () (which see). */
833
834 /***ja
835 @name �ѿ�: mchar_define_charset �ѤΥѥ���������
836
837 �����ϡ��ؿ� mchar_define_charset () �ѤΥѥ����������Ȥ��ƻȤ��륷��ܥ�Ǥ��롣
838 �ܤ����Ϥ��δؿ��β���ȤΤ��ȡ�*/
839 /*** @{ */
840 /*=*/
841
/* Each symbol below is created in mcharset__init () with the name
   shown next to it.  */
MSymbol Mmethod;                /* "method" */
MSymbol Mdimension;             /* "dimension" */
MSymbol Mmin_range;             /* "min-range" */
MSymbol Mmax_range;             /* "max-range" */
MSymbol Mmin_code;              /* "min-code" */
MSymbol Mmax_code;              /* "max-code" */
MSymbol Mascii_compatible;      /* "ascii-compatible" */
MSymbol Mfinal_byte;            /* "final-byte" */
MSymbol Mrevision;              /* "revision" */
MSymbol Mmin_char;              /* "min-char" */
MSymbol Mmapfile;               /* "mapfile" (managing key) */
MSymbol Mparents;               /* "parents" (managing key) */
MSymbol Msubset_offset;         /* "subset-offset" */
MSymbol Mdefine_coding;         /* "define-coding" */
MSymbol Maliases;               /* "aliases" (managing key) */
857 /*=*/
858 /*** @} */
859 /*=*/
860
861 /***en
862 @name Variables: Symbols representing charset methods.
863
864 These are the predefined symbols that can be a value of the
865 @b Mmethod parameter of a charset used in an argument to the
866 mchar_define_charset () function.
867
868 A method specifies how code-points and character codes are
869 converted. See the documentation of the mchar_define_charset ()
870 function for the details. */
871
872 /***ja
873 @name �ѿ�: ʸ�����åȤΥ�åɻ���˻Ȥ��륷��ܥ�
874
875 �����ϡ�ʸ�����åȤ� @e ��å� ����ꤹ�뤿�������Ѥߥ���ܥ�Ǥ��ꡢʸ�����åȤ�
876 @b Mmethod �ѥ������ͤȤʤ뤳�Ȥ��Ǥ��롣
877 �����ͤϴؿ� mchar_define_charset () �ΰ����Ȥ��ƻȤ��롣
878
879 ��åɤȤϡ������ɥݥ���Ȥ�ʸ�������ɤ�����Ѵ�����ݤ������Τ��ȤǤ��롣
880 �ܤ����ϴؿ� mchar_define_charset () �β���ȤΤ��ȡ� */
881 /*** @{ */
882 /*=*/
883 /***en
884 @brief Symbol for the offset type method of charset.
885
886 The symbol #Moffset has the name <tt>"offset"</tt> and, when used
887 as a value of @b Mmethod parameter of a charset, it means that the
888 conversion of code-points and character codes of the charset is
889 done by this calculation:
890
891 @verbatim
892 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
893 @endverbatim
894
895 where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
896 and MIN-CHAR is a value of @b Mmin_char parameter. */
897
898 /***ja
899 @brief ���ե��åȷ��Υ�åɤ�����ܥ�.
900
901 ����ܥ� #Moffset �� <tt>"offset"</tt> �Ȥ���̾���������ʸ�����åȤ�
902 @b Mmethod �ѥ������ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ����ʲ��μ��˽��äƹԤ��뤳�Ȥ��̣���롣
903
904 @verbatim
905 ʸ�������� = �����ɥݥ���� - MIN-CODE + MIN-CHAR
906 @endverbatim
907
908 �����ǡ�MIN-CODE ��ʸ�����åȤ� @b Mmin_code �ѥ������ͤǤ��ꡢMIN-CHAR ��
909 @b Mmin_char �ѥ������ͤǤ��롣 */
910
911 MSymbol Moffset;
912 /*=*/
913
914 /***en @brief Symbol for the map type method of charset.
915
916 The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
917 value of @b Mmethod parameter of a charset, it means that the
918 conversion of code-points and character codes of the charset is
919 done by map looking up. The map must be given by @b Mmapfile
920 parameter. */
921
922 /***ja @brief �ޥå��Υ�åɤ�����ܥ�.
923
924 ����ܥ� #Mmap �� <tt>"map"</tt> �Ȥ���̾���������ʸ�����åȤ�
925 @b Mmethod �ѥ������ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ����ޥåפȤ��뤳�Ȥˤ�äƹԤ��뤳�Ȥ��̣���롣
926 �ޥåפ� @b Mmapfile �ѥ����Ȥ���Ϳ���ʤ���Фʤ�ʤ��� */
927
928 MSymbol Mmap;
929 /*=*/
930
931 /***en @brief Symbol for the unify type method of charset.
932
933 The symbol #Munify has the name <tt>"unify"</tt> and, when used as
934 a value of @b Mmethod parameter of a charset, it means that the
935 conversion of code-points and character codes of the charset is
936 done by map looking up and offsetting. The map must be given by
937 @b Mmapfile parameter. For this kind of charset, a unique
938 continuous character code space for all characters is assigned.
939
940 If the map has an entry for a code-point, the conversion is done
941 by looking up the map. Otherwise, the conversion is done by this
942 calculation:
943
944 @verbatim
945 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
946 @endverbatim
947
948 where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
949 and LOWEST-CHAR-CODE is the lowest character code of the assigned
950 code space. */
951
952 /***ja @brief ��˥ե������Υ�åɤ�����ܥ�.
953
954 ����ܥ� #Munify �� <tt>"unify"</tt> �Ȥ���̾���������ʸ�����åȤ�
955 @b Mmethod �ѥ������ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ������ޥåפλ��Ȥȥ��ե��åȤ��Ȥ߹�碌�ˤ�äƹԤ��뤳�Ȥ��̣���롣
956 �ޥåפ� @b Mmapfile �ѥ����Ȥ���Ϳ���ʤ���Фʤ�ʤ���
957 ���μ�γ�ʸ�����åȤˤϡ���ʸ�����Ф���Ϣ³���륳���ɥ��ڡ��������줾�������Ƥ��롣
958
959 �����ɥݥ���Ȥ��ޥåפ˴ޤޤ�Ƥ���С��Ѵ��ϥޥå��Ȥˤ�äƹԤ��롣
960 �����Ǥʤ���С��ʲ��μ��˽�����
961
962 @verbatim
963 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
964 @endverbatim
965
966 �����ǡ�MIN-CODE ��ʸ�����åȤ� @b Mmin_code �ѥ������ͤǤ��ꡢ
967 LOWEST-CHAR-CODE �ϳ�����Ƥ�줿�����ɥ��ڡ����κǤ⾮����ʸ�������ɤǤ��롣
968 */
969
970 MSymbol Munify;
971 /*=*/
972
973 /***en
974 @brief Symbol for the subset type method of charset.
975
976 The symbol #Msubset has the name <tt>"subset"</tt> and, when used
977 as a value of @b Mmethod parameter of a charset, it means that the
978 charset is a subset of a parent charset. The parent charset must
979 be given by @b Mparents parameter. The conversion of code-points
980 and character codes of the charset is done conceptually by this
981 calculation:
982
    @verbatim
    CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
    @endverbatim

    where, PARENT-CODE is a pseudo function that returns the character
    code of a code-point in the parent charset, and SUBSET-OFFSET is
    the value given by @b Msubset_offset parameter.  */
990
991 /***ja @brief ���֥��åȷ��Υ�åɤ�����ܥ�.
992
993 ����ܥ� #Msubset �� <tt>"subset"</tt> �Ȥ���̾���������ʸ�����åȤ�
994 @b Mmethod �ѥ������ͤȤ����Ѥ���줿���ˤϡ�����ʸ�����åȤ��̤�ʸ�����åȡʿ�ʸ�����åȡˤ���ʬ����Ǥ��뤳�Ȥ��̣���롣
995 ��ʸ�����åȤ� @b Mparents �ѥ����ˤ�ä�Ϳ�����ʤ��ƤϤʤ�ʤ���
996 �����ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ��ϡ���ǰŪ�ˤϰʲ��μ��˽�����
997
    @verbatim
    CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
    @endverbatim
1001
1002 ������ PARENT-CODE �� CODE-POINT
1003 �ο�ʸ�����å���Ǥ�ʸ�������ɤ��֤����ؿ��Ǥ��ꡢSUBSET-OFFSET ��
1004 @b Msubset_offset �ѥ�����Ϳ�������ͤǤ��롣
1005 */
1006
1007 MSymbol Msubset;
1008 /*=*/
1009
1010 /***en
1011 @brief Symbol for the superset type method of charset.
1012
1013 The symbol #Msuperset has the name <tt>"superset"</tt> and, when
1014 used as a value of @b Mmethod parameter of a charset, it means that
1015 the charset is a superset of parent charsets. The parent charsets
1016 must be given by @b Mparents parameter. */
1017
1018 /***ja
1019 @brief �����ѡ����åȷ��Υ�åɤ�����ܥ�.
1020
1021 ����ܥ� #Msuperset �� <tt>"superset"</tt> �Ȥ���̾���������ʸ�����åȤ�
1022 @b Mmethod �ѥ������ͤȤ����Ѥ���줿���ˤϡ�����ʸ�����åȤ��̤�ʸ�����åȡʿ�ʸ�����åȡˤξ�̽���Ǥ��뤳�Ȥ��̣���롣
1023 ��ʸ�����åȤ� @b Mparents �ѥ����ˤ�ä�Ϳ�����ʤ��ƤϤʤ�ʤ���
1024 */
1025
1026 MSymbol Msuperset;
1027 /*=*/
1028 /*** @} */
1029
1030 /***en
1031 @brief Define a charset.
1032
1033 The mchar_define_charset () function defines a new charset and
1034 makes it accessible via a symbol whose name is $NAME. $PLIST
1035 specifies parameters of the charset as below:
1036
1037 <ul>
1038
1039 <li> Key is @b Mmethod, value is a symbol.
1040
1041 The value specifies the method for decoding/encoding code-points
1042 in the charset. It must be #Moffset, #Mmap (default), #Munify,
1043 #Msubset, or #Msuperset.
1044
1045 <li> Key is @b Mdimension, value is an integer
1046
1047 The value specifies the dimension of code-points of the charset.
1048 It must be 1 (default), 2, 3, or 4.
1049
1050 <li> Key is @b Mmin_range, value is an unsigned integer
1051
1052 The value specifies the minimum range of a code-point, which means
1053 that the Nth byte of the value is the minimum Nth byte of
1054 code-points of the charset. The default value is 0.
1055
1056 <li> Key is @b Mmax_range, value is an unsigned integer
1057
1058 The value specifies the maximum range of a code-point, which means
1059 that the Nth byte of the value is the maximum Nth byte of
1060 code-points of the charset. The default value is 0xFF, 0xFFFF,
1061 0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4
1062 respectively.
1063
1064 <li> Key is @b Mmin_code, value is an unsigned integer
1065
1066 The value specifies the minimum code-point of
1067 the charset. The default value is the minimum range.
1068
1069 <li> Key is @b Mmax_code, value is an unsigned integer
1070
1071 The value specifies the maximum code-point of
1072 the charset. The default value is the maximum range.
1073
1074 <li> Key is @b Mascii_compatible, value is a symbol
1075
1076 The value specifies whether the charset is ASCII compatible or
1077 not. If the value is #Mnil (default), it is not ASCII
1078 compatible, else compatible.
1079
1080 <li> Key is @b Mfinal_byte, value is an integer
1081
1082 The value specifies the @e final @e byte of the charset registered
1083 in The International Registry. It must be 0 (default) or 32..127.
1084 The value 0 means that the charset is not in the registry.
1085
1086 <li> Key is @b Mrevision, value is an integer
1087
1088 The value specifies the @e revision @e number of the charset
1089 registered in The International Registry. It must be 0..127. If
1090 the charset is not in The International Registry, the value is
1091 ignored. The value 0 means that the charset has no revision
1092 number.
1093
1094 <li> Key is @b Mmin_char, value is an integer
1095
1096 The value specifies the minimum character code of the charset.
1097 The default value is 0.
1098
1099 <li> Key is @b Mmapfile, value is an M-text
1100
1101 If the method is #Mmap or #Munify, this parameter is required: data
1102 containing the mapping information is added to the m17n database by calling
1103 the function mdatabase_define () with the value as the argument $EXTRA_INFO,
1104 i.e. the value is used as the file name of the data.
1105
1106 Otherwise, this parameter is ignored.
1107
1108 <li> Key is @b Mparents, value is a plist
1109
1110 If the method is #Msubset, the value must be a plist of length
1111 1, and the value of the plist must be a symbol representing a
1112 parent charset.
1113
1114 If the method is #Msuperset, the value must be a plist of length
1115 less than 9, and the values of the plist must be symbols
1116 representing subset charsets.
1117
1118 Otherwise, this parameter is ignored.
1119
1120 <li> Key is @b Mdefine_coding, value is a symbol
1121
1122 If the dimension of the charset is 1 and its byte range is 0..255, the
1123 value specifies whether or not to define a coding system of the same name
1124 whose type is #Mcharset. A coding system is defined if the value is not #Mnil.
1125
1126 Otherwise, this parameter is ignored.
1127
1128 </ul>
1129
1130 @return
1131 If the operation was successful, mchar_define_charset () returns a
1132 symbol whose name is $NAME. Otherwise it returns #Mnil and
1133 assigns an error code to the external variable #merror_code. */
1134
1135 /***ja
1136 @brief ʸ�����åȤ��������.
1137
1138 �ؿ� mchar_define_charset () �Ͽ�����ʸ�����åȤ�������������
1139 $NAME �Ȥ���̾������ĥ���ܥ��ͳ�ǥ��������Ǥ���褦�ˤ��롣
1140 $PLIST ����������ʸ�����åȤΥѥ�����ʲ��Τ褦�˻��ꤹ�롣
1141
1142 <ul>
1143
1144 <li> ������ @b Mmethod ���ͤ�����ܥ�λ�
1145
1146 �ͤϡ�#Moffset, #Mmap (�ǥե������), #Munify, #Msubset,
1147 #Msuperset �Τ����줫�Ǥ��ꡢʸ�����åȤΥ����ɥݥ���Ȥ�ǥ����ɡ������ɤ���ݤΥ�åɤ���ꤹ�롣
1148
1149 <li> ������ @b Mdimension ���ͤ������ͤλ�
1150
1151 �ͤϡ�1 (�ǥե������), 2, 3, 4
1152 �Τ����줫�Ǥ��ꡢʸ�����åȤΥ����ɥݥ���Ȥμ����Ǥ��롣
1153
1154 <li> ������ @b Mmin_range ���ͤ����������ͤλ�
1155
1156 �ͤϥ����ɥݥ���ȤκǾ����ͤǤ��롣���ʤ���������ͤ� N
1157 ���ܤΥХ��ȤϤ���ʸ�����åȤΥ����ɥݥ���Ȥ� N ���ܤΥХ��ȤκǾ��Τ�ΤȤʤ롣
1158 �ǥե�����ͤ� 0 ��
1159
1160 <li> ������ @b Mmax_range ���ͤ����������ͤλ�
1161
1162 �ͤϥ����ɥݥ���Ȥκ�����ͤǤ��롣���ʤ���������ͤ� N
1163 ���ܤΥХ��ȤϤ���ʸ�����åȤΥ����ɥݥ���Ȥ� N ���ܤΥХ��Ȥκ���Τ�ΤȤʤ롣
1164 �ǥե�����ͤϡ������ɥݥ���Ȥμ����� 1, 2, 3, 4 �λ������줾��
1165 0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF ��
1166
1167 <li> ������ @b Mmin_code ���ͤ����������ͤλ�
1168
1169 �ͤϤ���ʸ�����åȤκǾ��Υ����ɥݥ���ȤǤ��롣�ǥե�����ͤ�
1170 @b Mmin_range ���͡�
1171
1172 <li> ������ @b Mmax_code ���ͤ����������ͤλ�
1173
1174 �ͤϤ���ʸ�����åȤκ���Υ����ɥݥ���ȤǤ��롣�ǥե�����ͤ�
1175 @b Mmax_range ���͡�
1176
1177 <li> ������ @b Mascii_compatible ���ͤ�����ܥ�λ�
1178
1179 �ͤϤ���ʸ�����åȤ� ASCII �ߴ��Ǥ��뤫�ɤ��������ǥե�����ͤ�
1180 #Mnil �Ǥ���иߴ��ǤϤʤ�������ʳ��ξ��ϸߴ��Ǥ��롣
1181
1182 <li> ������ @b Mfinal_byte ���ͤ������ͤλ�
1183
1184 �ͤϤ���ʸ�����åȤ� The International Registry ����Ͽ����Ƥ���
1185 @e ��ü�Х��� �Ǥ��ꡢ0 (�ǥե������) �Ǥ��뤫 32..127 �Ǥ��롣0
1186 ����Ͽ����Ƥ��ʤ����Ȥ��̣���롣
1187
1188 <li> ������ @b Mrevision ���ͤ������ͤλ�
1189
1190 �ͤ� The International Registry ����Ͽ����Ƥ��� @e revision @e
1191 number �Ǥ��ꡢ0..127 �Ǥ��롣
1192 ʸ�����åȤ���Ͽ����Ƥ��ʤ����ˤϤ����ͤ�̵�뤵��롣
1193 0 �� revision number ��¸�ߤ��ʤ����Ȥ��̣���롣
1194
1195 <li> ������ @b Mmin_char ���ͤ������ͤλ�
1196
1197 �ͤϤ���ʸ�����åȤκǾ���ʸ�������ɤǤ��롣�ǥե�����ͤ� 0 ��
1198
1199 <li> ������ @b Mmapfile ���ͤ� M-text �λ�
1200
1201 ��åɤ� #Mmap �� #Munify �λ����ؿ� mdatabase_define ()
1202 ���ͤ���� $EXTRA_INFO �Ȥ��ƸƤ֤��Ȥˤ�äơ��ޥåԥ˴ؤ���ǡ�����
1203 m17n �ǡ����١������ɲä���롣
1204 ���ʤ���������ͤϥǡ����ե������̾���Ǥ��롣
1205
1206 �����Ǥʤ���С����Υѥ�����̵�뤵��롣
1207
1208 <li> ������ @b Mparents ���ͤ� plist �λ�
1209
1210 ��åɤ� #Msubset �ʤ�С��ͤ�Ĺ�� 1 �� plist
1211 �Ǥ��ꡢ�����ͤϤ���ʸ�����åȤξ�̽���Ȥʤ�ʸ�����åȤ�����ܥ�Ǥ��롣
1212
1213 ��åɤ� #Msuperset �ʤ�С��ͤ�Ĺ�� 8 �ʲ��� plist
1214 �Ǥ��ꡢ�������ͤϤ���ʸ�����åȤβ��̽���Ǥ���ʸ�����åȤ�����ܥ�Ǥ��롣
1215
1216 �����Ǥʤ���С����Υѥ�����̵�뤵��롣
1217
1218 <li> ������ @b Mdefine_coding ���ͤ�����ܥ�λ�
1219
1220 ʸ�����åȤμ����� 1 �ʤ�С��ͤ� #Mnil �ʳ��ξ��� #Mcharset ��
1221 ��Ʊ��̾������ĥ����ɷϤ�������롣
1222
1223 �����Ǥʤ���С����Υѥ�����̵�뤵��롣
1224
1225 </ul>
1226
1227 @return
1228 ���������������mchar_define_charset() �� $NAME
1229 �Ȥ���̾���Υ���ܥ���֤��������Ǥʤ���� #Mnil ���֤��������ѿ�
1230 #merror_code �˥��顼�����ɤ����ꤹ�롣*/
1231
1232 /***
1233 @errors
1234 @c MERROR_CHARSET */
1235
1236 MSymbol
mchar_define_charset(const char * name,MPlist * plist)1237 mchar_define_charset (const char *name, MPlist *plist)
1238 {
1239 MSymbol sym = msymbol (name);
1240 MCharset *charset;
1241 int i;
1242 unsigned min_range, max_range;
1243 MPlist *pl;
1244 MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
1245
1246 MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1247 charset->name = sym;
1248 charset->method = (MSymbol) mplist_get (plist, Mmethod);
1249 if (! charset->method)
1250 {
1251 if (mapfile)
1252 charset->method = Mmap;
1253 else
1254 charset->method = Moffset;
1255 }
1256 if (charset->method == Mmap || charset->method == Munify)
1257 {
1258 if (! mapfile)
1259 MERROR (MERROR_CHARSET, Mnil);
1260 mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
1261 }
1262 if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1263 charset->dimension = 1;
1264
1265 min_range = (unsigned) mplist_get (plist, Mmin_range);
1266 if ((pl = mplist_find_by_key (plist, Mmax_range)))
1267 {
1268 max_range = (unsigned) MPLIST_VAL (pl);
1269 if (max_range >= 0x1000000)
1270 charset->dimension = 4;
1271 else if (max_range >= 0x10000 && charset->dimension < 3)
1272 charset->dimension = 3;
1273 else if (max_range >= 0x100 && charset->dimension < 2)
1274 charset->dimension = 2;
1275 }
1276 else if (charset->dimension == 1)
1277 max_range = 0xFF;
1278 else if (charset->dimension == 2)
1279 max_range = 0xFFFF;
1280 else if (charset->dimension == 3)
1281 max_range = 0xFFFFFF;
1282 else
1283 max_range = 0xFFFFFFFF;
1284
1285 memset (charset->code_range, 0, sizeof charset->code_range);
1286 for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1287 {
1288 charset->code_range[i * 4] = min_range & 0xFF;
1289 charset->code_range[i * 4 + 1] = max_range & 0xFF;
1290 }
1291 if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1292 charset->min_code = min_range;
1293 if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1294 charset->max_code = max_range;
1295 charset->ascii_compatible
1296 = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1297 charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1298 charset->revision = (int) mplist_get (plist, Mrevision);
1299 charset->min_char = (int) mplist_get (plist, Mmin_char);
1300 pl = (MPlist *) mplist_get (plist, Mparents);
1301 charset->nparents = pl ? mplist_length (pl) : 0;
1302 if (charset->nparents > 8)
1303 charset->nparents = 8;
1304 for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1305 {
1306 MSymbol parent_name;
1307
1308 if (MPLIST_KEY (pl) != Msymbol)
1309 MERROR (MERROR_CHARSET, Mnil);
1310 parent_name = MPLIST_SYMBOL (pl);
1311 if (! (charset->parents[i] = MCHARSET (parent_name)))
1312 MERROR (MERROR_CHARSET, Mnil);
1313 }
1314
1315 charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
1316
1317 msymbol_put (sym, Mcharset, charset);
1318 charset = make_charset (charset);
1319 if (! charset)
1320 return Mnil;
1321 msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
1322
1323 for (pl = (MPlist *) mplist_get (plist, Maliases);
1324 pl && MPLIST_KEY (pl) == Msymbol;
1325 pl = MPLIST_NEXT (pl))
1326 {
1327 MSymbol alias = MPLIST_SYMBOL (pl);
1328
1329 msymbol_put (alias, Mcharset, charset);
1330 msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
1331 }
1332
1333 if (mplist_get (plist, Mdefine_coding)
1334 && charset->dimension == 1
1335 && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1336 mconv__register_charset_coding (sym);
1337 return (sym);
1338 }
1339
1340 /*=*/
1341
1342 /***en
1343 @brief Resolve charset name.
1344
1345 The mchar_resolve_charset () function returns $SYMBOL if it
1346 represents a charset. Otherwise, canonicalize $SYMBOL as to a
1347 charset name, and if the canonicalized name represents a charset,
1348 return it. Otherwise, return #Mnil. */
1349
1350 /***ja
1351 @brief ʸ�����å�̾���褹��.
1352
1353 �ؿ� mchar_resolve_charset () �� $SYMBOL
1354 ��ʸ�����åȤ��Ƥ���Ф�����֤���
1355
1356 �����Ǥʤ���С�$SYMBOL ��ʸ�����å�̾�Ȥ����������������줬ʸ�����åȤ��Ƥ��Ƥ����������������Τ��֤���
1357 �����Ǥʤ���С�#Mnil ���֤��� */
1358
1359 MSymbol
mchar_resolve_charset(MSymbol symbol)1360 mchar_resolve_charset (MSymbol symbol)
1361 {
1362 MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
1363
1364 if (! charset)
1365 {
1366 symbol = msymbol__canonicalize (symbol);
1367 charset = (MCharset *) msymbol_get (symbol, Mcharset);
1368 }
1369
1370 return (charset ? charset->name : Mnil);
1371 }
1372
1373 /*=*/
1374
1375 /***en
1376 @brief List symbols representing charsets.
1377
1378 The mchar_list_charset () function makes an array of symbols
1379 representing charsets, stores the pointer to the array in a place
1380 pointed to by $SYMBOLS, and returns the length of the array. */
1381
1382 /***ja
1383 @brief ʸ�����åȤ�ɽ�魯����ܥ�����.
1384
1385 �ؿ� mchar_list_charsets ()
1386 �ϡ�ʸ�����åȤ�����ܥ���¤٤�������ꡢ$SYMBOLS
1387 �ǥݥ���Ȥ��줿���ˤ�������ؤΥݥ����֤��������Ĺ�����֤��� */
1388
1389 int
mchar_list_charset(MSymbol ** symbols)1390 mchar_list_charset (MSymbol **symbols)
1391 {
1392 int i;
1393
1394 MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
1395 for (i = 0; i < charset_list.used; i++)
1396 (*symbols)[i] = charset_list.charsets[i]->name;
1397 return i;
1398 }
1399
1400 /*=*/
1401
1402 /***en
1403 @brief Decode a code-point.
1404
1405 The mchar_decode () function decodes code-point $CODE in the
1406 charset represented by the symbol $CHARSET_NAME to get a character
1407 code.
1408
1409 @return
1410 If decoding was successful, mchar_decode () returns the decoded
1411 character code. Otherwise it returns -1. */
1412
1413 /***ja
1414 @brief �����ɥݥ���Ȥ�ǥ����ɤ���.
1415
1416 �ؿ� mchar_decode () �ϡ�����ܥ� $CHARSET_NAME �Ǽ������ʸ�����å����
1417 $CODE �Ȥ��������ɥݥ���Ȥ�ǥ����ɤ���ʸ�������ɤ����롣
1418
1419 @return
1420 �ǥ����ɤ���������С�mchar_decode () �ϥǥ����ɤ��줿ʸ�������ɤ��֤���
1421 �����Ǥʤ���� -1 ���֤��� */
1422
1423 /***
1424 @seealso
1425 mchar_encode () */
1426
1427 int
mchar_decode(MSymbol charset_name,unsigned code)1428 mchar_decode (MSymbol charset_name, unsigned code)
1429 {
1430 MCharset *charset = MCHARSET (charset_name);
1431
1432 if (! charset)
1433 return MCHAR_INVALID_CODE;
1434 return DECODE_CHAR (charset, code);
1435 }
1436
1437 /*=*/
1438
1439 /***en
1440 @brief Encode a character code.
1441
1442 The mchar_encode () function encodes character code $C to get a
1443 code-point in the charset represented by the symbol $CHARSET_NAME.
1444
1445 @return
1446 If encoding was successful, mchar_encode () returns the encoded
1447 code-point. Otherwise it returns #MCHAR_INVALID_CODE. */
1448
1449 /***ja
1450 @brief ʸ�������ɤ��ɤ���.
1451
1452 �ؿ� mchar_encode () �ϡ�ʸ�������� $C ���ɤ��ƥ���ܥ�
1453 $CHARSET_NAME �Ǽ������ʸ�����å���ˤ����륳���ɥݥ���Ȥ����롣
1454
1455 @return
1456 �����ɤ���������С�mchar_encode () �ϥ��ɤ��줿�����ɥݥ���Ȥ��֤���
1457 �����Ǥʤ���� #MCHAR_INVALID_CODE ���֤��� */
1458
1459 /***
1460 @seealso
1461 mchar_decode () */
1462
1463 unsigned
mchar_encode(MSymbol charset_name,int c)1464 mchar_encode (MSymbol charset_name, int c)
1465 {
1466 MCharset *charset = MCHARSET (charset_name);
1467
1468 if (! charset)
1469 return MCHAR_INVALID_CODE;
1470 return ENCODE_CHAR (charset, c);
1471 }
1472
1473 /*=*/
1474
1475 /***en
1476 @brief Call a function for all the characters in a specified charset.
1477
1478 The mchar_map_charset () function calls $FUNC for all the
1479 characters in the charset named $CHARSET_NAME. A call is done for
1480 a chunk of consecutive characters rather than character by
1481 character.
1482
1483 $FUNC receives three arguments: $FROM, $TO, and $ARG. $FROM and
1484 $TO specify the range of character codes in the charset. $ARG is
1485 the same as $FUNC_ARG.
1486
1487 @return
1488 If the operation was successful, mchar_map_charset () returns 0.
1489 Otherwise, it returns -1 and assigns an error code to the external
1490 variable #merror_code. */
1491
1492 /***ja
1493 @brief ���ꤷ��ʸ�����åȤΤ��٤Ƥ�ʸ�����Ф��ƴؿ���Ƥ�.
1494
1495 �ؿ� mcharset_map_chars () �� $CHARSET_NAME
1496 �Ȥ���̾�������ʸ�����å���Τ��٤Ƥ�ʸ�����Ф��� $FUNC ��Ƥ֡�
1497 �ƤӽФ��ϰ�ʸ����ǤϤʤ���Ϣ³����ʸ���ΤޤȤޤ�ñ�̤ǹԤʤ��롣
1498
1499 �ؿ� $FUNC �ˤ�$FROM, $TO, $ARG �Σ��������Ϥ���롣$FROM �� $TO
1500 �� $CHARSET ���ʸ�������ɤ��ϰϤ���ꤹ�롣$ARG �� $FUNC_ARG
1501 ��Ʊ���Ǥ��롣
1502
1503 @return
1504 ��������������� mcharset_map_chars () �� 0 ���֤���
1505 �����Ǥʤ���� -1 ���֤��������ѿ� #merror_code �˥��顼�����ɤ����ꤹ�롣 */
1506
1507 /***
1508 @errors
1509 @c MERROR_CHARSET */
1510
1511 int
mchar_map_charset(MSymbol charset_name,void (* func)(int from,int to,void * arg),void * func_arg)1512 mchar_map_charset (MSymbol charset_name,
1513 void (*func) (int from, int to, void *arg),
1514 void *func_arg)
1515 {
1516 MCharset *charset;
1517
1518 charset = MCHARSET (charset_name);
1519 if (! charset)
1520 MERROR (MERROR_CHARSET, -1);
1521
1522 if (charset->encoder)
1523 {
1524 int c = charset->min_char;
1525 int next_c;
1526
1527 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1528 c = next_c;
1529 while (c <= charset->max_char)
1530 {
1531 if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1532 (*func) (c, next_c - 1, func_arg);
1533 c = next_c;
1534 }
1535 }
1536 else
1537 (*func) (charset->min_char, charset->max_char, func_arg);
1538 return 0;
1539 }
1540
1541 /*=*/
1542
1543 /*** @} */
1544
1545 /*
1546 Local Variables:
1547 coding: euc-japan
1548 End:
1549 */
1550