1 /* charset.c -- charset module.
2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3      National Institute of Advanced Industrial Science and Technology (AIST)
4      Registration Number H15PRO112
5 
6    This file is part of the m17n library.
7 
8    The m17n library is free software; you can redistribute it and/or
9    modify it under the terms of the GNU Lesser General Public License
10    as published by the Free Software Foundation; either version 2.1 of
11    the License, or (at your option) any later version.
12 
13    The m17n library is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16    Lesser General Public License for more details.
17 
18    You should have received a copy of the GNU Lesser General Public
19    License along with the m17n library; if not, write to the Free
20    Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21    Boston, MA 02110-1301 USA.  */
22 /***en
23     @addtogroup m17nCharset
24     @brief Charset objects and API for them.
25 
    The m17n library uses @e charset objects to represent coded
    character sets (CCS).  The m17n library supports many predefined
28     coded character sets.  Moreover, application programs can add
29     other charsets.  A character can belong to multiple charsets.
30 
31     The m17n library distinguishes the following three concepts:
32 
33     @li A @e code-point is a number assigned by the CCS to each
34     character.  Code-points may or may not be continuous.  The type
35     @c unsigned is used to represent a code-point.  An invalid
36     code-point is represented by the macro @c MCHAR_INVALID_CODE.
37 
38     @li A @e character @e index is the canonical index of a character
39     in a CCS.  The character that has the character index N occupies
40     the Nth position when all the characters in the current CCS are
41     sorted by their code-points.  Character indices in a CCS are
42     continuous and start with 0.
43 
44     @li A @e character @e code is the internal representation in the
45     m17n library of a character.  A character code is a signed integer
46     of 21 bits or longer.
47 
    Each charset object defines how characters are converted between
    code-points and character codes.  To @e decode means converting
    code-points to character codes and to @e encode means converting
    character codes to code-points.  */
52 
53 /***ja
54     @addtogroup m17nCharset
55     @brief ʸ�����åȥ��֥������ȤȤ���˴ؤ��� API.
56 
57     m17n �饤�֥��ϡ���沽ʸ������ (CCS) �� @e ʸ�����å�
58     �ȸƤ֥��֥������Ȥ�ɽ�����롣
59     m17n �饤�֥���¿������沽ʸ��������餫���᥵�ݡ��Ȥ��Ƥ��뤷�����ץꥱ�������ץ���ब�ȼ���ʸ�����åȤ��ɲä��뤳�Ȥ��ǽ�Ǥ��롣
60     ��Ĥ�ʸ����ʣ����ʸ�����åȤ�°���Ƥ�褤��
61 
62     m17n �饤�֥��ϡ��ʲ��γ�ǰ����̤��Ƥ���:
63 
64     @li @e �����ɥݥ���� �Ȥϡ�CCS ��������θġ���ʸ�����Ф������������ͤǤ��롣
65     �����ɥݥ���Ȥ�Ϣ³���Ƥ���Ȥϸ¤�ʤ��������ɥݥ���Ȥ�
66     @c unsigned ���ˤ�ä�ɽ����롣̵���ʥ����ɥݥ���Ȥϥޥ���
67     @c MCHAR_INVALID_CODE ��ɽ����롣
68 
69     @li @e ʸ������ǥå��� �Ȥϡ�CCS ��dz�ʸ���˳�����Ƥ������������줿����ǥå����Ǥ��롣
70     ʸ������ǥå����� N ��ʸ���ϡ�CCS �����ʸ�������ɥݥ���Ƚ���¤٤��Ȥ��� N ���ܤ˸����롣
71     CCS ���ʸ������ǥå�����Ϣ³���Ƥ��ꡢ0 ����Ϥޤ롣
72 
73     @li @e ʸ�������� �Ȥϡ�m17n �饤�֥����ˤ�����ʸ��������ɽ���Ǥ��ꡢ21 �ӥåȰʾ��Ĺ�����������դ������Ǥ��롣
74 
75     ��ʸ�����åȥ��֥������Ȥϡ�����ʸ�����åȤ�°����ʸ���Υ����ɥݥ���Ȥ�ʸ�������ɤȤδ֤��Ѵ����ꤹ�롣
76     �����ɥݥ���Ȥ���ʸ�������ɤؤ��Ѵ��� @e �ǥ�����
77     �ȸƤӡ�ʸ�������ɤ��饳���ɥݥ���Ȥؤ��Ѵ��� @e �������� �ȸƤ֡�  */
78 
79 
80 /*=*/
81 #if !defined (FOR_DOXYGEN) || defined (DOXYGEN_INTERNAL_MODULE)
82 /*** @addtogroup m17nInternal
83      @{ */
84 
85 #include <config.h>
86 #include <stdio.h>
87 #include <stdlib.h>
88 #include <string.h>
89 #include <limits.h>
90 
91 #include "m17n.h"
92 #include "m17n-misc.h"
93 #include "internal.h"
94 #include "symbol.h"
95 #include "database.h"
96 #include "chartab.h"
97 #include "plist.h"
98 #include "charset.h"
99 #include "coding.h"
100 
/* Boundary of the character-code space still available to charsets
   whose method is Munify.  mcharset__init () starts it at MCHAR_MAX
   and make_charset () lowers it each time such a charset is made.  */
static int unified_max;

/** List of all charsets ever defined.  */

struct MCharsetList
{
  /* USED is the number of valid entries in CHARSETS (mcharset__fini ()
     iterates up to it).  SIZE and INC are presumably the allocated
     capacity and growth step maintained by the MLIST_* macros --
     confirm against their definitions.  */
  int size, inc, used;
  MCharset **charsets;
};

/* Every charset registered by make_charset (); the entries are freed
   by mcharset__fini ().  */
static struct MCharsetList charset_list;

/* Definitions read by mcharset__load_from_database (): each element
   has a charset name as its key and the definition plist as its
   value.  mcharset__find () consults it to define a charset on
   demand.  */
static MPlist *charset_definition_list;
114 
115 /** Make a charset object from the template of MCharset structure
116     CHARSET, and return a pointer to the new charset object.
117     CHARSET->code_range[4N + 2] and CHARSET->code_range[4N + 3] are
118     not yet set.  */
119 
120 static MCharset *
make_charset(MCharset * charset)121 make_charset (MCharset *charset)
122 {
123   unsigned min_code, max_code;
124   int i, n;
125   int *range = charset->code_range;
126 
127   if (charset->dimension < 1 || charset->dimension > 4)
128     MERROR (MERROR_CHARSET, NULL);
129   if ((charset->final_byte > 0 && charset->final_byte < '0')
130       || charset->final_byte > 127)
131     MERROR (MERROR_CHARSET, NULL);
132 
133   for (i = 0, n = 1; i < 4; i++)
134     {
135       if (range[i * 4] > range[i * 4 + 1])
136 	MERROR (MERROR_CHARSET, NULL);
137       range[i * 4 + 2] = range[i * 4 + 1] - range[i * 4] + 1;
138       n *= range[i * 4 + 2];
139       range[i * 4 + 3] = n;
140     }
141 
142   min_code = range[0] | (range[4] << 8) | (range[8] << 16) | (range[12] << 24);
143   if (charset->min_code == 0)
144     charset->min_code = min_code;
145   else if (charset->min_code < min_code)
146     MERROR (MERROR_CHARSET, NULL);
147   max_code = range[1] | (range[5] << 8) | (range[9] << 16) | (range[13] << 24);
148   if (charset->max_code == 0)
149     charset->max_code = max_code;
150   else if (charset->max_code > max_code)
151     MERROR (MERROR_CHARSET, NULL);
152 
153   charset->code_range_min_code = min_code;
154   charset->fully_loaded = 0;
155   charset->simple = 0;
156 
157   if (charset->method == Msubset)
158     {
159       MCharset *parent;
160 
161       if (charset->nparents != 1)
162 	MERROR (MERROR_CHARSET, NULL);
163       parent = charset->parents[0];
164       if (parent->method == Msuperset
165 	  || charset->min_code - charset->subset_offset < parent->min_code
166 	  || charset->max_code - charset->subset_offset > parent->max_code)
167 	MERROR (MERROR_CHARSET, NULL);
168     }
169   else if (charset->method == Msuperset)
170     {
171       if (charset->nparents < 2)
172 	MERROR (MERROR_CHARSET, NULL);
173       for (i = 0; i < charset->nparents; i++)
174 	if (charset->min_code > charset->parents[i]->min_code
175 	    || charset->max_code < charset->parents[i]->max_code)
176 	  MERROR (MERROR_CHARSET, NULL);
177     }
178   else
179     {
180       charset->no_code_gap
181 	= (charset->dimension == 1
182 	   || (range[2] == 256
183 	       && (charset->dimension == 2
184 		   || (range[6] == 256
185 		       && (charset->dimension == 3
186 			   || range[10] == 256)))));
187 
188       if (! charset->no_code_gap)
189 	{
190 	  int j;
191 
192 	  memset (charset->code_range_mask, 0,
193 		  sizeof charset->code_range_mask);
194 	  for (i = 0; i < 4; i++)
195 	    for (j = range[i * 4]; j <= range[i * 4 + 1]; j++)
196 	      charset->code_range_mask[j] |= (1 << i);
197 	}
198 
199       if (charset->method == Moffset)
200 	{
201 	  charset->max_char = charset->min_char + range[15] - 1;
202 	  if (charset->min_char < 0
203 	      || charset->max_char < 0 || charset->max_char > unified_max)
204 	    MERROR (MERROR_CHARSET, NULL);
205 	  charset->simple = charset->no_code_gap;
206 	  charset->fully_loaded = 1;
207 	}
208       else if (charset->method == Munify)
209 	{
210 	  /* The magic number 12 below is to align to the SUB_BITS_2
211 	     (defined in chartab.c) boundary in a char-table.  */
212 	  unified_max -= ((range[15] >> 12) + 1) << 12;
213 	  charset->unified_max = unified_max;
214 	}
215       else if (charset->method != Mmap)
216 	MERROR (MERROR_CHARSET, NULL);
217     }
218 
219   MLIST_APPEND1 (&charset_list, charsets, charset, MERROR_CHARSET);
220 
221   if (charset->final_byte > 0)
222     {
223       MLIST_APPEND1 (&mcharset__iso_2022_table, charsets, charset,
224 		     MERROR_CHARSET);
225       if (charset->revision <= 0)
226 	{
227 	  int chars = range[2];
228 
229 	  if (chars == 128)	/* ASCII case */
230 	    chars = 94;
231 	  else if (chars == 256) /* ISO-8859-X case */
232 	    chars = 96;
233 	  MCHARSET_ISO_2022 (charset->dimension, chars, charset->final_byte)
234 	    = charset;
235 	}
236     }
237 
238   return charset;
239 }
240 
241 static int
load_charset_fully(MCharset * charset)242 load_charset_fully (MCharset *charset)
243 {
244   if (charset->method == Msubset)
245     {
246       MCharset *parent = charset->parents[0];
247 
248       if (! parent->fully_loaded
249 	  && load_charset_fully (parent) < 0)
250 	MERROR (MERROR_CHARSET, -1);
251       if (parent->method == Moffset)
252 	{
253 	  unsigned code;
254 
255 	  code = charset->min_code - charset->subset_offset;
256 	  charset->min_char = DECODE_CHAR (parent, code);
257 	  code = charset->max_code - charset->subset_offset;
258 	  charset->max_char = DECODE_CHAR (parent, code);
259 	}
260       else
261 	{
262 	  unsigned min_code = charset->min_code - charset->subset_offset;
263 	  unsigned max_code = charset->max_code - charset->subset_offset;
264 	  int min_char = DECODE_CHAR (parent, min_code);
265 	  int max_char = min_char;
266 
267 	  for (++min_code; min_code <= max_code; min_code++)
268 	    {
269 	      int c = DECODE_CHAR (parent, min_code);
270 
271 	      if (c >= 0)
272 		{
273 		  if (c < min_char)
274 		    min_char = c;
275 		  else if (c > max_char)
276 		    max_char = c;
277 		}
278 	    }
279 	  charset->min_char = min_char;
280 	  charset->max_char = max_char;
281 	}
282     }
283   else if (charset->method == Msuperset)
284     {
285       int min_char = 0, max_char = 0;
286       int i;
287 
288       for (i = 0; i < charset->nparents; i++)
289 	{
290 	  MCharset *parent = charset->parents[i];
291 
292 	  if (! parent->fully_loaded
293 	      && load_charset_fully (parent) < 0)
294 	    MERROR (MERROR_CHARSET, -1);
295 	  if (i == 0)
296 	    min_char = parent->min_char, max_char = parent->max_char;
297 	  else if (parent->min_char < min_char)
298 	    min_char = parent->min_char;
299 	  else if (parent->max_char > max_char)
300 	    max_char = parent->max_char;
301 	}
302       charset->min_char = min_char;
303       charset->max_char = max_char;
304     }
305   else 				/* charset->method is Mmap or Munify */
306     {
307       MDatabase *mdb = mdatabase_find (Mcharset, charset->name, Mnil, Mnil);
308       MPlist *plist;
309 
310       if (! mdb || ! (plist = mdatabase_load (mdb)))
311 	MERROR (MERROR_CHARSET, -1);
312       charset->decoder = mplist_value (plist);
313       charset->encoder = mplist_value (mplist_next (plist));
314       M17N_OBJECT_UNREF (plist);
315       mchartable_range (charset->encoder,
316 			&charset->min_char, &charset->max_char);
317       if (charset->method == Mmap)
318 	charset->simple = charset->no_code_gap;
319       else
320 	charset->max_char = charset->unified_max + 1 + charset->code_range[15];
321     }
322 
323   charset->fully_loaded = 1;
324   return 0;
325 }
326 
327 /** Load a data of type @c charset from the file FD.  */
328 
329 static void *
load_charset(FILE * fp,MSymbol charset_name)330 load_charset (FILE *fp, MSymbol charset_name)
331 {
332   MCharset *charset = MCHARSET (charset_name);
333   int *decoder;
334   MCharTable *encoder;
335   int size;
336   int i, c;
337   int found = 0;
338   MPlist *plist;
339 
340   if (! charset)
341     MERROR (MERROR_DB, NULL);
342   size = (charset->code_range[15]
343 	  - (charset->min_code - charset->code_range_min_code));
344   MTABLE_MALLOC (decoder, size, MERROR_DB);
345   for (i = 0; i < size; i++)
346     decoder[i] = -1;
347   encoder = mchartable (Minteger, (void *) MCHAR_INVALID_CODE);
348 
349   while ((c = getc (fp)) != EOF)
350     {
351       unsigned code1, code2, c1, c2;
352       int idx1, idx2;
353       char buf[256];
354 
355       ungetc (c, fp);
356       if (! fgets (buf, 256, fp))
357 	break;
358       if (c != '#')
359 	{
360 	  if (sscanf (buf, "0x%x-0x%x 0x%x", &code1, &code2, &c1) == 3)
361 	    {
362 	      idx1 = CODE_POINT_TO_INDEX (charset, code1);
363 	      if (idx1 >= size)
364 		continue;
365 	      idx2 = CODE_POINT_TO_INDEX (charset, code2);
366 	      if (idx2 >= size)
367 		idx2 = size - 1;
368 	      c2 = c1 + (idx2 - idx1);
369 	    }
370 	  else if (sscanf (buf, "0x%x 0x%x", &code1, &c1) == 2)
371 	    {
372 	      idx1 = idx2 = CODE_POINT_TO_INDEX (charset, code1);
373 	      if (idx1 >= size)
374 		continue;
375 	      c2 = c1;
376 	    }
377 	  else
378 	    continue;
379 	  if (idx1 >= 0 && idx2 >= 0)
380 	    {
381 	      decoder[idx1] = c1;
382 	      mchartable_set (encoder, c1, (void *) code1);
383 	      for (idx1++, c1++; idx1 <= idx2; idx1++, c1++)
384 		{
385 		  code1 = INDEX_TO_CODE_POINT (charset, idx1);
386 		  decoder[idx1] = c1;
387 		  mchartable_set (encoder, c1, (void *) code1);
388 		}
389 	      found++;
390 	    }
391 	}
392     }
393 
394   if (! found)
395     {
396       free (decoder);
397       M17N_OBJECT_UNREF (encoder);
398       return NULL;
399     }
400   plist = mplist ();
401   mplist_add (plist, Mt, decoder);
402   mplist_add (plist, Mt, encoder);
403   return plist;
404 }
405 
406 
407 /* Internal API */
408 
/* One-entry cache of the latest charset lookup, created by
   mcharset__init ().  mcharset__find () stores the looked-up name as
   the key and the MCharset pointer as the value.  */
MPlist *mcharset__cache;

/* Predefined charsets.  They are set at the end of mcharset__init ()
   from the corresponding Mcharset_* symbols.  */
MCharset *mcharset__ascii;
MCharset *mcharset__binary;
MCharset *mcharset__m17n;
MCharset *mcharset__unicode;

/* Table of the charsets that have a positive final byte;
   make_charset () registers them here.  */
MCharsetISO2022Table mcharset__iso_2022_table;
418 
419 /** Initialize charset handler.  */
420 
421 int
mcharset__init()422 mcharset__init ()
423 {
424   MPlist *param, *pl;
425 
426   unified_max = MCHAR_MAX;
427 
428   mdatabase__load_charset_func = load_charset;
429   mcharset__cache = mplist ();
430   mplist_set (mcharset__cache, Mt, NULL);
431 
432   MLIST_INIT1 (&charset_list, charsets, 128);
433   MLIST_INIT1 (&mcharset__iso_2022_table, charsets, 128);
434   charset_definition_list = mplist ();
435 
436   memset (mcharset__iso_2022_table.classified, 0,
437 	  sizeof (mcharset__iso_2022_table.classified));
438 
439   Mmethod = msymbol ("method");
440   Moffset = msymbol ("offset");
441   Mmap = msymbol ("map");
442   Munify = msymbol ("unify");
443   Msubset = msymbol ("subset");
444   Msuperset = msymbol ("superset");
445 
446   Mdimension = msymbol ("dimension");
447   Mmin_range = msymbol ("min-range");
448   Mmax_range = msymbol ("max-range");
449   Mmin_code = msymbol ("min-code");
450   Mmax_code = msymbol ("max-code");
451   Mascii_compatible = msymbol ("ascii-compatible");
452   Mfinal_byte = msymbol ("final-byte");
453   Mrevision = msymbol ("revision");
454   Mmin_char = msymbol ("min-char");
455   Mmapfile = msymbol_as_managing_key ("mapfile");
456   Mparents = msymbol_as_managing_key ("parents");
457   Msubset_offset = msymbol ("subset-offset");
458   Mdefine_coding = msymbol ("define-coding");
459   Maliases = msymbol_as_managing_key ("aliases");
460 
461   param = mplist ();
462   pl = param;
463   /* Setup predefined charsets.  */
464   pl = mplist_add (pl, Mmethod, Moffset);
465   pl = mplist_add (pl, Mmin_range, (void *) 0);
466   pl = mplist_add (pl, Mmax_range, (void *) 0x7F);
467   pl = mplist_add (pl, Mascii_compatible, Mt);
468   pl = mplist_add (pl, Mfinal_byte, (void *) 'B');
469   pl = mplist_add (pl, Mmin_char, (void *) 0);
470   Mcharset_ascii = mchar_define_charset ("ascii", param);
471 
472   mplist_put (param, Mmax_range, (void *) 0xFF);
473   mplist_put (param, Mfinal_byte, NULL);
474   Mcharset_iso_8859_1 = mchar_define_charset ("iso-8859-1", param);
475 
476   mplist_put (param, Mmax_range, (void *) 0x10FFFF);
477   Mcharset_unicode = mchar_define_charset ("unicode", param);
478 
479   mplist_put (param, Mmax_range, (void *) MCHAR_MAX);
480   Mcharset_m17n = mchar_define_charset ("m17n", param);
481 
482   mplist_put (param, Mmax_range, (void *) 0xFF);
483   Mcharset_binary = mchar_define_charset ("binary", param);
484 
485   M17N_OBJECT_UNREF (param);
486 
487   mcharset__ascii = MCHARSET (Mcharset_ascii);
488   mcharset__binary = MCHARSET (Mcharset_binary);
489   mcharset__m17n = MCHARSET (Mcharset_m17n);
490   mcharset__unicode = MCHARSET (Mcharset_unicode);
491 
492   return 0;
493 }
494 
495 void
mcharset__fini(void)496 mcharset__fini (void)
497 {
498   int i;
499   MPlist *plist;
500 
501   for (i = 0; i < charset_list.used; i++)
502     {
503       MCharset *charset = charset_list.charsets[i];
504 
505       if (charset->decoder)
506 	free (charset->decoder);
507       if (charset->encoder)
508 	M17N_OBJECT_UNREF (charset->encoder);
509       free (charset);
510     }
511   M17N_OBJECT_UNREF (mcharset__cache);
512   MLIST_FREE1 (&charset_list, charsets);
513   MLIST_FREE1 (&mcharset__iso_2022_table, charsets);
514   MPLIST_DO (plist, charset_definition_list)
515     M17N_OBJECT_UNREF (MPLIST_VAL (plist));
516   M17N_OBJECT_UNREF (charset_definition_list);
517 }
518 
519 
520 MCharset *
mcharset__find(MSymbol name)521 mcharset__find (MSymbol name)
522 {
523   MCharset *charset;
524 
525   charset = msymbol_get (name, Mcharset);
526   if (! charset)
527     {
528       MPlist *param = mplist_get (charset_definition_list, name);
529 
530       MPLIST_KEY (mcharset__cache) = Mt;
531       if (! param)
532 	return NULL;
533       param = mplist__from_plist (param);
534       mchar_define_charset (MSYMBOL_NAME (name), param);
535       charset = msymbol_get (name, Mcharset);
536       M17N_OBJECT_UNREF (param);
537     }
538   MPLIST_KEY (mcharset__cache) = name;
539   MPLIST_VAL (mcharset__cache) = charset;
540   return charset;
541 }
542 
543 
544 /** Return the character corresponding to code-point CODE in CHARSET.
545     If CODE is invalid for CHARSET, return -1.  */
546 
547 int
mcharset__decode_char(MCharset * charset,unsigned code)548 mcharset__decode_char (MCharset *charset, unsigned code)
549 {
550   int idx;
551 
552   if (code < 128 && charset->ascii_compatible)
553     return (int) code;
554   if (code < charset->min_code || code > charset->max_code)
555     return -1;
556 
557   if (! charset->fully_loaded
558       && load_charset_fully (charset) < 0)
559     MERROR (MERROR_CHARSET, -1);
560 
561   if (charset->method == Msubset)
562     {
563       MCharset *parent = charset->parents[0];
564 
565       code -= charset->subset_offset;
566       return DECODE_CHAR (parent, code);
567     }
568 
569   if (charset->method == Msuperset)
570     {
571       int i;
572 
573       for (i = 0; i < charset->nparents; i++)
574 	{
575 	  MCharset *parent = charset->parents[i];
576 	  int c = DECODE_CHAR (parent, code);
577 
578 	  if (c >= 0)
579 	    return c;
580 	}
581       return -1;
582     }
583 
584   idx = CODE_POINT_TO_INDEX (charset, code);
585   if (idx < 0)
586     return -1;
587 
588   if (charset->method == Mmap)
589     return charset->decoder[idx];
590 
591   if (charset->method == Munify)
592     {
593       int c = charset->decoder[idx];
594 
595       if (c < 0)
596 	c = charset->unified_max + 1 + idx;
597       return c;
598     }
599 
600   /* Now charset->method should be Moffset.  */
601   return (charset->min_char + idx);
602 }
603 
604 
605 /** Return the code point of character C in CHARSET.  If CHARSET does not
606     contain C, return MCHAR_INVALID_CODE.  */
607 
608 unsigned
mcharset__encode_char(MCharset * charset,int c)609 mcharset__encode_char (MCharset *charset, int c)
610 {
611   if (! charset->fully_loaded
612       && load_charset_fully (charset) < 0)
613     MERROR (MERROR_CHARSET, MCHAR_INVALID_CODE);
614 
615   if (charset->method == Msubset)
616     {
617       MCharset *parent = charset->parents[0];
618       unsigned code = ENCODE_CHAR (parent, c);
619 
620       if (code == MCHAR_INVALID_CODE)
621 	return code;
622       code += charset->subset_offset;
623       if (code >= charset->min_code && code <= charset->max_code)
624 	return code;
625       return MCHAR_INVALID_CODE;
626     }
627 
628   if (charset->method == Msuperset)
629     {
630       int i;
631 
632       for (i = 0; i < charset->nparents; i++)
633 	{
634 	  MCharset *parent = charset->parents[i];
635 	  unsigned code = ENCODE_CHAR (parent, c);
636 
637 	  if (code != MCHAR_INVALID_CODE)
638 	    return code;
639 	}
640       return MCHAR_INVALID_CODE;
641     }
642 
643   if (c < charset->min_char || c > charset->max_char)
644     return MCHAR_INVALID_CODE;
645 
646   if (charset->method == Mmap)
647     return (unsigned) mchartable_lookup (charset->encoder, c);
648 
649   if (charset->method == Munify)
650     {
651       if (c > charset->unified_max)
652 	{
653 	  c -= charset->unified_max - 1;
654 	  return INDEX_TO_CODE_POINT (charset, c);
655 	}
656       return (unsigned) mchartable_lookup (charset->encoder, c);
657     }
658 
659   /* Now charset->method should be Moffset */
660   c -= charset->min_char;
661   return INDEX_TO_CODE_POINT (charset, c);
662 }
663 
664 int
mcharset__load_from_database()665 mcharset__load_from_database ()
666 {
667   MDatabase *mdb = mdatabase_find (msymbol ("charset-list"), Mnil, Mnil, Mnil);
668   MPlist *def_list, *plist;
669   MPlist *definitions = charset_definition_list;
670   int mdebug_flag = MDEBUG_CHARSET;
671 
672   if (! mdb)
673     return 0;
674   MDEBUG_PUSH_TIME ();
675   def_list = (MPlist *) mdatabase_load (mdb);
676   MDEBUG_PRINT_TIME ("CHARSET", (mdebug__output, " to load data."));
677   MDEBUG_POP_TIME ();
678   if (! def_list)
679     return -1;
680 
681   MDEBUG_PUSH_TIME ();
682   MPLIST_DO (plist, def_list)
683     {
684       MPlist *pl, *p;
685       MSymbol name;
686 
687       if (! MPLIST_PLIST_P (plist))
688 	MERROR (MERROR_CHARSET, -1);
689       pl = MPLIST_PLIST (plist);
690       if (! MPLIST_SYMBOL_P (pl))
691 	MERROR (MERROR_CHARSET, -1);
692       name = MPLIST_SYMBOL (pl);
693       pl = MPLIST_NEXT (pl);
694       definitions = mplist_add (definitions, name, pl);
695       M17N_OBJECT_REF (pl);
696       p = mplist__from_plist (pl);
697       mchar_define_charset (MSYMBOL_NAME (name), p);
698       M17N_OBJECT_UNREF (p);
699     }
700 
701   M17N_OBJECT_UNREF (def_list);
702   MDEBUG_PRINT_TIME ("CHARSET", (mdebug__output, " to parse the loaded data."));
703   MDEBUG_POP_TIME ();
704   return 0;
705 }
706 
707 /*** @} */
708 #endif /* !FOR_DOXYGEN || DOXYGEN_INTERNAL_MODULE */
709 
710 
711 /* External API */
712 
713 /*** @addtogroup m17nCharset */
714 /*** @{ */
715 /*=*/
716 
717 #ifdef FOR_DOXYGEN
718 /***en
719     @brief Invalid code-point.
720 
721     The macro #MCHAR_INVALID_CODE gives the invalid code-point.  */
722 
723 /***ja
724     @brief ̵���ʥ����ɥݥ����.
725 
726     �ޥ��� #MCHAR_INVALID_CODE ��̵���ʥ����ɥݥ���Ȥ�����  */
727 
728 #define MCHAR_INVALID_CODE
729 #endif
730 
731 /*=*/
732 
733 /***en
734     @name Variables: Symbols representing a charset.
735 
736     Each of the following symbols represents a predefined charset.  */
737 
738 /***ja
739     @name �ѿ�: ʸ�����åȤ�ɽ����������Ѥߥ���ܥ�.
740 
741     �ʲ��γƥ���ܥ�ϡ�����Ѥ�ʸ�����åȤ�ɽ�����롣  */
742 /*=*/
743 /*** @{ */
744 /*=*/
745 /***en
746     @brief Symbol representing the charset ASCII.
747 
748     The symbol #Mcharset_ascii has name <tt>"ascii"</tt> and represents
749     the charset ISO 646, USA Version X3.4-1968 (ISO-IR-6).  */
750 /***ja
751     @brief ASCII ʸ�����åȤ�ɽ�����륷��ܥ�.
752 
753     ����ܥ� #Mcharset_ascii �� <tt>"ascii"</tt> �Ȥ���̾���������
754     ISO 646, USA Version X3.4-1968 (ISO-IR-6) ʸ�����åȤ�ɽ�����롣
755      */
756 
MSymbol Mcharset_ascii;		/* Set in mcharset__init ().  */
758 
759 /*=*/
760 /***en
761     @brief Symbol representing the charset ISO/IEC 8859/1.
762 
763     The symbol #Mcharset_iso_8859_1 has name <tt>"iso-8859-1"</tt>
764     and represents the charset ISO/IEC 8859-1:1998.  */
765 /***ja
766     @brief ISO/IEC 8859-1:1998 ʸ�����åȤ�ɽ�����륷��ܥ�.
767 
768     ����ܥ� #Mcharset_iso_8859_1 �� <tt>"iso-8859-1"</tt>
769     �Ȥ���̾���������ISO/IEC 8859-1:1998 ʸ�����åȤ�ɽ�����롣
770     */
771 
MSymbol Mcharset_iso_8859_1;	/* Set in mcharset__init ().  */
773 
774 /***en
775     @brief Symbol representing the charset Unicode.
776 
777     The symbol #Mcharset_unicode has name <tt>"unicode"</tt> and
778     represents the charset Unicode.  */
779 /***ja
780     @brief Unicode ʸ�����åȤ�ɽ�����륷��ܥ�.
781 
782     ����ܥ� #Mcharset_unicode �� <tt>"unicode"</tt>
783     �Ȥ���̾���������Unicode ʸ�����åȤ�ɽ�����롣 */
784 
MSymbol Mcharset_unicode;	/* Set in mcharset__init ().  */
786 
787 /*=*/
788 /***en
789     @brief Symbol representing the largest charset.
790 
791     The symbol #Mcharset_m17n has name <tt>"m17n"</tt> and
792     represents the charset that contains all characters supported by
793     the m17n library.  */
794 /***ja
795     @brief ��ʸ����ޤ�ʸ�����åȤ�ɽ�����륷��ܥ�.
796 
797     ����ܥ� #Mcharset_m17n �� <tt>"m17n"</tt> �Ȥ���̾���������
798     m17n �饤�֥�꤬�������Ƥ�ʸ����ޤ�ʸ�����åȤ�ɽ�����롣 */
799 
MSymbol Mcharset_m17n;		/* Set in mcharset__init ().  */
801 
802 /*=*/
803 /***en
804     @brief Symbol representing the charset for ill-decoded characters.
805 
806     The symbol #Mcharset_binary has name <tt>"binary"</tt> and
807     represents the fake charset which the decoding functions put to an
808     M-text as a text property when they encounter an invalid byte
809     (sequence).
810 
811     See @ref m17nConv for more details.  */
812 
813 /***ja
814     @brief �������ǥ����ɤǤ��ʤ�ʸ����ʸ�����åȤ�ɽ�����륷��ܥ�.
815 
816     ����ܥ� #Mcharset_binary �� <tt>"binary"</tt>
817     �Ȥ���̾������������� (fake) ʸ�����åȤ�ɽ�����롣
818     �ǥ����ɴؿ��ϡ�M-text �Υƥ����ȥץ�ѥƥ��Ȥ��ơ�̵���ʥХ��ȡʥ����������ˤ������������֤��ղä��롣
819 
820      �ܺ٤� @ref m17nConv ���ȤΤ��ȡ� */
821 
MSymbol Mcharset_binary;	/* Set in mcharset__init ().  */
823 
824 /*=*/
825 /*** @} */
826 /*=*/
827 
828 /***en
829     @name Variables: Parameter keys for mchar_define_charset ().
830 
831     These are the predefined symbols to use as parameter keys for the
832     function mchar_define_charset () (which see).  */
833 
834 /***ja
835     @name �ѿ�: mchar_define_charset �ѤΥѥ�᡼��������
836 
837     �����ϡ��ؿ� mchar_define_charset () �ѤΥѥ�᡼���������Ȥ��ƻȤ��륷��ܥ�Ǥ��롣
838     �ܤ����Ϥ��δؿ��β�����ȤΤ��ȡ�*/
839 /*** @{ */
840 /*=*/
841 
/* All keys below are interned in mcharset__init (); the trailing
   comment gives the symbol name.  "managing" marks the keys created
   with msymbol_as_managing_key ().  */
MSymbol Mmethod;		/* "method" */
MSymbol Mdimension;		/* "dimension" */
MSymbol Mmin_range;		/* "min-range" */
MSymbol Mmax_range;		/* "max-range" */
MSymbol Mmin_code;		/* "min-code" */
MSymbol Mmax_code;		/* "max-code" */
MSymbol Mascii_compatible;	/* "ascii-compatible" */
MSymbol Mfinal_byte;		/* "final-byte" */
MSymbol Mrevision;		/* "revision" */
MSymbol Mmin_char;		/* "min-char" */
MSymbol Mmapfile;		/* "mapfile", managing */
MSymbol Mparents;		/* "parents", managing */
MSymbol Msubset_offset;		/* "subset-offset" */
MSymbol Mdefine_coding;		/* "define-coding" */
MSymbol Maliases;		/* "aliases", managing */
857 /*=*/
858 /*** @} */
859 /*=*/
860 
861 /***en
862     @name Variables: Symbols representing charset methods.
863 
864     These are the predefined symbols that can be a value of the
865     @b Mmethod parameter of a charset used in an argument to the
866     mchar_define_charset () function.
867 
868     A method specifies how code-points and character codes are
869     converted.  See the documentation of the mchar_define_charset ()
870     function for the details.  */
871 
872 /***ja
873     @name �ѿ�: ʸ�����åȤΥ᥽�åɻ���˻Ȥ��륷��ܥ�
874 
875     �����ϡ�ʸ�����åȤ� @e �᥽�å� ����ꤹ�뤿�������Ѥߥ���ܥ�Ǥ��ꡢʸ�����åȤ�
876     @b Mmethod �ѥ�᡼�����ͤȤʤ뤳�Ȥ��Ǥ��롣
877     �����ͤϴؿ� mchar_define_charset () �ΰ����Ȥ��ƻȤ��롣
878 
879     �᥽�åɤȤϡ������ɥݥ���Ȥ�ʸ�������ɤ�����Ѵ�����ݤ������Τ��ȤǤ��롣
880     �ܤ����ϴؿ� mchar_define_charset () �β�����ȤΤ��ȡ�  */
881 /*** @{ */
882 /*=*/
883 /***en
884     @brief Symbol for the offset type method of charset.
885 
886     The symbol #Moffset has the name <tt>"offset"</tt> and, when used
887     as a value of @b Mmethod parameter of a charset, it means that the
888     conversion of code-points and character codes of the charset is
889     done by this calculation:
890 
891 @verbatim
892 CHARACTER-CODE = CODE-POINT - MIN-CODE + MIN-CHAR
893 @endverbatim
894 
895     where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
896     and MIN-CHAR is a value of @b Mmin_char parameter.  */
897 
898 /***ja
899     @brief ���ե��åȷ��Υ᥽�åɤ�������ܥ�.
900 
901     ����ܥ� #Moffset �� <tt>"offset"</tt> �Ȥ���̾���������ʸ�����åȤ�
902     @b Mmethod �ѥ�᡼�����ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ����ʲ��μ��˽��äƹԤ��뤳�Ȥ��̣���롣
903 
904 @verbatim
905 ʸ�������� = �����ɥݥ���� - MIN-CODE + MIN-CHAR
906 @endverbatim
907 
908     �����ǡ�MIN-CODE ��ʸ�����åȤ� @b Mmin_code �ѥ�᡼�����ͤǤ��ꡢMIN-CHAR ��
909     @b Mmin_char �ѥ�᡼�����ͤǤ��롣 */
910 
MSymbol Moffset;		/* Interned in mcharset__init ().  */
912 /*=*/
913 
914 /***en @brief Symbol for the map type method of charset.
915 
916     The symbol #Mmap has the name <tt>"map"</tt> and, when used as a
917     value of @b Mmethod parameter of a charset, it means that the
918     conversion of code-points and character codes of the charset is
919     done by map looking up.  The map must be given by @b Mmapfile
920     parameter.  */
921 
922 /***ja @brief �ޥå׷��Υ᥽�åɤ�������ܥ�.
923 
924     ����ܥ� #Mmap �� <tt>"map"</tt> �Ȥ���̾���������ʸ�����åȤ�
925     @b Mmethod �ѥ�᡼�����ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ����ޥåפ��Ȥ��뤳�Ȥˤ�äƹԤ��뤳�Ȥ��̣���롣
926     �ޥåפ� @b Mmapfile �ѥ�᡼���Ȥ���Ϳ���ʤ���Фʤ�ʤ��� */
927 
MSymbol Mmap;			/* Interned in mcharset__init ().  */
929 /*=*/
930 
931 /***en @brief Symbol for the unify type method of charset.
932 
933     The symbol #Munify has the name <tt>"unify"</tt> and, when used as
934     a value of @b Mmethod parameter of a charset, it means that the
935     conversion of code-points and character codes of the charset is
936     done by map looking up and offsetting.  The map must be given by
937     @b Mmapfile parameter.  For this kind of charset, a unique
938     continuous character code space for all characters is assigned.
939 
940     If the map has an entry for a code-point, the conversion is done
941     by looking up the map.  Otherwise, the conversion is done by this
942     calculation:
943 
944 @verbatim
945 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
946 @endverbatim
947 
948     where, MIN-CODE is a value of @b Mmin_code parameter of the charset,
949     and LOWEST-CHAR-CODE is the lowest character code of the assigned
950     code space.  */
951 
952 /***ja @brief ��˥ե������Υ᥽�åɤ�������ܥ�.
953 
954     ����ܥ� #Munify �� <tt>"unify"</tt> �Ȥ���̾���������ʸ�����åȤ�
955     @b Mmethod �ѥ�᡼�����ͤȤ����Ѥ���줿���ˤϡ������ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ������ޥåפλ��Ȥȥ��ե��åȤ��Ȥ߹�碌�ˤ�äƹԤ��뤳�Ȥ��̣���롣
956     �ޥåפ� @b Mmapfile �ѥ�᡼���Ȥ���Ϳ���ʤ���Фʤ�ʤ���
957     ���μ�γ�ʸ�����åȤˤϡ���ʸ�����Ф���Ϣ³���륳���ɥ��ڡ��������줾�������Ƥ��롣
958 
959     �����ɥݥ���Ȥ��ޥåפ˴ޤޤ�Ƥ���С��Ѵ��ϥޥå׻��Ȥˤ�äƹԤ��롣
960     �����Ǥʤ���С��ʲ��μ��˽�����
961 
962 @verbatim
963 CHARACTER-CODE = CODE-POINT - MIN-CODE + LOWEST-CHAR-CODE
964 @endverbatim
965 
966     �����ǡ�MIN-CODE ��ʸ�����åȤ� @b Mmin_code �ѥ�᡼�����ͤǤ��ꡢ
967     LOWEST-CHAR-CODE �ϳ�����Ƥ�줿�����ɥ��ڡ����κǤ⾮����ʸ�������ɤǤ��롣
968     */
969 
MSymbol Munify;			/* Interned in mcharset__init ().  */
971 /*=*/
972 
973 /***en
974     @brief Symbol for the subset type method of charset.
975 
976     The symbol #Msubset has the name <tt>"subset"</tt> and, when used
977     as a value of @b Mmethod parameter of a charset, it means that the
978     charset is a subset of a parent charset.  The parent charset must
979     be given by @b Mparents parameter.  The conversion of code-points
980     and character codes of the charset is done conceptually by this
981     calculation:
982 
@verbatim
CHARACTER-CODE = PARENT-CODE (CODE-POINT - SUBSET-OFFSET)
@endverbatim

    where, PARENT-CODE is a pseudo function that returns the character
    code of a code-point in the parent charset, and SUBSET-OFFSET is a
    value given by @b Msubset_offset parameter.  */
990 
991 /***ja @brief ���֥��åȷ��Υ᥽�åɤ�������ܥ�.
992 
993     ����ܥ� #Msubset �� <tt>"subset"</tt> �Ȥ���̾���������ʸ�����åȤ�
994     @b Mmethod �ѥ�᡼�����ͤȤ����Ѥ���줿���ˤϡ�����ʸ�����åȤ��̤�ʸ�����åȡʿ�ʸ�����åȡˤ���ʬ����Ǥ��뤳�Ȥ��̣���롣
995     ��ʸ�����åȤ� @b Mparents �ѥ�᡼���ˤ�ä�Ϳ�����ʤ��ƤϤʤ�ʤ���
996     �����ɥݥ���Ȥ�ʸ�����åȤ�ʸ�������ɤδ֤��Ѵ��ϡ���ǰŪ�ˤϰʲ��μ��˽�����
997 
998 @verbatim
999 CHARACTER-CODE = PARENT-CODE (CODE-POINT) + SUBSET-OFFSET
1000 @endverbatim
1001 
1002     ������ PARENT-CODE �� CODE-POINT
1003     �ο�ʸ�����å���Ǥ�ʸ�������ɤ��֤����ؿ��Ǥ��ꡢSUBSET-OFFSET ��
1004     @b Msubset_offset �ѥ�᡼����Ϳ�������ͤǤ��롣
1005     */
1006 
MSymbol Msubset;  /* Mmethod value selecting the "subset" method; see the comment above.  */
1008 /*=*/
1009 
1010 /***en
1011     @brief Symbol for the superset type method of charset.
1012 
1013     The symbol #Msuperset has the name <tt>"superset"</tt> and, when
1014     used as a value of @b Mmethod parameter of a charset, it means that
1015     the charset is a superset of parent charsets.  The parent charsets
1016     must be given by @b Mparents parameter.  */
1017 
1018 /***ja
1019     @brief �����ѡ����åȷ��Υ᥽�åɤ�������ܥ�.
1020 
1021     ����ܥ� #Msuperset �� <tt>"superset"</tt> �Ȥ���̾���������ʸ�����åȤ�
1022     @b Mmethod �ѥ�᡼�����ͤȤ����Ѥ���줿���ˤϡ�����ʸ�����åȤ��̤�ʸ�����åȡʿ�ʸ�����åȡˤξ�̽���Ǥ��뤳�Ȥ��̣���롣
1023     ��ʸ�����åȤ� @b Mparents �ѥ�᡼���ˤ�ä�Ϳ�����ʤ��ƤϤʤ�ʤ���
1024     */
1025 
MSymbol Msuperset;  /* Mmethod value selecting the "superset" method; see the comment above.  */
1027 /*=*/
1028 /*** @}  */
1029 
1030 /***en
1031     @brief Define a charset.
1032 
1033     The mchar_define_charset () function defines a new charset and
1034     makes it accessible via a symbol whose name is $NAME.  $PLIST
1035     specifies parameters of the charset as below:
1036 
1037     <ul>
1038 
1039     <li> Key is @b Mmethod, value is a symbol.
1040 
    The value specifies the method for decoding/encoding code-points
    in the charset.  It must be #Moffset, #Mmap, #Munify, #Msubset,
    or #Msuperset.  If it is omitted (or #Mnil), the method defaults
    to #Mmap when @b Mmapfile is given, and to #Moffset otherwise.
1044 
1045     <li> Key is @b Mdimension, value is an integer
1046 
1047     The value specifies the dimension of code-points of the charset.
1048     It must be 1 (default), 2, 3, or 4.
1049 
1050     <li> Key is @b Mmin_range, value is an unsigned integer
1051 
1052     The value specifies the minimum range of a code-point, which means
1053     that the Nth byte of the value is the minimum Nth byte of
1054     code-points of the charset.   The default value is 0.
1055 
1056     <li> Key is @b Mmax_range, value is an unsigned integer
1057 
1058     The value specifies the maximum range of a code-point, which means
1059     that the Nth byte of the value is the maximum Nth byte of
1060     code-points of the charset.  The default value is 0xFF, 0xFFFF,
1061     0xFFFFFF, or 0xFFFFFFFF if the dimension is 1, 2, 3, or 4
1062     respectively.
1063 
1064     <li> Key is @b Mmin_code, value is an unsigned integer
1065 
1066     The value specifies the minimum code-point of
1067     the charset.  The default value is the minimum range.
1068 
1069     <li> Key is @b Mmax_code, value is an unsigned integer
1070 
1071     The value specifies the maximum code-point of
1072     the charset.  The default value is the maximum range.
1073 
1074     <li> Key is @b Mascii_compatible, value is a symbol
1075 
1076     The value specifies whether the charset is ASCII compatible or
1077     not.  If the value is #Mnil (default), it is not ASCII
1078     compatible, else compatible.
1079 
1080     <li> Key is @b Mfinal_byte, value is an integer
1081 
1082     The value specifies the @e final @e byte of the charset registered
1083     in The International Registry.  It must be 0 (default) or 32..127.
1084     The value 0 means that the charset is not in the registry.
1085 
1086     <li> Key is @b Mrevision, value is an integer
1087 
1088     The value specifies the @e revision @e number of the charset
1089     registered in The International Registry.  It must be 0..127.  If
1090     the charset is not in The International Registry, the value is
1091     ignored.  The value 0 means that the charset has no revision
1092     number.
1093 
1094     <li> Key is @b Mmin_char, value is an integer
1095 
1096     The value specifies the minimum character code of the charset.
1097     The default value is 0.
1098 
1099     <li> Key is @b Mmapfile, value is an M-text
1100 
    If the method is #Mmap or #Munify, this parameter is required.
    Data containing the mapping information is added to the m17n
    database by calling the function mdatabase_define () with the
    value as the argument $EXTRA_INFO, i.e. the value is used as the
    file name of the data.
1105 
1106     Otherwise, this parameter is ignored.
1107 
1108     <li> Key is @b Mparents, value is a plist
1109 
    If the method is #Msubset, the value must be a plist of length
    1, and the value of the plist must be a symbol representing a
    parent charset.
1113 
1114     If the method is #Msuperset, the value must be a plist of length
1115     less than 9, and the values of the plist must be symbols
1116     representing subset charsets.
1117 
1118     Otherwise, this parameter is ignored.
1119 
1120     <li> Key is @b Mdefine_coding, value is a symbol
1121 
    If the dimension of the charset is 1 and its code-point range is
    0..255, the value specifies whether or not to define a coding
    system of the same name whose type is #Mcharset.  A coding system
    is defined if the value is not #Mnil.
1125 
1126     Otherwise, this parameter is ignored.
1127 
1128     </ul>
1129 
1130     @return
1131     If the operation was successful, mchar_define_charset () returns a
1132     symbol whose name is $NAME.  Otherwise it returns #Mnil and
1133     assigns an error code to the external variable #merror_code.  */
1134 
1135 /***ja
1136     @brief ʸ�����åȤ��������.
1137 
1138     �ؿ� mchar_define_charset () �Ͽ�����ʸ�����åȤ�������������
1139     $NAME �Ȥ���̾������ĥ���ܥ��ͳ�ǥ��������Ǥ���褦�ˤ��롣
1140     $PLIST ����������ʸ�����åȤΥѥ�᡼����ʲ��Τ褦�˻��ꤹ�롣
1141 
1142     <ul>
1143 
1144     <li> ������ @b Mmethod ���ͤ�����ܥ�λ�
1145 
1146     �ͤϡ�#Moffset, #Mmap (�ǥե������), #Munify, #Msubset,
1147     #Msuperset �Τ����줫�Ǥ��ꡢʸ�����åȤΥ����ɥݥ���Ȥ�ǥ����ɡ��������ɤ���ݤΥ᥽�åɤ���ꤹ�롣
1148 
1149     <li> ������ @b Mdimension ���ͤ������ͤλ�
1150 
1151     �ͤϡ�1 (�ǥե������), 2, 3, 4
1152     �Τ����줫�Ǥ��ꡢʸ�����åȤΥ����ɥݥ���Ȥμ����Ǥ��롣
1153 
1154     <li> ������ @b Mmin_range ���ͤ����������ͤλ�
1155 
1156     �ͤϥ����ɥݥ���ȤκǾ����ͤǤ��롣���ʤ���������ͤ� N
1157     ���ܤΥХ��ȤϤ���ʸ�����åȤΥ����ɥݥ���Ȥ� N ���ܤΥХ��ȤκǾ��Τ�ΤȤʤ롣
1158     �ǥե�����ͤ� 0 ��
1159 
1160     <li> ������ @b Mmax_range ���ͤ����������ͤλ�
1161 
1162     �ͤϥ����ɥݥ���Ȥκ�����ͤǤ��롣���ʤ���������ͤ� N
1163     ���ܤΥХ��ȤϤ���ʸ�����åȤΥ����ɥݥ���Ȥ� N ���ܤΥХ��Ȥκ���Τ�ΤȤʤ롣
1164     �ǥե�����ͤϡ������ɥݥ���Ȥμ����� 1, 2, 3, 4 �λ������줾��
1165     0xFF, 0xFFFF, 0xFFFFFF, 0xFFFFFFFF ��
1166 
1167     <li> ������ @b Mmin_code ���ͤ����������ͤλ�
1168 
1169     �ͤϤ���ʸ�����åȤκǾ��Υ����ɥݥ���ȤǤ��롣�ǥե�����ͤ�
1170     @b Mmin_range ���͡�
1171 
1172     <li> ������ @b Mmax_code ���ͤ����������ͤλ�
1173 
1174     �ͤϤ���ʸ�����åȤκ���Υ����ɥݥ���ȤǤ��롣�ǥե�����ͤ�
1175     @b Mmax_range ���͡�
1176 
1177     <li> ������  @b Mascii_compatible ���ͤ�����ܥ�λ�
1178 
1179     �ͤϤ���ʸ�����åȤ� ASCII �ߴ��Ǥ��뤫�ɤ����������ǥե�����ͤ�
1180     #Mnil �Ǥ���иߴ��ǤϤʤ�������ʳ��ξ��ϸߴ��Ǥ��롣
1181 
1182     <li> ������  @b Mfinal_byte ���ͤ������ͤλ�
1183 
1184     �ͤϤ���ʸ�����åȤ� The International Registry ����Ͽ����Ƥ���
1185     @e ��ü�Х��� �Ǥ��ꡢ0 (�ǥե������) �Ǥ��뤫 32..127 �Ǥ��롣0
1186     ����Ͽ����Ƥ��ʤ����Ȥ��̣���롣
1187 
1188     <li> ������  @b Mrevision ���ͤ������ͤλ�
1189 
1190     �ͤ� The International Registry ����Ͽ����Ƥ��� @e revision @e
1191     number �Ǥ��ꡢ0..127 �Ǥ��롣
1192     ʸ�����åȤ���Ͽ����Ƥ��ʤ����ˤϤ����ͤ�̵�뤵��롣
1193     0 �� revision number ��¸�ߤ��ʤ����Ȥ��̣���롣
1194 
1195     <li> ������  @b Mmin_char ���ͤ������ͤλ�
1196 
1197     �ͤϤ���ʸ�����åȤκǾ���ʸ�������ɤǤ��롣�ǥե�����ͤ� 0 ��
1198 
1199     <li> ������ @b Mmapfile ���ͤ� M-text �λ�
1200 
1201     �᥽�åɤ� #Mmap �� #Munify �λ����ؿ� mdatabase_define ()
1202     �����ͤ���� $EXTRA_INFO �Ȥ��ƸƤ֤��Ȥˤ�äơ��ޥåԥ��˴ؤ���ǡ�����
1203     m17n �ǡ����١������ɲä���롣
1204     ���ʤ���������ͤϥǡ����ե������̾���Ǥ��롣
1205 
1206     �����Ǥʤ���С����Υѥ�᡼����̵�뤵��롣
1207 
1208     <li> ������ @b Mparents ���ͤ� plist �λ�
1209 
1210     �᥽�åɤ� #Msubset �ʤ�С��ͤ�Ĺ�� 1 �� plist
1211     �Ǥ��ꡢ�����ͤϤ���ʸ�����åȤξ�̽���Ȥʤ�ʸ�����åȤ�������ܥ�Ǥ��롣
1212 
1213     �᥽�åɤ� #Msuperset �ʤ�С��ͤ�Ĺ�� 8 �ʲ��� plist
1214     �Ǥ��ꡢ�������ͤϤ���ʸ�����åȤβ��̽���Ǥ���ʸ�����åȤ�������ܥ�Ǥ��롣
1215 
1216     �����Ǥʤ���С����Υѥ�᡼����̵�뤵��롣
1217 
1218     <li> ������  @b Mdefine_coding ���ͤ�����ܥ�λ�
1219 
1220     ʸ�����åȤμ����� 1 �ʤ�С��ͤ� #Mnil �ʳ��ξ��� #Mcharset ��
1221     ��Ʊ��̾������ĥ����ɷϤ�������롣
1222 
1223     �����Ǥʤ���С����Υѥ�᡼����̵�뤵��롣
1224 
1225     </ul>
1226 
1227     @return
1228     ��������������С�mchar_define_charset() �� $NAME
1229     �Ȥ���̾���Υ���ܥ���֤��������Ǥʤ���� #Mnil ���֤��������ѿ�
1230     #merror_code �˥��顼�����ɤ����ꤹ�롣*/
1231 
1232 /***
1233     @errors
1234     @c MERROR_CHARSET  */
1235 
1236 MSymbol
mchar_define_charset(const char * name,MPlist * plist)1237 mchar_define_charset (const char *name, MPlist *plist)
1238 {
1239   MSymbol sym = msymbol (name);
1240   MCharset *charset;
1241   int i;
1242   unsigned min_range, max_range;
1243   MPlist *pl;
1244   MText *mapfile = (MText *) mplist_get (plist, Mmapfile);
1245 
1246   MSTRUCT_CALLOC (charset, MERROR_CHARSET);
1247   charset->name = sym;
1248   charset->method = (MSymbol) mplist_get (plist, Mmethod);
1249   if (! charset->method)
1250     {
1251       if (mapfile)
1252 	charset->method = Mmap;
1253       else
1254 	charset->method = Moffset;
1255     }
1256   if (charset->method == Mmap || charset->method == Munify)
1257     {
1258       if (! mapfile)
1259 	MERROR (MERROR_CHARSET, Mnil);
1260       mdatabase_define (Mcharset, sym, Mnil, Mnil, NULL, mapfile->data);
1261     }
1262   if (! (charset->dimension = (int) mplist_get (plist, Mdimension)))
1263     charset->dimension = 1;
1264 
1265   min_range = (unsigned) mplist_get (plist, Mmin_range);
1266   if ((pl = mplist_find_by_key (plist, Mmax_range)))
1267     {
1268       max_range = (unsigned) MPLIST_VAL (pl);
1269       if (max_range >= 0x1000000)
1270 	charset->dimension = 4;
1271       else if (max_range >= 0x10000 && charset->dimension < 3)
1272 	charset->dimension = 3;
1273       else if (max_range >= 0x100 && charset->dimension < 2)
1274 	charset->dimension = 2;
1275     }
1276   else if (charset->dimension == 1)
1277     max_range = 0xFF;
1278   else if (charset->dimension == 2)
1279     max_range = 0xFFFF;
1280   else if (charset->dimension == 3)
1281     max_range = 0xFFFFFF;
1282   else
1283     max_range = 0xFFFFFFFF;
1284 
1285   memset (charset->code_range, 0, sizeof charset->code_range);
1286   for (i = 0; i < charset->dimension; i++, min_range >>= 8, max_range >>= 8)
1287     {
1288       charset->code_range[i * 4] = min_range & 0xFF;
1289       charset->code_range[i * 4 + 1] = max_range & 0xFF;
1290     }
1291   if ((charset->min_code = (int) mplist_get (plist, Mmin_code)) < min_range)
1292     charset->min_code = min_range;
1293   if ((charset->max_code = (int) mplist_get (plist, Mmax_code)) > max_range)
1294     charset->max_code = max_range;
1295   charset->ascii_compatible
1296     = (MSymbol) mplist_get (plist, Mascii_compatible) != Mnil;
1297   charset->final_byte = (int) mplist_get (plist, Mfinal_byte);
1298   charset->revision = (int) mplist_get (plist, Mrevision);
1299   charset->min_char = (int) mplist_get (plist, Mmin_char);
1300   pl = (MPlist *) mplist_get (plist, Mparents);
1301   charset->nparents = pl ? mplist_length (pl) : 0;
1302   if (charset->nparents > 8)
1303     charset->nparents = 8;
1304   for (i = 0; i < charset->nparents; i++, pl = MPLIST_NEXT (pl))
1305     {
1306       MSymbol parent_name;
1307 
1308       if (MPLIST_KEY (pl) != Msymbol)
1309 	MERROR (MERROR_CHARSET, Mnil);
1310       parent_name = MPLIST_SYMBOL (pl);
1311       if (! (charset->parents[i] = MCHARSET (parent_name)))
1312 	MERROR (MERROR_CHARSET, Mnil);
1313     }
1314 
1315   charset->subset_offset = (int) mplist_get (plist, Msubset_offset);
1316 
1317   msymbol_put (sym, Mcharset, charset);
1318   charset = make_charset (charset);
1319   if (! charset)
1320     return Mnil;
1321   msymbol_put (msymbol__canonicalize (sym), Mcharset, charset);
1322 
1323   for (pl = (MPlist *) mplist_get (plist, Maliases);
1324        pl && MPLIST_KEY (pl) == Msymbol;
1325        pl = MPLIST_NEXT (pl))
1326     {
1327       MSymbol alias = MPLIST_SYMBOL (pl);
1328 
1329       msymbol_put (alias, Mcharset, charset);
1330       msymbol_put (msymbol__canonicalize (alias), Mcharset, charset);
1331     }
1332 
1333   if (mplist_get (plist, Mdefine_coding)
1334       && charset->dimension == 1
1335       && charset->code_range[0] == 0 && charset->code_range[1] == 255)
1336     mconv__register_charset_coding (sym);
1337   return (sym);
1338 }
1339 
1340 /*=*/
1341 
1342 /***en
1343     @brief Resolve charset name.
1344 
    The mchar_resolve_charset () function looks up the charset that
    $SYMBOL represents and returns the name of that charset.  If
    $SYMBOL itself does not represent a charset, it is canonicalized
    as a charset name and the lookup is retried with the canonicalized
    symbol.  If no charset is found either way, #Mnil is returned.  */
1349 
1350 /***ja
1351     @brief ʸ�����å�̾���褹��.
1352 
1353     �ؿ� mchar_resolve_charset () �� $SYMBOL
1354     ��ʸ�����åȤ����Ƥ���Ф�����֤���
1355 
1356     �����Ǥʤ���С�$SYMBOL ��ʸ�����å�̾�Ȥ����������������줬ʸ�����åȤ����Ƥ��Ƥ����������������Τ��֤���
1357     �����Ǥʤ���С�#Mnil ���֤��� */
1358 
1359 MSymbol
mchar_resolve_charset(MSymbol symbol)1360 mchar_resolve_charset (MSymbol symbol)
1361 {
1362   MCharset *charset = (MCharset *) msymbol_get (symbol, Mcharset);
1363 
1364   if (! charset)
1365     {
1366       symbol = msymbol__canonicalize (symbol);
1367       charset = (MCharset *) msymbol_get (symbol, Mcharset);
1368     }
1369 
1370   return (charset ? charset->name : Mnil);
1371 }
1372 
1373 /*=*/
1374 
1375 /***en
1376     @brief List symbols representing charsets.
1377 
    The mchar_list_charset () function makes an array of symbols
    representing charsets, stores the pointer to the array in the
    place pointed to by $SYMBOLS, and returns the length of the array.  */
1381 
1382 /***ja
1383     @brief ʸ�����åȤ�ɽ�魯����ܥ�������.
1384 
1385     �ؿ� mchar_list_charsets ()
1386     �ϡ�ʸ�����åȤ�������ܥ���¤٤�������ꡢ$SYMBOLS
1387     �ǥݥ���Ȥ��줿���ˤ�������ؤΥݥ������֤��������Ĺ�����֤��� */
1388 
1389 int
mchar_list_charset(MSymbol ** symbols)1390 mchar_list_charset (MSymbol **symbols)
1391 {
1392   int i;
1393 
1394   MTABLE_MALLOC ((*symbols), charset_list.used, MERROR_CHARSET);
1395   for (i = 0; i < charset_list.used; i++)
1396     (*symbols)[i] = charset_list.charsets[i]->name;
1397   return i;
1398 }
1399 
1400 /*=*/
1401 
1402 /***en
1403     @brief Decode a code-point.
1404 
1405     The mchar_decode () function decodes code-point $CODE in the
1406     charset represented by the symbol $CHARSET_NAME to get a character
1407     code.
1408 
1409     @return
1410     If decoding was successful, mchar_decode () returns the decoded
1411     character code.  Otherwise it returns -1.  */
1412 
1413 /***ja
1414     @brief �����ɥݥ���Ȥ�ǥ����ɤ���.
1415 
1416     �ؿ� mchar_decode () �ϡ�����ܥ� $CHARSET_NAME �Ǽ������ʸ�����å����
1417     $CODE �Ȥ��������ɥݥ���Ȥ�ǥ����ɤ���ʸ�������ɤ����롣
1418 
1419     @return
1420     �ǥ����ɤ���������С�mchar_decode () �ϥǥ����ɤ��줿ʸ�������ɤ��֤���
1421     �����Ǥʤ���� -1 ���֤���  */
1422 
1423 /***
1424     @seealso
1425     mchar_encode ()  */
1426 
1427 int
mchar_decode(MSymbol charset_name,unsigned code)1428 mchar_decode (MSymbol charset_name, unsigned code)
1429 {
1430   MCharset *charset = MCHARSET (charset_name);
1431 
1432   if (! charset)
1433     return MCHAR_INVALID_CODE;
1434   return DECODE_CHAR (charset, code);
1435 }
1436 
1437 /*=*/
1438 
1439 /***en
1440     @brief Encode a character code.
1441 
1442     The mchar_encode () function encodes character code $C to get a
1443     code-point in the charset represented by the symbol $CHARSET_NAME.
1444 
1445     @return
1446     If encoding was successful, mchar_encode () returns the encoded
1447     code-point.  Otherwise it returns #MCHAR_INVALID_CODE.  */
1448 
1449 /***ja
1450     @brief ʸ�������ɤ������ɤ���.
1451 
1452     �ؿ� mchar_encode () �ϡ�ʸ�������� $C �������ɤ��ƥ���ܥ�
1453     $CHARSET_NAME �Ǽ������ʸ�����å���ˤ����륳���ɥݥ���Ȥ����롣
1454 
1455     @return
1456     �������ɤ���������С�mchar_encode () �ϥ����ɤ��줿�����ɥݥ���Ȥ��֤���
1457     �����Ǥʤ���� #MCHAR_INVALID_CODE ���֤���  */
1458 
1459 /***
1460     @seealso
1461     mchar_decode ()  */
1462 
1463 unsigned
mchar_encode(MSymbol charset_name,int c)1464 mchar_encode (MSymbol charset_name, int c)
1465 {
1466   MCharset *charset = MCHARSET (charset_name);
1467 
1468   if (! charset)
1469     return MCHAR_INVALID_CODE;
1470   return ENCODE_CHAR (charset, c);
1471 }
1472 
1473 /*=*/
1474 
1475 /***en
1476     @brief Call a function for all the characters in a specified charset.
1477 
    The mchar_map_charset () function calls $FUNC for all the
    characters in the charset named $CHARSET_NAME.  A call is done for
    a chunk of consecutive characters rather than character by
    character.

    $FUNC receives three arguments: $FROM, $TO, and $ARG.  $FROM and
    $TO specify the range of character codes in $CHARSET_NAME.  $ARG
    is the same as $FUNC_ARG.
1486 
1487     @return
    If the operation was successful, mchar_map_charset () returns 0.
    Otherwise, it returns -1 and assigns an error code to the external
    variable #merror_code.  */
1491 
1492 /***ja
1493     @brief ���ꤷ��ʸ�����åȤΤ��٤Ƥ�ʸ�����Ф��ƴؿ���Ƥ�.
1494 
1495     �ؿ� mcharset_map_chars () �� $CHARSET_NAME
1496     �Ȥ���̾�������ʸ�����å���Τ��٤Ƥ�ʸ�����Ф��� $FUNC ��Ƥ֡�
1497     �ƤӽФ��ϰ�ʸ����ǤϤʤ���Ϣ³����ʸ���ΤޤȤޤ�ñ�̤ǹԤʤ��롣
1498 
1499     �ؿ� $FUNC �ˤ�$FROM, $TO, $ARG �Σ��������Ϥ���롣$FROM �� $TO
1500     �� $CHARSET ���ʸ�������ɤ��ϰϤ���ꤹ�롣$ARG �� $FUNC_ARG
1501     ��Ʊ���Ǥ��롣
1502 
1503     @return
1504     ��������������� mcharset_map_chars () �� 0 ���֤���
1505     �����Ǥʤ���� -1 ���֤��������ѿ� #merror_code �˥��顼�����ɤ����ꤹ�롣  */
1506 
1507 /***
1508     @errors
1509     @c MERROR_CHARSET */
1510 
1511 int
mchar_map_charset(MSymbol charset_name,void (* func)(int from,int to,void * arg),void * func_arg)1512 mchar_map_charset (MSymbol charset_name,
1513 		   void (*func) (int from, int to, void *arg),
1514 		   void *func_arg)
1515 {
1516   MCharset *charset;
1517 
1518   charset = MCHARSET (charset_name);
1519   if (! charset)
1520     MERROR (MERROR_CHARSET, -1);
1521 
1522   if (charset->encoder)
1523     {
1524       int c = charset->min_char;
1525       int next_c;
1526 
1527       if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) < 0)
1528 	c = next_c;
1529       while (c <= charset->max_char)
1530 	{
1531 	  if ((int) mchartable__lookup (charset->encoder, c, &next_c, 1) >= 0)
1532 	    (*func) (c, next_c - 1, func_arg);
1533 	  c = next_c;
1534 	}
1535     }
1536   else
1537     (*func) (charset->min_char, charset->max_char, func_arg);
1538   return 0;
1539 }
1540 
1541 /*=*/
1542 
1543 /*** @} */
1544 
1545 /*
1546   Local Variables:
1547   coding: euc-japan
1548   End:
1549 */
1550