1 /*
2 * char.c - character and character set operations
3 *
4 * Copyright (c) 2000-2020 Shiro Kawai <shiro@acm.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
16 *
17 * 3. Neither the name of the authors nor the names of its contributors
18 * may be used to endorse or promote products derived from this
19 * software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34 #include <ctype.h>
35 #define LIBGAUCHE_BODY
36 #include "gauche.h"
37 #include "gauche/char_attr.h"
38 #include "gauche/priv/charP.h"
39 #include "gauche/priv/vectorP.h"
40
41 static ScmObj predef_sets[SCM_CHAR_SET_NUM_PREDEFINED_SETS];
42 static ScmObj predef_sets_complement[SCM_CHAR_SET_NUM_PREDEFINED_SETS];
43
44 #include "char_attr.c" /* generated tables */
45
46 /*=======================================================================
47 * Character functions
48 */
49
Scm_CharEncodingName(void)50 ScmObj Scm_CharEncodingName(void)
51 {
52 return SCM_INTERN(SCM_CHAR_ENCODING_NAME);
53 }
54
55 /* includes encoding-specific auxiliary functions */
56 #define SCM_CHAR_ENCODING_BODY
57 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP)
58 #include "gauche/char_euc_jp.h"
59 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
60 #include "gauche/char_utf_8.h"
61 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
62 #include "gauche/char_sjis.h"
63 #else
64 #include "gauche/char_none.h"
65 #endif
66
Scm_SupportedCharacterEncodings(void)67 const char **Scm_SupportedCharacterEncodings(void)
68 {
69 return supportedCharacterEncodings;
70 }
71
Scm_SupportedCharacterEncodingP(const char * encoding)72 int Scm_SupportedCharacterEncodingP(const char *encoding)
73 {
74 const char **cs = supportedCharacterEncodings;
75 for (;*cs;cs++) {
76 const char *p = *cs;
77 const char *q = encoding;
78 for (;*p && *q; p++, q++) {
79 if (tolower(*p) != tolower(*q)) break;
80 }
81 if (*p == '\0' && *q == '\0') return TRUE;
82 }
83 return FALSE;
84 }
85
86 /* '0' -> 0, 'a' -> 10, etc.
87 Radix is assumed in the range [2, 36] if non-extended,
88 [2, 10] if extended.
89 'Extended' means we recognize not only ASCII but all Nd characters.
90 */
Scm_DigitToInt(ScmChar ch,int radix,int extended)91 int Scm_DigitToInt(ScmChar ch, int radix, int extended)
92 {
93 if (ch < '0') return -1;
94 if (radix < 2) return -1;
95 if (radix <= 10) {
96 if (ch < '0' + radix) return (ch - '0');
97 } else {
98 if (ch <= '9') return (ch - '0');
99 if (ch < 'A') return -1;
100 if (ch < 'A' + radix - 10) return (ch - 'A' + 10);
101 if (ch < 'a') return -1;
102 if (ch < 'a' + radix - 10) return (ch - 'a' + 10);
103 }
104 if (extended && ch > 0x80 && radix <= 10) {
105 ScmChar ucschar = Scm_CharToUcs(ch);
106 int val = ucs_digit_value(ucschar);
107 if (val < 0 || val >= radix) return -1;
108 return val;
109 } else {
110 return -1;
111 }
112 }
113
Scm_IntToDigit(int n,int radix,int basechar1,int basechar2)114 ScmChar Scm_IntToDigit(int n, int radix, int basechar1, int basechar2)
115 {
116 if (n < 0) return SCM_CHAR_INVALID;
117 if (basechar1 == 0) basechar1 = '0';
118 if (basechar2 == 0) basechar2 = 'a';
119 if (radix <= 10) {
120 if (n < radix) return (ScmChar)(n + basechar1);
121 else return SCM_CHAR_INVALID;
122 } else {
123 if (n < 10) return (ScmChar)(n + basechar1);
124 if (n < radix) return (ScmChar)(n - 10 + basechar2);
125 else return SCM_CHAR_INVALID;
126 }
127 }
128
129 /*
130 * Convert UCS4 code <-> character
131 * If the native encoding is not utf-8, gauche.charconv module is loaded.
132 * and these pointers are filled.
133 */
134 static ScmChar (*ucs2char_hook)(int ucs4) = NULL;
135 static int (*char2ucs_hook)(ScmChar ch) = NULL;
136
137 /* called by gauche.charconv */
Scm__InstallCharconvHooks(ScmChar (* u2c)(int),int (* c2u)(ScmChar))138 void Scm__InstallCharconvHooks(ScmChar (*u2c)(int), int (*c2u)(ScmChar))
139 {
140 ucs2char_hook = u2c;
141 char2ucs_hook = c2u;
142 }
143
144 /* TRANSIENT: These two variables are no longer used, but kept here for the
145 ABI compatibility. Remove them on 1.0 release. */
146 #if GAUCHE_API_VERSION < 1000
147 ScmChar (*Scm_UcsToCharHook)(int ucs4) = NULL;
148 int (*Scm_CharToUcsHook)(ScmChar ch) = NULL;
149 #endif /*GAUCHE_API_VERSION < 1000*/
150
Scm_UcsToChar(int n)151 ScmChar Scm_UcsToChar(int n)
152 {
153 if (n < 0) Scm_Error("bad character code: %d", n);
154 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
155 return (ScmChar)n;
156 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
157 if (n < 0x80) return (ScmChar)n; /*ASCII range*/
158 if (ucs2char_hook == NULL) {
159 /* NB: we don't need mutex here, for the loading of gauche.charconv
160 is serialized in Scm_Require. */
161 Scm_Require(SCM_MAKE_STR("gauche/charconv"),
162 SCM_LOAD_PROPAGATE_ERROR, NULL);
163 if (ucs2char_hook == NULL) {
164 Scm_Error("couldn't autoload gauche.charconv");
165 }
166 }
167 return ucs2char_hook(n);
168 #else
169 /* Encoding == 'none'. It would be safer to reject anything beyond
170 0xff, but it prevents 'none' gosh from reading any source files that
171 have escaped characters in that range, even the section is cond-expanded.
172 That's awfully incovenient, so we use a substitution character '?' here,
173 relying the programmer to properly conditionalize the code.
174 We plan to drop 'none' encoding support in 1.0, so this kludge is
175 just a temporary measure.
176 */
177 if (n < 0x100) return (ScmChar)n; /* ISO8859-1 */
178 else return (ScmChar)'?';
179 #endif
180 }
181
Scm_CharToUcs(ScmChar ch)182 int Scm_CharToUcs(ScmChar ch)
183 {
184 if (ch == SCM_CHAR_INVALID) Scm_Error("bad character");
185 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
186 return (int)ch;
187 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
188 if (ch < 0x80) return (int)ch; /*ASCII range*/
189 if (char2ucs_hook == NULL) {
190 /* NB: we don't need mutex here, for the loading of gauche.charconv
191 is serialized in Scm_Require. */
192 Scm_Require(SCM_MAKE_STR("gauche/charconv"),
193 SCM_LOAD_PROPAGATE_ERROR, NULL);
194 if (char2ucs_hook == NULL) {
195 Scm_Error("couldn't autoload gauche.charconv");
196 }
197 }
198 return char2ucs_hook(ch);
199 #else
200 return (int)ch; /* ISO8859-1 */
201 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
202 }
203
204 /*
205 * Charcter classification for lexical parsing
206 */
207
208 /* Table of initial 128 bytes of ASCII characters to dispatch for
209 special meanings. */
210
211 enum {
212 INITIAL = 1<<0, /* <initial> */
213 SUBSEQUENT = 1<<1, /* <subsequent> */
214 SIGN_SUBSEQUENT = 1<<2, /* <sign subsequent> */
215 DELIMITER = 1<<3, /* <delimiter> */
216 GAUCHE_DELIMITER = 1<<4, /* Gauche-extended delimiter */
217 };
218
219 static const unsigned char ctypes[128] = {
220 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
221 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
222 /*SPC*/ DELIMITER,
223 /* ! */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
224 /* " */ DELIMITER,
225 /* # */ 0,
226 /* $ */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
227 /* % */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
228 /* & */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
229 /* ' */ GAUCHE_DELIMITER,
230 /* ( */ DELIMITER,
231 /* ) */ DELIMITER,
232 /* * */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
233 /* + */ SUBSEQUENT|SIGN_SUBSEQUENT,
234 /* , */ GAUCHE_DELIMITER,
235 /* - */ SUBSEQUENT|SIGN_SUBSEQUENT,
236 /* . */ SUBSEQUENT,
237 /* / */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
238
239 /* 0 */ SUBSEQUENT,
240 /* 1 */ SUBSEQUENT,
241 /* 2 */ SUBSEQUENT,
242 /* 3 */ SUBSEQUENT,
243 /* 4 */ SUBSEQUENT,
244 /* 5 */ SUBSEQUENT,
245 /* 6 */ SUBSEQUENT,
246 /* 7 */ SUBSEQUENT,
247 /* 8 */ SUBSEQUENT,
248 /* 9 */ SUBSEQUENT,
249 /* : */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
250 /* ; */ DELIMITER,
251 /* < */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
252 /* = */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
253 /* > */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
254 /* ? */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
255
256 /* @ */ SUBSEQUENT|SIGN_SUBSEQUENT,
257 /* A */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
258 /* B */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
259 /* C */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
260 /* D */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
261 /* E */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
262 /* F */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
263 /* G */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
264 /* H */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
265 /* I */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
266 /* J */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
267 /* K */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
268 /* L */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
269 /* M */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
270 /* N */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
271 /* O */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
272
273 /* P */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
274 /* Q */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
275 /* R */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
276 /* S */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
277 /* T */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
278 /* U */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
279 /* V */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
280 /* W */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
281 /* X */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
282 /* Y */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
283 /* Z */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
284 /* [ */ GAUCHE_DELIMITER,
285 /* \ */ 0,
286 /* ] */ GAUCHE_DELIMITER,
287 /* ^ */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
288 /* _ */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
289
290 /* ` */ GAUCHE_DELIMITER,
291 /* a */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
292 /* b */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
293 /* c */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
294 /* d */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
295 /* e */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
296 /* f */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
297 /* g */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
298 /* h */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
299 /* i */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
300 /* j */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
301 /* k */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
302 /* l */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
303 /* m */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
304 /* n */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
305 /* o */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
306
307 /* p */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
308 /* q */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
309 /* r */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
310 /* s */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
311 /* t */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
312 /* u */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
313 /* v */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
314 /* w */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
315 /* x */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
316 /* y */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
317 /* z */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
318 /* { */ GAUCHE_DELIMITER,
319 /* | */ DELIMITER,
320 /* } */ GAUCHE_DELIMITER,
321 /* ~ */ INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
322 /*DEL*/ 0,
323 };
324
Scm_CharLexerCategoryP(ScmChar c,ScmCharLexerCategory category)325 int Scm_CharLexerCategoryP(ScmChar c, ScmCharLexerCategory category)
326 {
327 if (c < 128) {
328 switch (category) {
329 case SCM_CHAR_INITIAL:
330 return !!(ctypes[c] & INITIAL);
331 case SCM_CHAR_SUBSEQUENT:
332 return !!(ctypes[c] & SUBSEQUENT);
333 case SCM_CHAR_SIGN_SUBSEQUENT:
334 return !!(ctypes[c] & SIGN_SUBSEQUENT);
335 }
336 }
337 if (c == 0x200c || c == 0x200d) {
338 /* can be INITIAL, SUBSEQUENT and SIGN_SUBSEQUENT */
339 return TRUE;
340 }
341 switch (Scm_CharGeneralCategory(c)) {
342 case SCM_CHAR_CATEGORY_Lu:
343 case SCM_CHAR_CATEGORY_Ll:
344 case SCM_CHAR_CATEGORY_Lt:
345 case SCM_CHAR_CATEGORY_Lm:
346 case SCM_CHAR_CATEGORY_Lo:
347 case SCM_CHAR_CATEGORY_Mn:
348 case SCM_CHAR_CATEGORY_Nl:
349 case SCM_CHAR_CATEGORY_No:
350 case SCM_CHAR_CATEGORY_Pd:
351 case SCM_CHAR_CATEGORY_Pc:
352 case SCM_CHAR_CATEGORY_Po:
353 case SCM_CHAR_CATEGORY_Sc:
354 case SCM_CHAR_CATEGORY_Sm:
355 case SCM_CHAR_CATEGORY_Sk:
356 case SCM_CHAR_CATEGORY_So:
357 case SCM_CHAR_CATEGORY_Co:
358 /* can be INITIAL, SUBSEQUENT and SIGN_SUBSEQUENT */
359 return TRUE;
360 case SCM_CHAR_CATEGORY_Nd:
361 case SCM_CHAR_CATEGORY_Mc:
362 case SCM_CHAR_CATEGORY_Me:
363 return (category != SCM_CHAR_INITIAL);
364 default: return FALSE;
365 }
366 }
367
368
369 /*=======================================================================
370 * Character set (cf. SRFI-14)
371 */
372 /* NB: operations on charset are not very optimized, for I don't see
373 * the immediate needs to do so, except Scm_CharSetContains.
374 */
375
376 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext*);
377 static int charset_compare(ScmObj x, ScmObj y, int equalp);
378 SCM_DEFINE_BUILTIN_CLASS(Scm_CharSetClass,
379 charset_print, charset_compare, NULL, NULL,
380 SCM_CLASS_COLLECTION_CPL);
381
382 #define MASK_ISSET(cs, ch) SCM_BITS_TEST(cs->small, ch)
383 #define MASK_SET(cs, ch) SCM_BITS_SET(cs->small, ch)
384 #define MASK_RESET(cs, ch) SCM_BITS_RESET(cs->small, ch)
385
check_mutable(ScmCharSet * cs)386 static inline void check_mutable(ScmCharSet *cs)
387 {
388 if (SCM_CHAR_SET_IMMUTABLE_P(cs))
389 Scm_Error("Char set is immutable: %S", cs);
390 }
391
set_large(ScmCharSet * cs,int flag)392 static inline void set_large(ScmCharSet *cs, int flag)
393 {
394 if (flag) {
395 cs->flags |= SCM_CHAR_SET_LARGE;
396 } else {
397 cs->flags &= ~(SCM_CHAR_SET_LARGE);
398 }
399 }
400
401 /*----------------------------------------------------------------------
402 * Printer
403 */
404
charset_print_ch(ScmPort * out,ScmChar ch,int firstp)405 static void charset_print_ch(ScmPort *out, ScmChar ch, int firstp)
406 {
407 if (ch != 0 && ch < 0x80
408 && (strchr("[]-\\", ch) != NULL || (ch == '^' && firstp))) {
409 Scm_Printf(out, "\\%C", ch);
410 } else {
411 switch (Scm_CharGeneralCategory(ch)) {
412 case SCM_CHAR_CATEGORY_Mn:
413 case SCM_CHAR_CATEGORY_Mc:
414 case SCM_CHAR_CATEGORY_Me:
415 case SCM_CHAR_CATEGORY_Cc:
416 case SCM_CHAR_CATEGORY_Cf:
417 case SCM_CHAR_CATEGORY_Cs:
418 case SCM_CHAR_CATEGORY_Co:
419 case SCM_CHAR_CATEGORY_Cn:
420 if (ch < 0x10000) Scm_Printf(out, "\\u%04lx", ch);
421 else Scm_Printf(out, "\\U%08lx", ch);
422 break;
423 default:
424 Scm_Putc(ch, out);
425 }
426 }
427 }
428
charset_print(ScmObj obj,ScmPort * out,ScmWriteContext * ctx SCM_UNUSED)429 static void charset_print(ScmObj obj, ScmPort *out,
430 ScmWriteContext *ctx SCM_UNUSED)
431 {
432 int prev, code, first = TRUE;
433 ScmCharSet *cs = SCM_CHAR_SET(obj);
434
435 Scm_Printf(out, "#[");
436 for (prev = -1, code = 0; code < SCM_CHAR_SET_SMALL_CHARS; code++) {
437 if (MASK_ISSET(cs, code) && prev < 0) {
438 charset_print_ch(out, code, first);
439 prev = code;
440 first = FALSE;
441 }
442 else if (!MASK_ISSET(cs, code) && prev >= 0) {
443 if (code - prev > 1) {
444 if (code - prev > 2) Scm_Printf(out, "-");
445 charset_print_ch(out, code-1, FALSE);
446 }
447 prev = -1;
448 }
449 }
450 if (prev >= 0) {
451 if (code - prev > 1) {
452 if (prev < 0x7e) Scm_Printf(out, "-");
453 charset_print_ch(out, code-1, FALSE);
454 }
455 }
456
457 if (cs->flags & SCM_CHAR_SET_IMMUTABLE) {
458 const uint32_t *v = cs->large.frozen.vec;
459 for (ScmSize i = 0; i < cs->large.frozen.size; i += 2) {
460 charset_print_ch(out, (int)v[i], FALSE);
461 if (v[i] != v[i+1]) {
462 if (v[i+1] - v[i] > 2) Scm_Printf(out, "-");
463 charset_print_ch(out, (int)v[i+1], FALSE);
464 }
465 }
466 } else {
467 ScmTreeIter iter;
468 ScmDictEntry *e;
469 Scm_TreeIterInit(&iter, &cs->large.tree, NULL);
470 while ((e = Scm_TreeIterNext(&iter)) != NULL) {
471 charset_print_ch(out, (int)e->key, FALSE);
472 if (e->value != e->key) {
473 if (e->value - e->key > 2) Scm_Printf(out, "-");
474 charset_print_ch(out, (int)e->value, FALSE);
475 }
476 }
477 }
478 Scm_Printf(out, "]");
479 }
480
481 /*-----------------------------------------------------------------
482 * Iterators for large char set
483 */
484 typedef struct cs_iter_rec {
485 ScmCharSet *cs;
486 int end;
487 union {
488 ScmTreeIter ti;
489 ScmSize vi;
490 } iter;
491 } cs_iter;
492
cs_iter_init(cs_iter * ci,ScmCharSet * cs)493 static void cs_iter_init(cs_iter *ci, ScmCharSet *cs)
494 {
495 ci->cs = cs;
496 if (!SCM_CHAR_SET_LARGE_P(cs)) {
497 ci->end = TRUE;
498 } else {
499 ci->end = FALSE;
500 if (SCM_CHAR_SET_IMMUTABLE_P(cs)) {
501 ci->iter.vi = 0;
502 } else {
503 Scm_TreeIterInit(&ci->iter.ti, &cs->large.tree, NULL);
504 }
505 }
506 }
507
508 /* returns FALSE if already exhausted */
cs_iter_next(cs_iter * ci,ScmChar * from,ScmChar * to)509 static int cs_iter_next(cs_iter *ci,
510 ScmChar *from /*out*/,
511 ScmChar *to /*out*/)
512 {
513 if (ci->end) return FALSE;
514 if (SCM_CHAR_SET_IMMUTABLE_P(ci->cs)) {
515 if (ci->iter.vi >= ci->cs->large.frozen.size) {
516 ci->end = TRUE;
517 return FALSE;
518 } else {
519 *from = (ScmChar)ci->cs->large.frozen.vec[ci->iter.vi];
520 *to = (ScmChar)ci->cs->large.frozen.vec[ci->iter.vi+1];
521 ci->iter.vi += 2;
522 return TRUE;
523 }
524 } else {
525 ScmDictEntry *e = Scm_TreeIterNext(&ci->iter.ti);
526 if (e == NULL) {
527 ci->end = TRUE;
528 return FALSE;
529 } else {
530 *from = (ScmChar)e->key;
531 *to = (ScmChar)e->value;
532 return TRUE;
533 }
534 }
535 }
536
537 /*-----------------------------------------------------------------
538 * Constructors
539 */
cmp(ScmTreeCore * tc SCM_UNUSED,intptr_t a,intptr_t b)540 static int cmp(ScmTreeCore *tc SCM_UNUSED, intptr_t a, intptr_t b)
541 {
542 if (a > b) return 1;
543 if (a < b) return -1;
544 return 0;
545 }
546
make_charset(void)547 static ScmCharSet *make_charset(void)
548 {
549 ScmCharSet *cs = SCM_NEW(ScmCharSet);
550 SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
551 Scm_BitsFill(cs->small, 0, SCM_CHAR_SET_SMALL_CHARS, 0);
552 Scm_TreeCoreInit(&cs->large.tree, cmp, NULL);
553 cs->flags = 0; /* small & mutable by default */
554 return cs;
555 }
556
Scm_MakeEmptyCharSet(void)557 ScmObj Scm_MakeEmptyCharSet(void)
558 {
559 return SCM_OBJ(make_charset());
560 }
561
562 /* This is mainly for precompiled module. */
Scm_MakeImmutableCharSet(const ScmBits * small,const uint32_t * vec,size_t size)563 ScmObj Scm_MakeImmutableCharSet(const ScmBits *small,
564 const uint32_t *vec,
565 size_t size)
566 {
567 SCM_ASSERT(size % 2 == 0);
568 ScmCharSet *cs = SCM_NEW(ScmCharSet);
569 SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
570 cs->flags |= SCM_CHAR_SET_IMMUTABLE;
571 memcpy(cs->small, small, sizeof(cs->small));
572 if (vec != NULL && size > 0) {
573 set_large(cs, TRUE);
574 if ((cs->large.frozen.size = size) == 2) {
575 cs->large.frozen.ivec[0] = vec[0];
576 cs->large.frozen.ivec[1] = vec[1];
577 cs->large.frozen.vec = cs->large.frozen.ivec;
578 } else {
579 cs->large.frozen.vec = vec;
580 }
581 }
582 return SCM_OBJ(cs);
583 }
584
Scm_CharSetCopy(ScmCharSet * src)585 ScmObj Scm_CharSetCopy(ScmCharSet *src)
586 {
587 ScmCharSet *dst = make_charset();
588 Scm_BitsCopyX(dst->small, 0, src->small, 0, SCM_CHAR_SET_SMALL_CHARS);
589 set_large(dst, SCM_CHAR_SET_LARGE_P(src));
590 if (SCM_CHAR_SET_IMMUTABLE_P(src)) {
591 /* The destination is mutable */
592 const uint32_t *vec = src->large.frozen.vec;
593 for (ScmSize k = 0; k < src->large.frozen.size; k += 2) {
594 ScmDictEntry *e = Scm_TreeCoreSearch(&dst->large.tree,
595 vec[k], SCM_DICT_CREATE);
596 e->value = vec[k+1];
597 }
598 } else {
599 Scm_TreeCoreCopy(&dst->large.tree, &src->large.tree);
600 }
601 return SCM_OBJ(dst);
602 }
603
604 /* Creates flat searched vector to be used for immutable charset.
605 SRC must be a mutable charset.
606 The caller must provide uint32_t[2] buffer for ivec. */
char_set_freeze_vec(ScmCharSet * src,uint32_t * ivec,ScmSize * size)607 static uint32_t *char_set_freeze_vec(ScmCharSet *src,
608 uint32_t *ivec,
609 ScmSize *size /*out*/)
610 {
611 SCM_ASSERT(!SCM_CHAR_SET_IMMUTABLE_P(src));
612 size_t s = (size_t)Scm_TreeCoreNumEntries(&src->large.tree) * 2;
613 uint32_t *v = (s == 2)? ivec : SCM_NEW_ATOMIC_ARRAY(uint32_t, s);
614
615 cs_iter iter;
616 cs_iter_init(&iter, src);
617 ScmChar lo, hi;
618 for (size_t k = 0; cs_iter_next(&iter, &lo, &hi); k += 2) {
619 SCM_ASSERT(k < s);
620 v[k] = (uint32_t)lo;
621 v[k+1] = (uint32_t)hi;
622 }
623 *size = s;
624 return v;
625 }
626
Scm_CharSetFreeze(ScmCharSet * src)627 ScmObj Scm_CharSetFreeze(ScmCharSet *src)
628 {
629 if (SCM_CHAR_SET_IMMUTABLE_P(src)) return SCM_OBJ(src);
630 ScmCharSet *dst = make_charset();
631 Scm_BitsCopyX(dst->small, 0, src->small, 0, SCM_CHAR_SET_SMALL_CHARS);
632
633 dst->flags |= SCM_CHAR_SET_IMMUTABLE;
634 if (SCM_CHAR_SET_LARGE_P(src)) {
635 set_large(dst, TRUE);
636 dst->large.frozen.vec = char_set_freeze_vec(src,
637 dst->large.frozen.ivec,
638 &dst->large.frozen.size);
639 } else {
640 dst->large.frozen.vec = NULL;
641 dst->large.frozen.size = 0;
642 }
643 return SCM_OBJ(dst);
644 }
645
Scm_CharSetFreezeX(ScmCharSet * src)646 ScmObj Scm_CharSetFreezeX(ScmCharSet *src)
647 {
648 if (SCM_CHAR_SET_IMMUTABLE_P(src)) return SCM_OBJ(src);
649 if (SCM_CHAR_SET_LARGE_P(src)) {
650 ScmSize s;
651 uint32_t iv[2];
652 uint32_t *v = char_set_freeze_vec(src, iv, &s);
653 src->large.frozen.size = s;
654 if (s == 2) {
655 src->large.frozen.vec = src->large.frozen.ivec;
656 src->large.frozen.ivec[0] = iv[0];
657 src->large.frozen.ivec[1] = iv[1];
658 } else {
659 src->large.frozen.vec = v;
660 }
661 }
662 src->flags |= SCM_CHAR_SET_IMMUTABLE;
663 return SCM_OBJ(src);
664 }
665
666 /*-----------------------------------------------------------------
667 * Comparison
668 */
charset_compare(ScmObj x,ScmObj y,int equalp)669 static int charset_compare(ScmObj x, ScmObj y, int equalp)
670 {
671 ScmCharSet *xx = SCM_CHAR_SET(x);
672 ScmCharSet *yy = SCM_CHAR_SET(y);
673
674 if (equalp) {
675 return (Scm_CharSetEq(xx, yy)? 0 : 1);
676 } else {
677 if (Scm_CharSetEq(xx, yy)) return 0;
678 if (Scm_CharSetLE(xx, yy)) return -1;
679 if (Scm_CharSetLE(yy, xx)) return 1;
680 Scm_Error("cannot compare char-sets: %S vs %S", x, y);
681 return 0; /* dummy */
682 }
683 }
684
Scm_CharSetEq(ScmCharSet * x,ScmCharSet * y)685 int Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y)
686 {
687 if (!Scm_BitsEqual(x->small, y->small, 0, SCM_CHAR_SET_SMALL_CHARS))
688 return FALSE;
689 if (!SCM_CHAR_SET_IMMUTABLE_P(x) && !SCM_CHAR_SET_IMMUTABLE_P(y)) {
690 /* shortcut */
691 return Scm_TreeCoreEq(&x->large.tree, &y->large.tree);
692 } else {
693 cs_iter xi, yi;
694 cs_iter_init(&xi, x);
695 cs_iter_init(&yi, y);
696 for (;;) {
697 ScmChar xl, xh, yl, yh;
698 int xr = cs_iter_next(&xi, &xl, &xh);
699 int yr = cs_iter_next(&yi, &yl, &yh);
700 if (xr == FALSE && yr == FALSE) return TRUE;
701 if (!(xr && yr)) return FALSE;
702 if (!(xl == yl && xh == yh)) return FALSE;
703 }
704 }
705 }
706
707 /* See if cs contains the range [lo,hi] in large char range. */
cs_contains_range(ScmCharSet * s,ScmChar lo,ScmChar hi)708 static int cs_contains_range(ScmCharSet *s, ScmChar lo, ScmChar hi)
709 {
710 if (!SCM_CHAR_SET_LARGE_P(s)) return FALSE;
711 /* We can have two cases.
712 *
713 * Case 1:
714 * lo<---------->hi
715 * ye<----------------->
716 * Case 2:
717 * lo<---------->hi
718 * yl<------------------->
719 */
720 if (SCM_CHAR_SET_IMMUTABLE_P(s)) {
721 size_t ye, yl;
722 ye = Scm_BinarySearchU32(s->large.frozen.vec, s->large.frozen.size,
723 (uint32_t)lo, 1, &yl, NULL);
724 if (ye != (size_t)-1) { /* case 1 */
725 if (s->large.frozen.vec[ye+1] < (unsigned)hi) return FALSE;
726 } else if (yl != (size_t)-1) { /* case 2 */
727 if (s->large.frozen.vec[yl+1] < (unsigned)hi) return FALSE;
728 } else {
729 return FALSE;
730 }
731 } else {
732 ScmDictEntry *ye, *yl, *yh;
733 ye = Scm_TreeCoreClosestEntries(&s->large.tree, lo, &yl, &yh);
734 if (ye) { /* case 1 */
735 if (ye->value < hi) return FALSE;
736 } else if (yl) { /* case 2 */
737 if (yl->value < hi) return FALSE;
738 } else {
739 return FALSE;
740 }
741 }
742 return TRUE;
743 }
744
745 /* whether x <= y */
Scm_CharSetLE(ScmCharSet * x,ScmCharSet * y)746 int Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y)
747 {
748 if (!Scm_BitsIncludes(y->small, x->small, 0, SCM_CHAR_SET_SMALL_CHARS))
749 return FALSE;
750
751 cs_iter xi;
752 cs_iter_init(&xi, x);
753 ScmChar lo, hi;
754 while (cs_iter_next(&xi, &lo, &hi)) {
755 if (!cs_contains_range(y, lo, hi)) return FALSE;
756 }
757 return TRUE;
758 }
759
760 /*-----------------------------------------------------------------
761 * Modification
762 * We reject immutable set at the top, so that we only deal with treemap.
763 */
764
Scm_CharSetAddRange(ScmCharSet * cs,ScmChar from,ScmChar to)765 ScmObj Scm_CharSetAddRange(ScmCharSet *cs, ScmChar from, ScmChar to)
766 {
767 check_mutable(cs);
768
769 ScmDictEntry *e, *lo, *hi;
770
771 if (to < from) return SCM_OBJ(cs);
772 if (from < SCM_CHAR_SET_SMALL_CHARS) {
773 if (to < SCM_CHAR_SET_SMALL_CHARS) {
774 Scm_BitsFill(cs->small, (int)from, (int)to+1, TRUE);
775 return SCM_OBJ(cs);
776 }
777 Scm_BitsFill(cs->small, (int)from, SCM_CHAR_SET_SMALL_CHARS, TRUE);
778 from = SCM_CHAR_SET_SMALL_CHARS;
779 }
780
781 set_large(cs, TRUE);
782
783 /* Let e have the lower bound. */
784 e = Scm_TreeCoreClosestEntries(&cs->large.tree, from, &lo, &hi);
785 if (!e) {
786 if (!lo || lo->value < from-1) {
787 e = Scm_TreeCoreSearch(&cs->large.tree, from, SCM_DICT_CREATE);
788 } else {
789 e = lo;
790 }
791 }
792 /* Set up the upper bound.
793 NB: if e is a new entry, e->value is 0. */
794 if (e->value >= to) return SCM_OBJ(cs);
795
796 hi = e;
797 while ((hi = Scm_TreeCoreNextEntry(&cs->large.tree, hi->key)) != NULL) {
798 if (hi->key > to+1) {
799 e->value = to;
800 return SCM_OBJ(cs);
801 }
802 Scm_TreeCoreSearch(&cs->large.tree, hi->key, SCM_DICT_DELETE);
803 if (hi->value > to) {
804 e->value = hi->value;
805 return SCM_OBJ(cs);
806 }
807 }
808 e->value = to;
809 return SCM_OBJ(cs);
810 }
811
Scm_CharSetAdd(ScmCharSet * dst,ScmCharSet * src)812 ScmObj Scm_CharSetAdd(ScmCharSet *dst, ScmCharSet *src)
813 {
814 check_mutable(dst);
815
816 if (dst == src) return SCM_OBJ(dst); /* precaution */
817
818 if (SCM_CHAR_SET_LARGE_P(src)) {
819 set_large(dst, TRUE);
820 }
821
822 ScmTreeIter iter;
823 ScmDictEntry *e;
824 Scm_BitsOperate(dst->small, SCM_BIT_IOR, dst->small, src->small,
825 0, SCM_CHAR_SET_SMALL_CHARS);
826 if (SCM_CHAR_SET_IMMUTABLE_P(src)) {
827 ScmSize k;
828 for (k = 0; k < src->large.frozen.size; k += 2) {
829 Scm_CharSetAddRange(dst,
830 SCM_CHAR(src->large.frozen.vec[k]),
831 SCM_CHAR(src->large.frozen.vec[k+1]));
832 }
833 } else {
834 Scm_TreeIterInit(&iter, &src->large.tree, NULL);
835 while ((e = Scm_TreeIterNext(&iter)) != NULL) {
836 Scm_CharSetAddRange(dst, SCM_CHAR(e->key), SCM_CHAR(e->value));
837 }
838 }
839 return SCM_OBJ(dst);
840 }
841
Scm_CharSetComplement(ScmCharSet * cs)842 ScmObj Scm_CharSetComplement(ScmCharSet *cs)
843 {
844 check_mutable(cs);
845
846 ScmDictEntry *e, *n;
847
848 Scm_BitsOperate(cs->small, SCM_BIT_NOT1, cs->small, NULL,
849 0, SCM_CHAR_SET_SMALL_CHARS);
850 int last = SCM_CHAR_SET_SMALL_CHARS-1;
851 int largep = FALSE;
852 /* we can't use treeiter, since we modify the tree while traversing it. */
853 while ((e = Scm_TreeCoreNextEntry(&cs->large.tree, last)) != NULL) {
854 Scm_TreeCoreSearch(&cs->large.tree, e->key, SCM_DICT_DELETE);
855 if (last < e->key-1) {
856 n = Scm_TreeCoreSearch(&cs->large.tree, last+1, SCM_DICT_CREATE);
857 n->value = e->key-1;
858 largep = TRUE;
859 }
860 last = (int)e->value;
861 }
862 if (last < SCM_CHAR_MAX) {
863 n = Scm_TreeCoreSearch(&cs->large.tree, last+1, SCM_DICT_CREATE);
864 n->value = SCM_CHAR_MAX;
865 largep = TRUE;
866 }
867 set_large(cs, largep);
868 return SCM_OBJ(cs);
869 }
870
871 /* Make CS case-insensitive. */
Scm_CharSetCaseFold(ScmCharSet * cs)872 ScmObj Scm_CharSetCaseFold(ScmCharSet *cs)
873 {
874 check_mutable(cs);
875
876 for (int ch='a'; ch<='z'; ch++) {
877 if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) {
878 MASK_SET(cs, ch);
879 MASK_SET(cs, (ch-('a'-'A')));
880 }
881 }
882
883 ScmTreeIter iter;
884 ScmDictEntry *e;
885 Scm_TreeIterInit(&iter, &cs->large.tree, NULL);
886 while ((e = Scm_TreeIterNext(&iter)) != NULL) {
887 for (ScmChar c = e->key; c <= e->value; c++) {
888 ScmChar uch = Scm_CharUpcase(c);
889 ScmChar lch = Scm_CharDowncase(c);
890 Scm_CharSetAddRange(cs, uch, uch);
891 Scm_CharSetAddRange(cs, lch, lch);
892 }
893 }
894 return SCM_OBJ(cs);
895 }
896
897 /*-----------------------------------------------------------------
898 * Query
899 */
900
Scm_CharSetContains(ScmCharSet * cs,ScmChar c)901 int Scm_CharSetContains(ScmCharSet *cs, ScmChar c)
902 {
903 if (c < 0) return FALSE;
904 if (c < SCM_CHAR_SET_SMALL_CHARS) return MASK_ISSET(cs, c);
905 else if (SCM_CHAR_SET_IMMUTABLE_P(cs)) {
906 if (cs->large.frozen.size == 2) {
907 /* shortcut */
908 return (c >= (ScmChar)cs->large.frozen.ivec[0]
909 && c <= (ScmChar)cs->large.frozen.ivec[1]);
910 } else {
911 size_t lo;
912 size_t k = Scm_BinarySearchU32(cs->large.frozen.vec,
913 cs->large.frozen.size,
914 (uint32_t)c,
915 1, &lo, NULL);
916 if ((k != (size_t)-1)
917 || (lo != (size_t)-1 && (unsigned)c <= cs->large.frozen.vec[lo+1]))
918 return TRUE;
919 else
920 return FALSE;
921 }
922 } else {
923 ScmDictEntry *e, *l, *h;
924 e = Scm_TreeCoreClosestEntries(&cs->large.tree, (int)c, &l, &h);
925 if (e || (l && l->value >= c)) return TRUE;
926 else return FALSE;
927 }
928 }
929
930 /*-----------------------------------------------------------------
931 * Inspection
932 */
933
934 /* returns a list of ranges contained in the charset */
Scm_CharSetRanges(ScmCharSet * cs)935 ScmObj Scm_CharSetRanges(ScmCharSet *cs)
936 {
937 ScmObj h = SCM_NIL, t = SCM_NIL;
938 int ind, begin = 0, prev = FALSE;
939
940 for (ind = 0; ind < SCM_CHAR_SET_SMALL_CHARS; ind++) {
941 int bit = MASK_ISSET(cs, ind);
942 if (!prev && bit) begin = ind;
943 if (prev && !bit) {
944 ScmObj cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
945 SCM_APPEND1(h, t, cell);
946 }
947 prev = bit;
948 }
949 if (prev) {
950 ScmObj cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
951 SCM_APPEND1(h, t, cell);
952 }
953
954 cs_iter iter;
955 cs_iter_init(&iter, cs);
956 ScmChar lo, hi;
957 while (cs_iter_next(&iter, &lo, &hi)) {
958 ScmObj cell = Scm_Cons(SCM_MAKE_INT(lo), SCM_MAKE_INT(hi));
959 SCM_APPEND1(h, t, cell);
960 }
961 return h;
962 }
963
Scm_CharSetDump(ScmCharSet * cs,ScmPort * port)964 void Scm_CharSetDump(ScmCharSet *cs, ScmPort *port)
965 {
966 Scm_Printf(port, "CharSet %p%s\nmask:",
967 cs,
968 SCM_CHAR_SET_IMMUTABLE_P(cs) ? " (frozen)" : "");
969 for (int i=0; i<SCM_BITS_NUM_WORDS(SCM_CHAR_SET_SMALL_CHARS); i++) {
970 #if SIZEOF_LONG == 4
971 Scm_Printf(port, "[%08lx]", cs->small[i]);
972 #else
973 Scm_Printf(port, "[%016lx]", cs->small[i]);
974 #endif
975 }
976 Scm_Printf(port, "\nranges:");
977 cs_iter iter;
978 cs_iter_init(&iter, cs);
979 ScmChar lo, hi;
980 while (cs_iter_next(&iter, &lo, &hi)) {
981 Scm_Printf(port, " %x-%x", lo, hi);
982 }
983 Scm_Printf(port, "\n");
984 }
985
986 /*-----------------------------------------------------------------
987 * Reader
988 */
989
990 /* Parse regexp-style character set specification (e.g. [a-zA-Z]).
991 Assumes the opening bracket is already read.
992 Always return a fresh charset, that can be modified afterwards.
993
994 If the input syntax is invalid, either signals an error or returns
995 #f, depending error_p flag.
996
997 If bracket_syntax is TRUE, the first closing bracket ']' in the
998 charset (except the complimenting caret) is taken as a literal
999 character, instead of terminating the charset. It should be TRUE
1000 during reading the regexp syntax for compatibility to POSIX regexp.
1001
1002 If complement_p is not NULL, the location get a boolean value of
1003 whether complement character (caret in the beginning) appeared or not.
1004 In that case, the returned charset is not complemented. */
1005
1006 static ScmObj read_predef_charset(ScmPort*, int);
1007
Scm_CharSetRead(ScmPort * input,int * complement_p,int error_p,int bracket_syntax)1008 ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p,
1009 int error_p, int bracket_syntax)
1010 {
1011 #define REAL_BEGIN 1
1012 #define CARET_BEGIN 2
1013 ScmCharSet *set = make_charset();
1014 int begin = REAL_BEGIN;
1015 int complement = FALSE; /* Flag for the initial ^ */
1016 int inrange = FALSE; /* The range '-' is being read */
1017 ScmChar lastchar = SCM_CHAR_INVALID; /* The char before '-' range */
1018 const char *prefetched = NULL; /* \x notation requires lookahead. After
1019 it reads extra characters, this points
1020 to them. The pointed string is guaranteed
1021 to have only hexadecimal characters. */
1022 ScmDString buf; /* Save read characters for error message */
1023 Scm_DStringInit(&buf);
1024 int ch;
1025
1026 for (;;) {
1027 if (prefetched) {
1028 ch = *prefetched++;
1029 if (*prefetched == '\0') prefetched = NULL;
1030 } else {
1031 ch = Scm_Getc(input);
1032 if (ch == EOF) goto err;
1033 }
1034
1035 Scm_DStringPutc(&buf, ch);
1036
1037 if (begin == REAL_BEGIN && ch == '^') {
1038 complement = TRUE;
1039 begin = CARET_BEGIN;
1040 continue;
1041 }
1042
1043 ScmObj moreset;
1044 switch (ch) {
1045 case '^':
1046 if (begin == REAL_BEGIN) {
1047 complement = TRUE;
1048 begin = CARET_BEGIN;
1049 continue;
1050 } else {
1051 goto ordchar;
1052 }
1053 case ']':
1054 if (begin && bracket_syntax) goto ordchar;
1055 else break;
1056 case '-':
1057 if (begin || inrange) goto ordchar;
1058 inrange = TRUE;
1059 begin = FALSE;
1060 continue;
1061 case '\\':
1062 ch = Scm_Getc(input);
1063 if (ch == EOF) goto err;
1064 Scm_DStringPutc(&buf, ch);
1065 switch (ch) {
1066 case 'a': ch = 7; goto ordchar;
1067 case 'b': ch = 8; goto ordchar;
1068 case 'n': ch = '\n'; goto ordchar;
1069 case 'r': ch = '\r'; goto ordchar;
1070 case 't': ch = '\t'; goto ordchar;
1071 case 'f': ch = '\f'; goto ordchar;
1072 case 'e': ch = 0x1b; goto ordchar;
1073 case 'x': case 'u': case 'U': {
1074 ScmDString xbuf;
1075 Scm_DStringInit(&xbuf);
1076 ScmObj mode = Scm_GetPortReaderLexicalMode(input);
1077 ScmObj z = Scm_ReadXdigitsFromPort(input, ch, mode, FALSE, &xbuf);
1078 if (SCM_STRINGP(z)) {
1079 /* parse failure. z contains the prefetched string */
1080 Scm_DStringAdd(&buf, SCM_STRING(z));
1081 goto err;
1082 }
1083 /* xbuf contains the character that was hex-encoded,
1084 plus any hex digits that are prefetched. */
1085 Scm_DStringPutc(&xbuf, '\0');
1086 const char *cp = Scm_DStringPeek(&xbuf, NULL, NULL);
1087 SCM_CHAR_GET(cp, ch);
1088 cp += SCM_CHAR_NFOLLOWS(*cp)+1;
1089 if (*cp != '\0') {
1090 prefetched = cp;
1091 }
1092 goto ordchar;
1093 }
1094 case 'd':
1095 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_DIGIT);
1096 goto addset;
1097 case 'D':
1098 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_DIGIT);
1099 goto addset;
1100 case 's':
1101 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WHITESPACE);
1102 goto addset;
1103 case 'S':
1104 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WHITESPACE);
1105 goto addset;
1106 case 'w':
1107 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WORD);
1108 goto addset;
1109 case 'W':
1110 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WORD);
1111 goto addset;
1112 case 'p': case 'P':
1113 moreset = Scm_GetStandardCharSet(Scm_CharSetParseCategory(input, ch));
1114 goto addset;
1115 default:
1116 goto ordchar;
1117 }
1118 case '[':
1119 moreset = read_predef_charset(input, error_p);
1120 if (!SCM_CHAR_SET_P(moreset)) goto err;
1121 addset:
1122 Scm_CharSetAdd(set, SCM_CHAR_SET(moreset));
1123 begin = FALSE;
1124 continue;
1125 ordchar:
1126 default:
1127 if (inrange) {
1128 if (lastchar < 0) {
1129 Scm_CharSetAddRange(set, '-', '-');
1130 Scm_CharSetAddRange(set, ch, ch);
1131 lastchar = ch;
1132 } else {
1133 Scm_CharSetAddRange(set, lastchar, ch);
1134 lastchar = -1;
1135 }
1136 inrange = FALSE;
1137 } else {
1138 Scm_CharSetAddRange(set, ch, ch);
1139 lastchar = ch;
1140 }
1141 begin = FALSE;
1142 continue;
1143 }
1144 break;
1145 }
1146
1147 if (inrange) {
1148 Scm_CharSetAddRange(set, '-', '-');
1149 if (lastchar >= 0) Scm_CharSetAddRange(set, lastchar, lastchar);
1150 }
1151 if (complement_p) {
1152 *complement_p = complement;
1153 return SCM_OBJ(set);
1154 } else {
1155 if (complement) Scm_CharSetComplement(set);
1156 return SCM_OBJ(set);
1157 }
1158 err:
1159 if (error_p) {
1160 /* TODO: We should deal with the case when input contains \0 */
1161 Scm_Error("Invalid charset syntax [%s%s...",
1162 complement? "^" : "",
1163 Scm_DStringGetz(&buf));
1164 }
1165 return SCM_FALSE;
1166 }
1167
1168 /* Predefined charset name table */
1169 struct predef_charset_posix_name_rec {
1170 const char *name;
1171 int cset; /* default cset */
1172 int cset_unicode; /* 'unicode' mode cset (:alnum: for unicode range)
1173 not used yet. */
1174 };
1175
1176 #define PREDEF_ENTRY(n, cs, csu) \
1177 { n, SCM_CPP_CAT(SCM_CHAR_SET_, cs), SCM_CPP_CAT(SCM_CHAR_SET_, csu) }
1178
1179 static struct predef_charset_posix_name_rec predef_charset_posix_names[] = {
1180 PREDEF_ENTRY("alpha:", ASCII_LETTER, LETTER),
1181 PREDEF_ENTRY("alnum:", ASCII_LETTER_DIGIT, LETTER_DIGIT),
1182 PREDEF_ENTRY("blank:", ASCII_BLANK, BLANK),
1183 PREDEF_ENTRY("cntrl:", ASCII_CONTROL, ISO_CONTROL),
1184 PREDEF_ENTRY("digit:", ASCII_DIGIT, DIGIT),
1185 PREDEF_ENTRY("graph:", ASCII_GRAPHIC, GRAPHIC),
1186 PREDEF_ENTRY("lower:", ASCII_LOWER, LOWER),
1187 PREDEF_ENTRY("print:", ASCII_PRINTING, PRINTING),
1188 PREDEF_ENTRY("punct:", ASCII_PUNCTUATION, PUNCTUATION),
1189 PREDEF_ENTRY("space:", ASCII_WHITESPACE, WHITESPACE),
1190 PREDEF_ENTRY("upper:", ASCII_UPPER, UPPER),
1191 PREDEF_ENTRY("word:", ASCII_WORD, WORD),
1192 PREDEF_ENTRY("xdigit:", HEX_DIGIT, HEX_DIGIT),
1193 PREDEF_ENTRY("ascii:", ASCII, ASCII), /* like Go */
1194
1195 /* Gauche extension - explicitly unicode range */
1196 PREDEF_ENTRY("ALPHA:", LETTER, LETTER),
1197 PREDEF_ENTRY("ALNUM:", LETTER_DIGIT, LETTER_DIGIT),
1198 PREDEF_ENTRY("BLANK:", BLANK, BLANK),
1199 PREDEF_ENTRY("CNTRL:", ISO_CONTROL, ISO_CONTROL),
1200 PREDEF_ENTRY("DIGIT:", DIGIT, DIGIT),
1201 PREDEF_ENTRY("GRAPH:", GRAPHIC, GRAPHIC),
1202 PREDEF_ENTRY("LOWER:", LOWER, LOWER),
1203 PREDEF_ENTRY("PRINT:", PRINTING, PRINTING),
1204 PREDEF_ENTRY("PUNCT:", PUNCTUATION, PUNCTUATION),
1205 PREDEF_ENTRY("SPACE:", WHITESPACE, WHITESPACE),
1206 PREDEF_ENTRY("UPPER:", UPPER, UPPER),
1207 PREDEF_ENTRY("TITLE:", TITLE, TITLE),
1208 PREDEF_ENTRY("WORD:", WORD, WORD),
1209 PREDEF_ENTRY("XDIGIT:", HEX_DIGIT, HEX_DIGIT),
1210
1211 { NULL, 0, 0 }
1212 };
1213
1214 /* Read posix [:alpha:] etc. The first '[' is already read.
1215 Return #f on error if errorp is FALSE. */
read_predef_charset(ScmPort * input,int error_p)1216 static ScmObj read_predef_charset(ScmPort *input, int error_p)
1217 {
1218 #define MAX_CHARSET_NAME_LEN 11
1219 char name[MAX_CHARSET_NAME_LEN+1];
1220 int namecnt = 0;
1221 for (; namecnt < MAX_CHARSET_NAME_LEN; namecnt++) {
1222 int ch = Scm_Getc(input);
1223 if (ch == EOF && !SCM_CHAR_ASCII_P(ch)) {
1224 name[namecnt] = '\0';
1225 goto err;
1226 }
1227 if (ch == ']') break;
1228 name[namecnt] = (char)ch;
1229 }
1230 if (namecnt == MAX_CHARSET_NAME_LEN) goto err;
1231 name[namecnt] = '\0';
1232
1233 int complement = FALSE;
1234 const char *start = name+1;
1235
1236 if (*start == '^') {
1237 complement = TRUE;
1238 start++;
1239 }
1240
1241 struct predef_charset_posix_name_rec *e = predef_charset_posix_names;
1242 while (e->name != NULL) {
1243 if (strcmp(start, e->name) == 0) {
1244 if (!complement) {
1245 return Scm_GetStandardCharSet(e->cset);
1246 } else {
1247 return Scm_GetStandardCharSet(-e->cset);
1248 }
1249 }
1250 e++;
1251 }
1252 err:
1253 /* here we got invalid charset name */
1254 if (error_p) {
1255 Scm_Error("invalid or unsupported POSIX charset '[%s]'", name);
1256 }
1257 return SCM_FALSE;
1258 }
1259
1260 static struct predef_charset_category_name_rec {
1261 const char *cat;
1262 int cset;
1263 } predef_charset_category_name[] = {
1264 { "L", SCM_CHAR_SET_L },
1265 { "LC", SCM_CHAR_SET_LC },
1266 { "Lu", SCM_CHAR_SET_Lu },
1267 { "Ll", SCM_CHAR_SET_Ll },
1268 { "Lt", SCM_CHAR_SET_Lt },
1269 { "Lm", SCM_CHAR_SET_Lm },
1270 { "M", SCM_CHAR_SET_M },
1271 { "Mn", SCM_CHAR_SET_Mn },
1272 { "Mc", SCM_CHAR_SET_Mc },
1273 { "Me", SCM_CHAR_SET_Me },
1274 { "N", SCM_CHAR_SET_N },
1275 { "Nd", SCM_CHAR_SET_Nd },
1276 { "Nl", SCM_CHAR_SET_Nl },
1277 { "No", SCM_CHAR_SET_No },
1278 { "P", SCM_CHAR_SET_P },
1279 { "Pc", SCM_CHAR_SET_Pc },
1280 { "Pd", SCM_CHAR_SET_Pd },
1281 { "Ps", SCM_CHAR_SET_Ps },
1282 { "Pe", SCM_CHAR_SET_Pe },
1283 { "Pi", SCM_CHAR_SET_Pi },
1284 { "Pf", SCM_CHAR_SET_Pf },
1285 { "Po", SCM_CHAR_SET_Po },
1286 { "S", SCM_CHAR_SET_S },
1287 { "Sm", SCM_CHAR_SET_Sm },
1288 { "Sc", SCM_CHAR_SET_Sc },
1289 { "Sk", SCM_CHAR_SET_Sk },
1290 { "So", SCM_CHAR_SET_So },
1291 { "Z", SCM_CHAR_SET_Z },
1292 { "Zs", SCM_CHAR_SET_Zs },
1293 { "Zl", SCM_CHAR_SET_Zl },
1294 { "Zp", SCM_CHAR_SET_Zp },
1295 { "C", SCM_CHAR_SET_C },
1296 { "Cc", SCM_CHAR_SET_Cc },
1297 { "Cf", SCM_CHAR_SET_Cf },
1298 { "Cs", SCM_CHAR_SET_Cs },
1299 { "Co", SCM_CHAR_SET_Co },
1300 { "Cn", SCM_CHAR_SET_Cn },
1301 { NULL, 0 }
1302 };
1303
1304 /* Read \p{Category}, \P{Category}. INPUT must point right after 'p' or
1305 'P'. KEY is either 'p' or 'P'. On successful reading, Returns the
1306 charset number and update *cp to point right after the syntax.
1307 Otherwise, throws an error.
1308 */
Scm_CharSetParseCategory(ScmPort * input,char key)1309 int Scm_CharSetParseCategory(ScmPort *input, char key)
1310 {
1311 int ch = Scm_Getc(input);
1312 if (ch != '{') {
1313 Scm_Error("\\%c must followed by '{'", key);
1314 }
1315 char name[3];
1316
1317 ch = Scm_Getc(input);
1318 if (ch == EOF || !SCM_CHAR_ASCII_P(ch)) {
1319 name[0] = '\0';
1320 goto bad;
1321 }
1322 name[0] = (char)ch;
1323
1324 ch = Scm_Getc(input);
1325 if (ch == EOF || !SCM_CHAR_ASCII_P(ch)) {
1326 name[1] = '\0';
1327 goto bad;
1328 }
1329 if (ch == '}') {
1330 name[1] = '\0';
1331 } else {
1332 name[1] = (char)ch;
1333 ch = Scm_Getc(input);
1334 name[2] = '\0';
1335 if (ch != '}') {
1336 goto bad;
1337 }
1338 }
1339
1340 for (int j=0; predef_charset_category_name[j].cat; j++) {
1341 if (strcmp(name, predef_charset_category_name[j].cat) == 0) {
1342 if (key == 'p') {
1343 return predef_charset_category_name[j].cset;
1344 } else {
1345 return -predef_charset_category_name[j].cset;
1346 }
1347 }
1348 }
1349 bad:
1350 Scm_Error("Bad charset category name near \\%c{%s...", key, name);
1351 return 0; /* dummy */
1352 }
1353
1354
1355 /*-----------------------------------------------------------------
1356 * Character attributes
1357 */
1358
Scm_CharGeneralCategory(ScmChar ch)1359 int Scm_CharGeneralCategory(ScmChar ch)
1360 {
1361 return (int)(Scm__LookupCharCategory(ch) & SCM_CHAR_CATEGORY_MASK);
1362 }
1363
Scm_CharAlphabeticP(ScmChar ch)1364 int Scm_CharAlphabeticP(ScmChar ch)
1365 {
1366 return (SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch)) != 0;
1367 }
1368
Scm_CharUppercaseP(ScmChar ch)1369 int Scm_CharUppercaseP(ScmChar ch)
1370 {
1371 return ((SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch))
1372 == SCM_CHAR_UPPERCASE_BITS);
1373 }
1374
Scm_CharLowercaseP(ScmChar ch)1375 int Scm_CharLowercaseP(ScmChar ch)
1376 {
1377 return ((SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch))
1378 == SCM_CHAR_LOWERCASE_BITS);
1379 }
1380
Scm_CharTitlecaseP(ScmChar ch)1381 int Scm_CharTitlecaseP(ScmChar ch)
1382 {
1383 return (Scm_CharGeneralCategory(ch) == SCM_CHAR_CATEGORY_Lt);
1384 }
1385
Scm_CharNumericP(ScmChar ch)1386 int Scm_CharNumericP(ScmChar ch)
1387 {
1388 return (Scm_CharGeneralCategory(ch) == SCM_CHAR_CATEGORY_Nd);
1389 }
1390
1391 /* An internal entry to extract case mapping info.
1392 * Internal table is compressed, so the caller must provide
1393 * the buffer for ScmCharCaseMap.
1394 * The function returns either the pointer to the given buffer
1395 * with information filled, or a pointer to a static read-only
1396 * data structure in the internal table.
1397 */
1398 static const ScmCharCaseMap casemap_identity = {
1399 0, 0, 0, {-1}, {-1}, {-1}
1400 };
1401
Scm__CharCaseMap(ScmChar ch,ScmCharCaseMap * buf,int full)1402 const ScmCharCaseMap *Scm__CharCaseMap(ScmChar ch,
1403 ScmCharCaseMap *buf,
1404 int full)
1405 {
1406 if (ch < 0x10000) {
1407 int subtable = casemap_000[(ch >> 8) & 0xff];
1408 if (subtable == 255) return &casemap_identity;
1409
1410 unsigned short cmap =
1411 casemap_subtable[subtable][(unsigned char)(ch & 0xff)];
1412 if (cmap == SCM_CHAR_NO_CASE_MAPPING) return &casemap_identity;
1413 if (cmap & 0x8000) {
1414 /* mapping is extended. */
1415 return &(extended_casemaps[cmap & 0x7fff]);
1416 } else {
1417 /* mapping is simple */
1418 int off = (cmap & 0x2000)? (signed int)(cmap|~0x1fff) : cmap&0x1fff;
1419 if (cmap & 0x4000) {
1420 buf->to_upper_simple = off;
1421 buf->to_lower_simple = 0;
1422 buf->to_title_simple = off;
1423 } else {
1424 buf->to_upper_simple = 0;
1425 buf->to_lower_simple = off;
1426 buf->to_title_simple = 0;
1427 }
1428 if (full) {
1429 /* indicate no special mappings */
1430 buf->to_upper_full[0] = -1;
1431 buf->to_lower_full[0] = -1;
1432 buf->to_title_full[0] = -1;
1433 }
1434 return buf;
1435 }
1436 } else {
1437 /* TODO: 104xx*/
1438 return &casemap_identity;
1439 }
1440 }
1441
1442 /*
1443 * Case conversion API. For the time being, CharCaseMap works on Unicode
1444 * codepoints, so we have to convert from/to ScmChar if the internal encoding
1445 * is either EUC-JP or SJIS.
1446 */
1447 #define SIMPLE_CASE(code, buf, field) \
1448 (ScmChar)((code) + Scm__CharCaseMap((code), (buf), FALSE)->SCM_CPP_CAT3(to_, field, _simple))
1449
1450 #define SIMPLE_CASE_CV(code, buf, field) \
1451 ((code) = (ScmChar)Scm_CharToUcs((int)(code)), \
1452 (code) = SIMPLE_CASE(code, buf, field), \
1453 Scm_UcsToChar((int)(code)))
1454
Scm_CharUpcase(ScmChar ch)1455 ScmChar Scm_CharUpcase(ScmChar ch)
1456 {
1457 ScmCharCaseMap cm;
1458 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1459 if (ch < 0x80) return SIMPLE_CASE(ch, &cm, upper);
1460 else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, upper);
1461 else return ch;
1462 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1463 return SIMPLE_CASE(ch, &cm, upper);
1464 #else
1465 /* Latin-1 mapping and Unicode mapping differ in U+00B5 (MICRO SIGN)
1466 and U+00FF (LATIN SMALL LETTER Y WITH DIAERESIS). In Unicode
1467 they map to U+039C and U+0178, respectively. In Latin-1 we don't
1468 have those characters, so we leave them alone. */
1469 if (ch == 0xb5 || ch == 0xff) return ch;
1470 else return SIMPLE_CASE(ch, &cm, upper);
1471 #endif
1472 }
1473
Scm_CharDowncase(ScmChar ch)1474 ScmChar Scm_CharDowncase(ScmChar ch)
1475 {
1476 ScmCharCaseMap cm;
1477 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1478 if (ch < 0x80) return SIMPLE_CASE(ch, &cm, lower);
1479 else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, lower);
1480 else return ch;
1481 #else
1482 return SIMPLE_CASE(ch, &cm, lower);
1483 #endif
1484 }
1485
Scm_CharTitlecase(ScmChar ch)1486 ScmChar Scm_CharTitlecase(ScmChar ch)
1487 {
1488 ScmCharCaseMap cm;
1489 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1490 if (ch < 0x80) return SIMPLE_CASE(ch, &cm, title);
1491 else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, title);
1492 else return ch;
1493 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1494 return SIMPLE_CASE(ch, &cm, title);
1495 #else
1496 /* In Latin-1, titlecase is the same as upcase. */
1497 return Scm_CharUpcase(ch);
1498 #endif
1499 }
1500
Scm_CharFoldcase(ScmChar ch)1501 ScmChar Scm_CharFoldcase(ScmChar ch)
1502 {
1503 ScmCharCaseMap cm;
1504 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1505 if (Scm__CharInUnicodeP(ch)) {
1506 ScmChar ucs = (ScmChar)Scm_CharToUcs(ch);
1507 const ScmCharCaseMap *pcm = Scm__CharCaseMap(ucs, &cm, FALSE);
1508 if (pcm->to_lower_simple == 0 && pcm->to_upper_simple == 0) {
1509 /* we don't have case folding */
1510 return ch;
1511 }
1512 /* Otherwise, we do (char-downcase (char-upcase ch)) */
1513 if (pcm->to_upper_simple != 0) {
1514 ucs += pcm->to_upper_simple;
1515 pcm = Scm__CharCaseMap(ucs, &cm, FALSE);
1516 }
1517 return Scm_UcsToChar((int)(ucs + pcm->to_lower_simple));
1518 } else {
1519 return ch;
1520 }
1521 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1522 if (ch == 0x130 || ch == 0x131) {
1523 /* char-foldcase is identity for
1524 U+0130 Turkish I (LATIN CAPITAL LETTER I WITH DOT ABOVE) and
1525 U+0131 Turkish i (LATIN SMALL LETTER DOTLESS I) */
1526 return ch;
1527 }
1528 const ScmCharCaseMap *pcm = Scm__CharCaseMap(ch, &cm, FALSE);
1529 if (pcm->to_lower_simple == 0 && pcm->to_upper_simple == 0) {
1530 /* we don't have case folding */
1531 return ch;
1532 }
1533 /* Otherwise, we do (char-downcase (char-upcase ch)) */
1534 if (pcm->to_upper_simple != 0) {
1535 ch += pcm->to_upper_simple;
1536 pcm = Scm__CharCaseMap(ch, &cm, FALSE);
1537 }
1538 return ch + pcm->to_lower_simple;
1539 #else
1540 /* In Latin-1 range, foldcase is the same as donwcase. */
1541 return SIMPLE_CASE(ch, &cm, lower);
1542 #endif
1543 }
1544
1545 /*-----------------------------------------------------------------
1546 * Pre-defined charset
1547 */
1548
1549 /* Most predefined charset are pre-generated as static immutable data.
1550 See gen-unicode.scm for the generation code. */
1551
Scm_GetStandardCharSet(int id)1552 ScmObj Scm_GetStandardCharSet(int id)
1553 {
1554 if (id == 0
1555 || id >= SCM_CHAR_SET_NUM_PREDEFINED_SETS
1556 || id <= -SCM_CHAR_SET_NUM_PREDEFINED_SETS) {
1557 Scm_Error("bad id for predefined charset index: %d", id);
1558 }
1559
1560 if (id > 0) {
1561 return predef_sets[id];
1562 } else {
1563 if (!SCM_CHAR_SET_P(predef_sets_complement[-id])) {
1564 ScmObj cs = Scm_CharSetCopy(SCM_CHAR_SET(predef_sets[-id]));
1565 cs = Scm_CharSetComplement(SCM_CHAR_SET(cs));
1566 Scm_CharSetFreezeX(SCM_CHAR_SET(cs));
1567 predef_sets_complement[-id] = cs;
1568 }
1569 return predef_sets_complement[-id];
1570 }
1571 }
1572
Scm__InitChar(void)1573 void Scm__InitChar(void)
1574 {
1575 ScmModule *mod = Scm_GaucheModule();
1576
1577 init_predefined_charsets();
1578 predef_sets[SCM_CHAR_SET_FULL] = Scm_CharSetComplement(make_charset());
1579
1580 #define DEFCS(name, id) \
1581 Scm_Define(mod, SCM_SYMBOL(SCM_INTERN("char-set:" name)), predef_sets[SCM_CPP_CAT(SCM_CHAR_SET_, id)])
1582
1583 DEFCS("L", L);
1584 DEFCS("LC", LC);
1585 DEFCS("Lu", Lu);
1586 DEFCS("Ll", Ll);
1587 DEFCS("Lt", Lt);
1588 DEFCS("Lm", Lm);
1589 DEFCS("Lo", Lo);
1590 DEFCS("M", M);
1591 DEFCS("Mn", Mn);
1592 DEFCS("Mc", Mc);
1593 DEFCS("Me", Me);
1594 DEFCS("N", N);
1595 DEFCS("Nd", Nd);
1596 DEFCS("Nl", Nl);
1597 DEFCS("No", No);
1598 DEFCS("P", P);
1599 DEFCS("Pc", Pc);
1600 DEFCS("Pd", Pd);
1601 DEFCS("Ps", Ps);
1602 DEFCS("Pe", Pe);
1603 DEFCS("Pi", Pi);
1604 DEFCS("Pf", Pf);
1605 DEFCS("Po", Po);
1606 DEFCS("S", S);
1607 DEFCS("Sm", Sm);
1608 DEFCS("Sc", Sc);
1609 DEFCS("Sk", Sk);
1610 DEFCS("So", So);
1611 DEFCS("Z", Z);
1612 DEFCS("Zs", Zs);
1613 DEFCS("Zl", Zl);
1614 DEFCS("Zp", Zp);
1615 DEFCS("C", C);
1616 DEFCS("Cc", Cc);
1617 DEFCS("Cf", Cf);
1618 DEFCS("Cs", Cs);
1619 DEFCS("Co", Co);
1620 DEFCS("Cn", Cn);
1621
1622 DEFCS("lower-case", LOWER);
1623 DEFCS("ascii-lower-case", ASCII_LOWER);
1624 DEFCS("upper-case", UPPER);
1625 DEFCS("ascii-upper-case", ASCII_UPPER);
1626 DEFCS("title-case", TITLE);
1627 DEFCS("letter", LETTER);
1628 DEFCS("ascii-letter", ASCII_LETTER);
1629 DEFCS("digit", DIGIT);
1630 DEFCS("ascii-digit", ASCII_DIGIT);
1631 DEFCS("letter+digit", LETTER_DIGIT);
1632 DEFCS("ascii-letter+digit", ASCII_LETTER_DIGIT);
1633 DEFCS("graphic", GRAPHIC);
1634 DEFCS("ascii-graphic", ASCII_GRAPHIC);
1635 DEFCS("printing", PRINTING);
1636 DEFCS("ascii-printing", ASCII_PRINTING);
1637 DEFCS("whitespace", WHITESPACE);
1638 DEFCS("ascii-whitespace", ASCII_WHITESPACE);
1639 DEFCS("iso-control", ISO_CONTROL);
1640 DEFCS("ascii-control", ASCII_CONTROL);
1641 DEFCS("punctuation", PUNCTUATION);
1642 DEFCS("ascii-punctuation", ASCII_PUNCTUATION);
1643 DEFCS("symbol", SYMBOL);
1644 DEFCS("ascii-symbol", ASCII_SYMBOL);
1645 DEFCS("hex-digit", HEX_DIGIT);
1646 DEFCS("blank", BLANK);
1647 DEFCS("ascii-blank", ASCII_BLANK);
1648 DEFCS("ascii", ASCII);
1649 DEFCS("word", WORD);
1650 DEFCS("ascii-word", ASCII_WORD);
1651 DEFCS("empty", EMPTY);
1652 DEFCS("full", FULL);
1653
1654 /* We initialize complement charset on demand, except EMPTY and FULL. */
1655 for (int i=0; i<SCM_CHAR_SET_NUM_PREDEFINED_SETS; i++) {
1656 predef_sets_complement[i] = SCM_FALSE;
1657 }
1658 predef_sets_complement[SCM_CHAR_SET_EMPTY]
1659 = predef_sets_complement[SCM_CHAR_SET_FULL];
1660 predef_sets_complement[SCM_CHAR_SET_FULL]
1661 = predef_sets_complement[SCM_CHAR_SET_EMPTY];
1662
1663 /* Expose internal charset */
1664 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP)
1665 Scm_AddFeature("gauche.ces.eucjp", NULL);
1666 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
1667 Scm_AddFeature("gauche.ces.sjis", NULL);
1668 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1669 Scm_AddFeature("gauche.ces.utf8", NULL);
1670 #else
1671 Scm_AddFeature("gauche.ces.none", NULL);
1672 #endif
1673 }
1674