1 /*
2  * char.c - character and character set operations
3  *
4  *   Copyright (c) 2000-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include <ctype.h>
35 #define LIBGAUCHE_BODY
36 #include "gauche.h"
37 #include "gauche/char_attr.h"
38 #include "gauche/priv/charP.h"
39 #include "gauche/priv/vectorP.h"
40 
41 static ScmObj predef_sets[SCM_CHAR_SET_NUM_PREDEFINED_SETS];
42 static ScmObj predef_sets_complement[SCM_CHAR_SET_NUM_PREDEFINED_SETS];
43 
44 #include "char_attr.c"          /* generated tables */
45 
46 /*=======================================================================
47  * Character functions
48  */
49 
Scm_CharEncodingName(void)50 ScmObj Scm_CharEncodingName(void)
51 {
52     return SCM_INTERN(SCM_CHAR_ENCODING_NAME);
53 }
54 
55 /* includes encoding-specific auxiliary functions */
56 #define SCM_CHAR_ENCODING_BODY
57 #if   defined(GAUCHE_CHAR_ENCODING_EUC_JP)
58 #include "gauche/char_euc_jp.h"
59 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
60 #include "gauche/char_utf_8.h"
61 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
62 #include "gauche/char_sjis.h"
63 #else
64 #include "gauche/char_none.h"
65 #endif
66 
Scm_SupportedCharacterEncodings(void)67 const char **Scm_SupportedCharacterEncodings(void)
68 {
69     return supportedCharacterEncodings;
70 }
71 
Scm_SupportedCharacterEncodingP(const char * encoding)72 int Scm_SupportedCharacterEncodingP(const char *encoding)
73 {
74     const char **cs = supportedCharacterEncodings;
75     for (;*cs;cs++) {
76         const char *p = *cs;
77         const char *q = encoding;
78         for (;*p && *q; p++, q++) {
79             if (tolower(*p) != tolower(*q)) break;
80         }
81         if (*p == '\0' && *q == '\0') return TRUE;
82     }
83     return FALSE;
84 }
85 
86 /* '0' -> 0, 'a' -> 10, etc.
87    Radix is assumed in the range [2, 36] if non-extended,
88    [2, 10] if extended.
89    'Extended' means we recognize not only ASCII but all Nd characters.
90 */
Scm_DigitToInt(ScmChar ch,int radix,int extended)91 int Scm_DigitToInt(ScmChar ch, int radix, int extended)
92 {
93     if (ch < '0') return -1;
94     if (radix < 2) return -1;
95     if (radix <= 10) {
96         if (ch < '0' + radix) return (ch - '0');
97     } else {
98         if (ch <= '9') return (ch - '0');
99         if (ch < 'A') return -1;
100         if (ch < 'A' + radix - 10) return (ch - 'A' + 10);
101         if (ch < 'a') return -1;
102         if (ch < 'a' + radix - 10) return (ch - 'a' + 10);
103     }
104     if (extended && ch > 0x80 && radix <= 10) {
105         ScmChar ucschar = Scm_CharToUcs(ch);
106         int val = ucs_digit_value(ucschar);
107         if (val < 0 || val >= radix) return -1;
108         return val;
109     } else {
110         return -1;
111     }
112 }
113 
Scm_IntToDigit(int n,int radix,int basechar1,int basechar2)114 ScmChar Scm_IntToDigit(int n, int radix, int basechar1, int basechar2)
115 {
116     if (n < 0) return SCM_CHAR_INVALID;
117     if (basechar1 == 0) basechar1 = '0';
118     if (basechar2 == 0) basechar2 = 'a';
119     if (radix <= 10) {
120         if (n < radix) return (ScmChar)(n + basechar1);
121         else return SCM_CHAR_INVALID;
122     } else {
123         if (n < 10) return (ScmChar)(n + basechar1);
124         if (n < radix) return (ScmChar)(n - 10 + basechar2);
125         else return SCM_CHAR_INVALID;
126     }
127 }
128 
129 /*
130  * Convert UCS4 code <-> character
131  * If the native encoding is not utf-8, gauche.charconv module is loaded.
132  * and these pointers are filled.
133  */
134 static ScmChar (*ucs2char_hook)(int ucs4) = NULL;
135 static int     (*char2ucs_hook)(ScmChar ch) = NULL;
136 
137 /* called by gauche.charconv */
Scm__InstallCharconvHooks(ScmChar (* u2c)(int),int (* c2u)(ScmChar))138 void Scm__InstallCharconvHooks(ScmChar (*u2c)(int), int (*c2u)(ScmChar))
139 {
140     ucs2char_hook = u2c;
141     char2ucs_hook = c2u;
142 }
143 
144 /* TRANSIENT: These two variables are no longer used, but kept here for the
145    ABI compatibility.  Remove them on 1.0 release. */
146 #if GAUCHE_API_VERSION < 1000
147 ScmChar (*Scm_UcsToCharHook)(int ucs4) = NULL;
148 int (*Scm_CharToUcsHook)(ScmChar ch) = NULL;
149 #endif /*GAUCHE_API_VERSION < 1000*/
150 
Scm_UcsToChar(int n)151 ScmChar Scm_UcsToChar(int n)
152 {
153     if (n < 0) Scm_Error("bad character code: %d", n);
154 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
155     return (ScmChar)n;
156 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
157     if (n < 0x80) return (ScmChar)n; /*ASCII range*/
158     if (ucs2char_hook == NULL) {
159         /* NB: we don't need mutex here, for the loading of gauche.charconv
160            is serialized in Scm_Require. */
161         Scm_Require(SCM_MAKE_STR("gauche/charconv"),
162                     SCM_LOAD_PROPAGATE_ERROR, NULL);
163         if (ucs2char_hook == NULL) {
164             Scm_Error("couldn't autoload gauche.charconv");
165         }
166     }
167     return ucs2char_hook(n);
168 #else
169     /* Encoding == 'none'.  It would be safer to reject anything beyond
170        0xff, but it prevents 'none' gosh from reading any source files that
171        have escaped characters in that range, even the section is cond-expanded.
172        That's awfully incovenient, so we use a substitution character '?' here,
173        relying the programmer to properly conditionalize the code.
174        We plan to drop 'none' encoding support in 1.0, so this kludge is
175        just a temporary measure.
176     */
177     if (n < 0x100) return (ScmChar)n; /* ISO8859-1 */
178     else return (ScmChar)'?';
179 #endif
180 }
181 
Scm_CharToUcs(ScmChar ch)182 int Scm_CharToUcs(ScmChar ch)
183 {
184     if (ch == SCM_CHAR_INVALID) Scm_Error("bad character");
185 #if defined(GAUCHE_CHAR_ENCODING_UTF_8)
186     return (int)ch;
187 #elif defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
188     if (ch < 0x80) return (int)ch; /*ASCII range*/
189     if (char2ucs_hook == NULL) {
190         /* NB: we don't need mutex here, for the loading of gauche.charconv
191            is serialized in Scm_Require. */
192         Scm_Require(SCM_MAKE_STR("gauche/charconv"),
193                     SCM_LOAD_PROPAGATE_ERROR, NULL);
194         if (char2ucs_hook == NULL) {
195             Scm_Error("couldn't autoload gauche.charconv");
196         }
197     }
198     return char2ucs_hook(ch);
199 #else
200     return (int)ch;             /* ISO8859-1 */
201 #endif /*!GAUCHE_CHAR_ENCODING_UTF_8*/
202 }
203 
204 /*
205  * Charcter classification for lexical parsing
206  */
207 
208 /* Table of initial 128 bytes of ASCII characters to dispatch for
209    special meanings. */
210 
211 enum {
212     INITIAL          = 1<<0,      /* <initial> */
213     SUBSEQUENT       = 1<<1,      /* <subsequent> */
214     SIGN_SUBSEQUENT  = 1<<2,      /* <sign subsequent> */
215     DELIMITER        = 1<<3,      /* <delimiter> */
216     GAUCHE_DELIMITER = 1<<4,      /* Gauche-extended delimiter */
217 };
218 
219 static const unsigned char ctypes[128] = {
220     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
221     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
222     /*SPC*/  DELIMITER,
223     /* ! */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
224     /* " */  DELIMITER,
225     /* # */  0,
226     /* $ */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
227     /* % */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
228     /* & */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
229     /* ' */  GAUCHE_DELIMITER,
230     /* ( */  DELIMITER,
231     /* ) */  DELIMITER,
232     /* * */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
233     /* + */  SUBSEQUENT|SIGN_SUBSEQUENT,
234     /* , */  GAUCHE_DELIMITER,
235     /* - */  SUBSEQUENT|SIGN_SUBSEQUENT,
236     /* . */  SUBSEQUENT,
237     /* / */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
238 
239     /* 0 */  SUBSEQUENT,
240     /* 1 */  SUBSEQUENT,
241     /* 2 */  SUBSEQUENT,
242     /* 3 */  SUBSEQUENT,
243     /* 4 */  SUBSEQUENT,
244     /* 5 */  SUBSEQUENT,
245     /* 6 */  SUBSEQUENT,
246     /* 7 */  SUBSEQUENT,
247     /* 8 */  SUBSEQUENT,
248     /* 9 */  SUBSEQUENT,
249     /* : */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
250     /* ; */  DELIMITER,
251     /* < */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
252     /* = */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
253     /* > */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
254     /* ? */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
255 
256     /* @ */  SUBSEQUENT|SIGN_SUBSEQUENT,
257     /* A */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
258     /* B */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
259     /* C */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
260     /* D */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
261     /* E */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
262     /* F */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
263     /* G */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
264     /* H */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
265     /* I */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
266     /* J */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
267     /* K */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
268     /* L */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
269     /* M */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
270     /* N */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
271     /* O */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
272 
273     /* P */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
274     /* Q */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
275     /* R */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
276     /* S */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
277     /* T */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
278     /* U */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
279     /* V */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
280     /* W */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
281     /* X */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
282     /* Y */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
283     /* Z */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
284     /* [ */  GAUCHE_DELIMITER,
285     /* \ */  0,
286     /* ] */  GAUCHE_DELIMITER,
287     /* ^ */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
288     /* _ */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
289 
290     /* ` */  GAUCHE_DELIMITER,
291     /* a */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
292     /* b */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
293     /* c */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
294     /* d */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
295     /* e */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
296     /* f */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
297     /* g */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
298     /* h */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
299     /* i */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
300     /* j */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
301     /* k */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
302     /* l */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
303     /* m */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
304     /* n */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
305     /* o */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
306 
307     /* p */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
308     /* q */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
309     /* r */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
310     /* s */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
311     /* t */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
312     /* u */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
313     /* v */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
314     /* w */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
315     /* x */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
316     /* y */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
317     /* z */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
318     /* { */  GAUCHE_DELIMITER,
319     /* | */  DELIMITER,
320     /* } */  GAUCHE_DELIMITER,
321     /* ~ */  INITIAL|SUBSEQUENT|SIGN_SUBSEQUENT,
322     /*DEL*/  0,
323 };
324 
Scm_CharLexerCategoryP(ScmChar c,ScmCharLexerCategory category)325 int Scm_CharLexerCategoryP(ScmChar c, ScmCharLexerCategory category)
326 {
327     if (c < 128) {
328         switch (category) {
329         case SCM_CHAR_INITIAL:
330             return !!(ctypes[c] & INITIAL);
331         case SCM_CHAR_SUBSEQUENT:
332             return !!(ctypes[c] & SUBSEQUENT);
333         case SCM_CHAR_SIGN_SUBSEQUENT:
334             return !!(ctypes[c] & SIGN_SUBSEQUENT);
335         }
336     }
337     if (c == 0x200c || c == 0x200d) {
338         /* can be INITIAL, SUBSEQUENT and SIGN_SUBSEQUENT */
339         return TRUE;
340     }
341     switch (Scm_CharGeneralCategory(c)) {
342     case SCM_CHAR_CATEGORY_Lu:
343     case SCM_CHAR_CATEGORY_Ll:
344     case SCM_CHAR_CATEGORY_Lt:
345     case SCM_CHAR_CATEGORY_Lm:
346     case SCM_CHAR_CATEGORY_Lo:
347     case SCM_CHAR_CATEGORY_Mn:
348     case SCM_CHAR_CATEGORY_Nl:
349     case SCM_CHAR_CATEGORY_No:
350     case SCM_CHAR_CATEGORY_Pd:
351     case SCM_CHAR_CATEGORY_Pc:
352     case SCM_CHAR_CATEGORY_Po:
353     case SCM_CHAR_CATEGORY_Sc:
354     case SCM_CHAR_CATEGORY_Sm:
355     case SCM_CHAR_CATEGORY_Sk:
356     case SCM_CHAR_CATEGORY_So:
357     case SCM_CHAR_CATEGORY_Co:
358         /* can be INITIAL, SUBSEQUENT and SIGN_SUBSEQUENT */
359         return TRUE;
360     case SCM_CHAR_CATEGORY_Nd:
361     case SCM_CHAR_CATEGORY_Mc:
362     case SCM_CHAR_CATEGORY_Me:
363         return (category != SCM_CHAR_INITIAL);
364     default: return FALSE;
365     }
366 }
367 
368 
369 /*=======================================================================
370  * Character set (cf. SRFI-14)
371  */
372 /* NB: operations on charset are not very optimized, for I don't see
373  * the immediate needs to do so, except Scm_CharSetContains.
374  */
375 
376 static void charset_print(ScmObj obj, ScmPort *out, ScmWriteContext*);
377 static int charset_compare(ScmObj x, ScmObj y, int equalp);
378 SCM_DEFINE_BUILTIN_CLASS(Scm_CharSetClass,
379                          charset_print, charset_compare, NULL, NULL,
380                          SCM_CLASS_COLLECTION_CPL);
381 
382 #define MASK_ISSET(cs, ch)  SCM_BITS_TEST(cs->small, ch)
383 #define MASK_SET(cs, ch)    SCM_BITS_SET(cs->small, ch)
384 #define MASK_RESET(cs, ch)  SCM_BITS_RESET(cs->small, ch)
385 
check_mutable(ScmCharSet * cs)386 static inline void check_mutable(ScmCharSet *cs)
387 {
388     if (SCM_CHAR_SET_IMMUTABLE_P(cs))
389         Scm_Error("Char set is immutable: %S", cs);
390 }
391 
set_large(ScmCharSet * cs,int flag)392 static inline void set_large(ScmCharSet *cs, int flag)
393 {
394     if (flag) {
395         cs->flags |= SCM_CHAR_SET_LARGE;
396     } else {
397         cs->flags &= ~(SCM_CHAR_SET_LARGE);
398     }
399 }
400 
401 /*----------------------------------------------------------------------
402  * Printer
403  */
404 
charset_print_ch(ScmPort * out,ScmChar ch,int firstp)405 static void charset_print_ch(ScmPort *out, ScmChar ch, int firstp)
406 {
407     if (ch != 0 && ch < 0x80
408         && (strchr("[]-\\", ch) != NULL || (ch == '^' && firstp))) {
409         Scm_Printf(out, "\\%C", ch);
410     } else {
411         switch (Scm_CharGeneralCategory(ch)) {
412         case SCM_CHAR_CATEGORY_Mn:
413         case SCM_CHAR_CATEGORY_Mc:
414         case SCM_CHAR_CATEGORY_Me:
415         case SCM_CHAR_CATEGORY_Cc:
416         case SCM_CHAR_CATEGORY_Cf:
417         case SCM_CHAR_CATEGORY_Cs:
418         case SCM_CHAR_CATEGORY_Co:
419         case SCM_CHAR_CATEGORY_Cn:
420             if (ch < 0x10000) Scm_Printf(out, "\\u%04lx", ch);
421             else              Scm_Printf(out, "\\U%08lx", ch);
422             break;
423         default:
424             Scm_Putc(ch, out);
425         }
426     }
427 }
428 
charset_print(ScmObj obj,ScmPort * out,ScmWriteContext * ctx SCM_UNUSED)429 static void charset_print(ScmObj obj, ScmPort *out,
430                           ScmWriteContext *ctx SCM_UNUSED)
431 {
432     int prev, code, first = TRUE;
433     ScmCharSet *cs = SCM_CHAR_SET(obj);
434 
435     Scm_Printf(out, "#[");
436     for (prev = -1, code = 0; code < SCM_CHAR_SET_SMALL_CHARS; code++) {
437         if (MASK_ISSET(cs, code) && prev < 0) {
438             charset_print_ch(out, code, first);
439             prev = code;
440             first = FALSE;
441         }
442         else if (!MASK_ISSET(cs, code) && prev >= 0) {
443             if (code - prev > 1) {
444                 if (code - prev > 2) Scm_Printf(out, "-");
445                 charset_print_ch(out, code-1, FALSE);
446             }
447             prev = -1;
448         }
449     }
450     if (prev >= 0) {
451         if (code - prev > 1) {
452             if (prev < 0x7e) Scm_Printf(out, "-");
453             charset_print_ch(out, code-1, FALSE);
454         }
455     }
456 
457     if (cs->flags & SCM_CHAR_SET_IMMUTABLE) {
458         const uint32_t *v = cs->large.frozen.vec;
459         for (ScmSize i = 0; i < cs->large.frozen.size; i += 2) {
460             charset_print_ch(out, (int)v[i], FALSE);
461             if (v[i] != v[i+1]) {
462                 if (v[i+1] - v[i] > 2) Scm_Printf(out, "-");
463                 charset_print_ch(out, (int)v[i+1], FALSE);
464             }
465         }
466     } else {
467         ScmTreeIter iter;
468         ScmDictEntry *e;
469         Scm_TreeIterInit(&iter, &cs->large.tree, NULL);
470         while ((e = Scm_TreeIterNext(&iter)) != NULL) {
471             charset_print_ch(out, (int)e->key, FALSE);
472             if (e->value != e->key) {
473                 if (e->value - e->key > 2) Scm_Printf(out, "-");
474                 charset_print_ch(out, (int)e->value, FALSE);
475             }
476         }
477     }
478     Scm_Printf(out, "]");
479 }
480 
481 /*-----------------------------------------------------------------
482  * Iterators for large char set
483  */
484 typedef struct cs_iter_rec {
485     ScmCharSet *cs;
486     int end;
487     union {
488         ScmTreeIter ti;
489         ScmSize vi;
490     } iter;
491 } cs_iter;
492 
cs_iter_init(cs_iter * ci,ScmCharSet * cs)493 static void cs_iter_init(cs_iter *ci, ScmCharSet *cs)
494 {
495     ci->cs = cs;
496     if (!SCM_CHAR_SET_LARGE_P(cs)) {
497         ci->end = TRUE;
498     } else {
499         ci->end = FALSE;
500         if (SCM_CHAR_SET_IMMUTABLE_P(cs)) {
501             ci->iter.vi = 0;
502         } else {
503             Scm_TreeIterInit(&ci->iter.ti, &cs->large.tree, NULL);
504         }
505     }
506 }
507 
508 /* returns FALSE if already exhausted */
cs_iter_next(cs_iter * ci,ScmChar * from,ScmChar * to)509 static int cs_iter_next(cs_iter *ci,
510                         ScmChar *from /*out*/,
511                         ScmChar *to   /*out*/)
512 {
513     if (ci->end) return FALSE;
514     if (SCM_CHAR_SET_IMMUTABLE_P(ci->cs)) {
515         if (ci->iter.vi >= ci->cs->large.frozen.size) {
516             ci->end = TRUE;
517             return FALSE;
518         } else {
519             *from = (ScmChar)ci->cs->large.frozen.vec[ci->iter.vi];
520             *to   = (ScmChar)ci->cs->large.frozen.vec[ci->iter.vi+1];
521             ci->iter.vi += 2;
522             return TRUE;
523         }
524     } else {
525         ScmDictEntry *e = Scm_TreeIterNext(&ci->iter.ti);
526         if (e == NULL) {
527             ci->end = TRUE;
528             return FALSE;
529         } else {
530             *from = (ScmChar)e->key;
531             *to = (ScmChar)e->value;
532             return TRUE;
533         }
534     }
535 }
536 
537 /*-----------------------------------------------------------------
538  * Constructors
539  */
cmp(ScmTreeCore * tc SCM_UNUSED,intptr_t a,intptr_t b)540 static int cmp(ScmTreeCore *tc SCM_UNUSED, intptr_t a, intptr_t b)
541 {
542     if (a > b) return 1;
543     if (a < b) return -1;
544     return 0;
545 }
546 
make_charset(void)547 static ScmCharSet *make_charset(void)
548 {
549     ScmCharSet *cs = SCM_NEW(ScmCharSet);
550     SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
551     Scm_BitsFill(cs->small, 0, SCM_CHAR_SET_SMALL_CHARS, 0);
552     Scm_TreeCoreInit(&cs->large.tree, cmp, NULL);
553     cs->flags = 0;              /* small & mutable by default */
554     return cs;
555 }
556 
Scm_MakeEmptyCharSet(void)557 ScmObj Scm_MakeEmptyCharSet(void)
558 {
559     return SCM_OBJ(make_charset());
560 }
561 
562 /* This is mainly for precompiled module. */
Scm_MakeImmutableCharSet(const ScmBits * small,const uint32_t * vec,size_t size)563 ScmObj Scm_MakeImmutableCharSet(const ScmBits *small,
564                                 const uint32_t *vec,
565                                 size_t size)
566 {
567     SCM_ASSERT(size % 2 == 0);
568     ScmCharSet *cs = SCM_NEW(ScmCharSet);
569     SCM_SET_CLASS(cs, SCM_CLASS_CHARSET);
570     cs->flags |= SCM_CHAR_SET_IMMUTABLE;
571     memcpy(cs->small, small, sizeof(cs->small));
572     if (vec != NULL && size > 0) {
573         set_large(cs, TRUE);
574         if ((cs->large.frozen.size = size) == 2) {
575             cs->large.frozen.ivec[0] = vec[0];
576             cs->large.frozen.ivec[1] = vec[1];
577             cs->large.frozen.vec = cs->large.frozen.ivec;
578         } else {
579             cs->large.frozen.vec = vec;
580         }
581     }
582     return SCM_OBJ(cs);
583 }
584 
Scm_CharSetCopy(ScmCharSet * src)585 ScmObj Scm_CharSetCopy(ScmCharSet *src)
586 {
587     ScmCharSet *dst = make_charset();
588     Scm_BitsCopyX(dst->small, 0, src->small, 0, SCM_CHAR_SET_SMALL_CHARS);
589     set_large(dst, SCM_CHAR_SET_LARGE_P(src));
590     if (SCM_CHAR_SET_IMMUTABLE_P(src)) {
591         /* The destination is mutable */
592         const uint32_t *vec = src->large.frozen.vec;
593         for (ScmSize k = 0; k < src->large.frozen.size; k += 2) {
594             ScmDictEntry *e = Scm_TreeCoreSearch(&dst->large.tree,
595                                                  vec[k], SCM_DICT_CREATE);
596             e->value = vec[k+1];
597         }
598     } else {
599         Scm_TreeCoreCopy(&dst->large.tree, &src->large.tree);
600     }
601     return SCM_OBJ(dst);
602 }
603 
604 /* Creates flat searched vector to be used for immutable charset.
605    SRC must be a mutable charset.
606    The caller must provide uint32_t[2] buffer for ivec. */
char_set_freeze_vec(ScmCharSet * src,uint32_t * ivec,ScmSize * size)607 static uint32_t *char_set_freeze_vec(ScmCharSet *src,
608                                      uint32_t *ivec,
609                                      ScmSize *size /*out*/)
610 {
611     SCM_ASSERT(!SCM_CHAR_SET_IMMUTABLE_P(src));
612     size_t s = (size_t)Scm_TreeCoreNumEntries(&src->large.tree) * 2;
613     uint32_t *v = (s == 2)? ivec : SCM_NEW_ATOMIC_ARRAY(uint32_t, s);
614 
615     cs_iter iter;
616     cs_iter_init(&iter, src);
617     ScmChar lo, hi;
618     for (size_t k = 0; cs_iter_next(&iter, &lo, &hi); k += 2) {
619         SCM_ASSERT(k < s);
620         v[k]   = (uint32_t)lo;
621         v[k+1] = (uint32_t)hi;
622     }
623     *size = s;
624     return v;
625 }
626 
Scm_CharSetFreeze(ScmCharSet * src)627 ScmObj Scm_CharSetFreeze(ScmCharSet *src)
628 {
629     if (SCM_CHAR_SET_IMMUTABLE_P(src)) return SCM_OBJ(src);
630     ScmCharSet *dst = make_charset();
631     Scm_BitsCopyX(dst->small, 0, src->small, 0, SCM_CHAR_SET_SMALL_CHARS);
632 
633     dst->flags |= SCM_CHAR_SET_IMMUTABLE;
634     if (SCM_CHAR_SET_LARGE_P(src)) {
635         set_large(dst, TRUE);
636         dst->large.frozen.vec = char_set_freeze_vec(src,
637                                                     dst->large.frozen.ivec,
638                                                     &dst->large.frozen.size);
639     } else {
640         dst->large.frozen.vec = NULL;
641         dst->large.frozen.size = 0;
642     }
643     return SCM_OBJ(dst);
644 }
645 
Scm_CharSetFreezeX(ScmCharSet * src)646 ScmObj Scm_CharSetFreezeX(ScmCharSet *src)
647 {
648     if (SCM_CHAR_SET_IMMUTABLE_P(src)) return SCM_OBJ(src);
649     if (SCM_CHAR_SET_LARGE_P(src)) {
650         ScmSize s;
651         uint32_t iv[2];
652         uint32_t *v = char_set_freeze_vec(src, iv, &s);
653         src->large.frozen.size = s;
654         if (s == 2) {
655             src->large.frozen.vec = src->large.frozen.ivec;
656             src->large.frozen.ivec[0] = iv[0];
657             src->large.frozen.ivec[1] = iv[1];
658         } else {
659             src->large.frozen.vec = v;
660         }
661     }
662     src->flags |= SCM_CHAR_SET_IMMUTABLE;
663     return SCM_OBJ(src);
664 }
665 
666 /*-----------------------------------------------------------------
667  * Comparison
668  */
charset_compare(ScmObj x,ScmObj y,int equalp)669 static int charset_compare(ScmObj x, ScmObj y, int equalp)
670 {
671     ScmCharSet *xx = SCM_CHAR_SET(x);
672     ScmCharSet *yy = SCM_CHAR_SET(y);
673 
674     if (equalp) {
675         return (Scm_CharSetEq(xx, yy)? 0 : 1);
676     } else {
677         if (Scm_CharSetEq(xx, yy)) return 0;
678         if (Scm_CharSetLE(xx, yy)) return -1;
679         if (Scm_CharSetLE(yy, xx)) return 1;
680         Scm_Error("cannot compare char-sets: %S vs %S", x, y);
681         return 0;               /* dummy */
682     }
683 }
684 
Scm_CharSetEq(ScmCharSet * x,ScmCharSet * y)685 int Scm_CharSetEq(ScmCharSet *x, ScmCharSet *y)
686 {
687     if (!Scm_BitsEqual(x->small, y->small, 0, SCM_CHAR_SET_SMALL_CHARS))
688         return FALSE;
689     if (!SCM_CHAR_SET_IMMUTABLE_P(x) && !SCM_CHAR_SET_IMMUTABLE_P(y)) {
690         /* shortcut */
691         return Scm_TreeCoreEq(&x->large.tree, &y->large.tree);
692     } else {
693         cs_iter xi, yi;
694         cs_iter_init(&xi, x);
695         cs_iter_init(&yi, y);
696         for (;;) {
697             ScmChar xl, xh, yl, yh;
698             int xr = cs_iter_next(&xi, &xl, &xh);
699             int yr = cs_iter_next(&yi, &yl, &yh);
700             if (xr == FALSE && yr == FALSE) return TRUE;
701             if (!(xr && yr)) return FALSE;
702             if (!(xl == yl && xh == yh)) return FALSE;
703         }
704     }
705 }
706 
707 /* See if cs contains the range [lo,hi] in large char range. */
cs_contains_range(ScmCharSet * s,ScmChar lo,ScmChar hi)708 static int cs_contains_range(ScmCharSet *s, ScmChar lo, ScmChar hi)
709 {
710     if (!SCM_CHAR_SET_LARGE_P(s)) return FALSE;
711     /* We can have two cases.
712      *
713      * Case 1:
714      *    lo<---------->hi
715      *    ye<----------------->
716      * Case 2:
717      *         lo<---------->hi
718      *    yl<------------------->
719      */
720     if (SCM_CHAR_SET_IMMUTABLE_P(s)) {
721         size_t ye, yl;
722         ye = Scm_BinarySearchU32(s->large.frozen.vec, s->large.frozen.size,
723                                  (uint32_t)lo, 1, &yl, NULL);
724         if (ye != (size_t)-1) { /* case 1 */
725             if (s->large.frozen.vec[ye+1] < (unsigned)hi) return FALSE;
726         } else if (yl != (size_t)-1) { /* case 2 */
727             if (s->large.frozen.vec[yl+1] < (unsigned)hi) return FALSE;
728         } else {
729             return FALSE;
730         }
731     } else {
732         ScmDictEntry *ye, *yl, *yh;
733         ye = Scm_TreeCoreClosestEntries(&s->large.tree, lo, &yl, &yh);
734         if (ye) {               /* case 1 */
735             if (ye->value < hi) return FALSE;
736         } else if (yl) {        /* case 2 */
737             if (yl->value < hi) return FALSE;
738         } else {
739             return FALSE;
740         }
741     }
742     return TRUE;
743 }
744 
745 /* whether x <= y */
Scm_CharSetLE(ScmCharSet * x,ScmCharSet * y)746 int Scm_CharSetLE(ScmCharSet *x, ScmCharSet *y)
747 {
748     if (!Scm_BitsIncludes(y->small, x->small, 0, SCM_CHAR_SET_SMALL_CHARS))
749         return FALSE;
750 
751     cs_iter xi;
752     cs_iter_init(&xi, x);
753     ScmChar lo, hi;
754     while (cs_iter_next(&xi, &lo, &hi)) {
755         if (!cs_contains_range(y, lo, hi)) return FALSE;
756     }
757     return TRUE;
758 }
759 
760 /*-----------------------------------------------------------------
761  * Modification
762  * We reject immutable set at the top, so that we only deal with treemap.
763  */
764 
Scm_CharSetAddRange(ScmCharSet * cs,ScmChar from,ScmChar to)765 ScmObj Scm_CharSetAddRange(ScmCharSet *cs, ScmChar from, ScmChar to)
766 {
767     check_mutable(cs);
768 
769     ScmDictEntry *e, *lo, *hi;
770 
771     if (to < from) return SCM_OBJ(cs);
772     if (from < SCM_CHAR_SET_SMALL_CHARS) {
773         if (to < SCM_CHAR_SET_SMALL_CHARS) {
774             Scm_BitsFill(cs->small, (int)from, (int)to+1, TRUE);
775             return SCM_OBJ(cs);
776         }
777         Scm_BitsFill(cs->small, (int)from, SCM_CHAR_SET_SMALL_CHARS, TRUE);
778         from = SCM_CHAR_SET_SMALL_CHARS;
779     }
780 
781     set_large(cs, TRUE);
782 
783     /* Let e have the lower bound. */
784     e = Scm_TreeCoreClosestEntries(&cs->large.tree, from, &lo, &hi);
785     if (!e) {
786         if (!lo || lo->value < from-1) {
787             e = Scm_TreeCoreSearch(&cs->large.tree, from, SCM_DICT_CREATE);
788         } else {
789             e = lo;
790         }
791     }
792     /* Set up the upper bound.
793        NB: if e is a new entry, e->value is 0. */
794     if (e->value >= to) return SCM_OBJ(cs);
795 
796     hi = e;
797     while ((hi = Scm_TreeCoreNextEntry(&cs->large.tree, hi->key)) != NULL) {
798         if (hi->key > to+1) {
799             e->value = to;
800             return SCM_OBJ(cs);
801         }
802         Scm_TreeCoreSearch(&cs->large.tree, hi->key, SCM_DICT_DELETE);
803         if (hi->value > to) {
804             e->value = hi->value;
805             return SCM_OBJ(cs);
806         }
807     }
808     e->value = to;
809     return SCM_OBJ(cs);
810 }
811 
Scm_CharSetAdd(ScmCharSet * dst,ScmCharSet * src)812 ScmObj Scm_CharSetAdd(ScmCharSet *dst, ScmCharSet *src)
813 {
814     check_mutable(dst);
815 
816     if (dst == src) return SCM_OBJ(dst);  /* precaution */
817 
818     if (SCM_CHAR_SET_LARGE_P(src)) {
819         set_large(dst, TRUE);
820     }
821 
822     ScmTreeIter iter;
823     ScmDictEntry *e;
824     Scm_BitsOperate(dst->small, SCM_BIT_IOR, dst->small, src->small,
825                     0, SCM_CHAR_SET_SMALL_CHARS);
826     if (SCM_CHAR_SET_IMMUTABLE_P(src)) {
827         ScmSize k;
828         for (k = 0; k < src->large.frozen.size; k += 2) {
829             Scm_CharSetAddRange(dst,
830                                 SCM_CHAR(src->large.frozen.vec[k]),
831                                 SCM_CHAR(src->large.frozen.vec[k+1]));
832         }
833     } else {
834         Scm_TreeIterInit(&iter, &src->large.tree, NULL);
835         while ((e = Scm_TreeIterNext(&iter)) != NULL) {
836             Scm_CharSetAddRange(dst, SCM_CHAR(e->key), SCM_CHAR(e->value));
837         }
838     }
839     return SCM_OBJ(dst);
840 }
841 
Scm_CharSetComplement(ScmCharSet * cs)842 ScmObj Scm_CharSetComplement(ScmCharSet *cs)
843 {
844     check_mutable(cs);
845 
846     ScmDictEntry *e, *n;
847 
848     Scm_BitsOperate(cs->small, SCM_BIT_NOT1, cs->small, NULL,
849                     0, SCM_CHAR_SET_SMALL_CHARS);
850     int last = SCM_CHAR_SET_SMALL_CHARS-1;
851     int largep = FALSE;
852     /* we can't use treeiter, since we modify the tree while traversing it. */
853     while ((e = Scm_TreeCoreNextEntry(&cs->large.tree, last)) != NULL) {
854         Scm_TreeCoreSearch(&cs->large.tree, e->key, SCM_DICT_DELETE);
855         if (last < e->key-1) {
856             n = Scm_TreeCoreSearch(&cs->large.tree, last+1, SCM_DICT_CREATE);
857             n->value = e->key-1;
858             largep = TRUE;
859         }
860         last = (int)e->value;
861     }
862     if (last < SCM_CHAR_MAX) {
863         n = Scm_TreeCoreSearch(&cs->large.tree, last+1, SCM_DICT_CREATE);
864         n->value = SCM_CHAR_MAX;
865         largep = TRUE;
866     }
867     set_large(cs, largep);
868     return SCM_OBJ(cs);
869 }
870 
871 /* Make CS case-insensitive. */
Scm_CharSetCaseFold(ScmCharSet * cs)872 ScmObj Scm_CharSetCaseFold(ScmCharSet *cs)
873 {
874     check_mutable(cs);
875 
876     for (int ch='a'; ch<='z'; ch++) {
877         if (MASK_ISSET(cs, ch) || MASK_ISSET(cs, (ch-('a'-'A')))) {
878             MASK_SET(cs, ch);
879             MASK_SET(cs, (ch-('a'-'A')));
880         }
881     }
882 
883     ScmTreeIter iter;
884     ScmDictEntry *e;
885     Scm_TreeIterInit(&iter, &cs->large.tree, NULL);
886     while ((e = Scm_TreeIterNext(&iter)) != NULL) {
887         for (ScmChar c = e->key; c <= e->value; c++) {
888             ScmChar uch = Scm_CharUpcase(c);
889             ScmChar lch = Scm_CharDowncase(c);
890             Scm_CharSetAddRange(cs, uch, uch);
891             Scm_CharSetAddRange(cs, lch, lch);
892         }
893     }
894     return SCM_OBJ(cs);
895 }
896 
897 /*-----------------------------------------------------------------
898  * Query
899  */
900 
Scm_CharSetContains(ScmCharSet * cs,ScmChar c)901 int Scm_CharSetContains(ScmCharSet *cs, ScmChar c)
902 {
903     if (c < 0) return FALSE;
904     if (c < SCM_CHAR_SET_SMALL_CHARS) return MASK_ISSET(cs, c);
905     else if (SCM_CHAR_SET_IMMUTABLE_P(cs)) {
906         if (cs->large.frozen.size == 2) {
907             /* shortcut */
908             return (c >= (ScmChar)cs->large.frozen.ivec[0]
909                     && c <= (ScmChar)cs->large.frozen.ivec[1]);
910         } else {
911             size_t lo;
912             size_t k = Scm_BinarySearchU32(cs->large.frozen.vec,
913                                            cs->large.frozen.size,
914                                            (uint32_t)c,
915                                            1, &lo, NULL);
916             if ((k != (size_t)-1)
917                 || (lo != (size_t)-1 && (unsigned)c <= cs->large.frozen.vec[lo+1]))
918                 return TRUE;
919             else
920                 return FALSE;
921         }
922     } else {
923         ScmDictEntry *e, *l, *h;
924         e = Scm_TreeCoreClosestEntries(&cs->large.tree, (int)c, &l, &h);
925         if (e || (l && l->value >= c)) return TRUE;
926         else return FALSE;
927     }
928 }
929 
930 /*-----------------------------------------------------------------
931  * Inspection
932  */
933 
934 /* returns a list of ranges contained in the charset */
Scm_CharSetRanges(ScmCharSet * cs)935 ScmObj Scm_CharSetRanges(ScmCharSet *cs)
936 {
937     ScmObj h = SCM_NIL, t = SCM_NIL;
938     int ind, begin = 0, prev = FALSE;
939 
940     for (ind = 0; ind < SCM_CHAR_SET_SMALL_CHARS; ind++) {
941         int bit = MASK_ISSET(cs, ind);
942         if (!prev && bit) begin = ind;
943         if (prev && !bit) {
944             ScmObj cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
945             SCM_APPEND1(h, t, cell);
946         }
947         prev = bit;
948     }
949     if (prev) {
950         ScmObj cell = Scm_Cons(SCM_MAKE_INT(begin), SCM_MAKE_INT(ind-1));
951         SCM_APPEND1(h, t, cell);
952     }
953 
954     cs_iter iter;
955     cs_iter_init(&iter, cs);
956     ScmChar lo, hi;
957     while (cs_iter_next(&iter, &lo, &hi)) {
958         ScmObj cell = Scm_Cons(SCM_MAKE_INT(lo), SCM_MAKE_INT(hi));
959         SCM_APPEND1(h, t, cell);
960     }
961     return h;
962 }
963 
Scm_CharSetDump(ScmCharSet * cs,ScmPort * port)964 void Scm_CharSetDump(ScmCharSet *cs, ScmPort *port)
965 {
966     Scm_Printf(port, "CharSet %p%s\nmask:",
967                cs,
968                SCM_CHAR_SET_IMMUTABLE_P(cs) ? " (frozen)" : "");
969     for (int i=0; i<SCM_BITS_NUM_WORDS(SCM_CHAR_SET_SMALL_CHARS); i++) {
970 #if SIZEOF_LONG == 4
971         Scm_Printf(port, "[%08lx]", cs->small[i]);
972 #else
973         Scm_Printf(port, "[%016lx]", cs->small[i]);
974 #endif
975     }
976     Scm_Printf(port, "\nranges:");
977     cs_iter iter;
978     cs_iter_init(&iter, cs);
979     ScmChar lo, hi;
980     while (cs_iter_next(&iter, &lo, &hi)) {
981         Scm_Printf(port, " %x-%x", lo, hi);
982     }
983     Scm_Printf(port, "\n");
984 }
985 
986 /*-----------------------------------------------------------------
987  * Reader
988  */
989 
990 /* Parse regexp-style character set specification (e.g. [a-zA-Z]).
991    Assumes the opening bracket is already read.
992    Always return a fresh charset, that can be modified afterwards.
993 
994    If the input syntax is invalid, either signals an error or returns
995    #f, depending error_p flag.
996 
997    If bracket_syntax is TRUE, the first closing bracket ']' in the
998    charset (except the complimenting caret) is taken as a literal
999    character, instead of terminating the charset.  It should be TRUE
1000    during reading the regexp syntax for compatibility to POSIX regexp.
1001 
1002    If complement_p is not NULL, the location get a boolean value of
1003    whether complement character (caret in the beginning) appeared or not.
1004    In that case, the returned charset is not complemented. */
1005 
1006 static ScmObj read_predef_charset(ScmPort*, int);
1007 
Scm_CharSetRead(ScmPort * input,int * complement_p,int error_p,int bracket_syntax)1008 ScmObj Scm_CharSetRead(ScmPort *input, int *complement_p,
1009                        int error_p, int bracket_syntax)
1010 {
1011 #define REAL_BEGIN 1
1012 #define CARET_BEGIN 2
1013     ScmCharSet *set = make_charset();
1014     int begin = REAL_BEGIN;
1015     int complement = FALSE;     /* Flag for the initial ^ */
1016     int inrange = FALSE;        /* The range '-' is being read */
1017     ScmChar lastchar = SCM_CHAR_INVALID; /* The char before '-' range */
1018     const char *prefetched = NULL; /* \x notation requires lookahead.  After
1019                                       it reads extra characters, this points
1020                                       to them.  The pointed string is guaranteed
1021                                       to have only hexadecimal characters. */
1022     ScmDString buf;             /* Save read characters for error message */
1023     Scm_DStringInit(&buf);
1024     int ch;
1025 
1026     for (;;) {
1027         if (prefetched) {
1028             ch = *prefetched++;
1029             if (*prefetched == '\0') prefetched = NULL;
1030         } else {
1031             ch = Scm_Getc(input);
1032             if (ch == EOF) goto err;
1033         }
1034 
1035         Scm_DStringPutc(&buf, ch);
1036 
1037         if (begin == REAL_BEGIN && ch == '^') {
1038             complement = TRUE;
1039             begin = CARET_BEGIN;
1040             continue;
1041         }
1042 
1043         ScmObj moreset;
1044         switch (ch) {
1045         case '^':
1046             if (begin == REAL_BEGIN) {
1047                 complement = TRUE;
1048                 begin = CARET_BEGIN;
1049                 continue;
1050             } else {
1051                 goto ordchar;
1052             }
1053         case ']':
1054             if (begin && bracket_syntax) goto ordchar;
1055             else break;
1056         case '-':
1057             if (begin || inrange) goto ordchar;
1058             inrange = TRUE;
1059             begin = FALSE;
1060             continue;
1061         case '\\':
1062             ch = Scm_Getc(input);
1063             if (ch == EOF) goto err;
1064             Scm_DStringPutc(&buf, ch);
1065             switch (ch) {
1066             case 'a': ch = 7; goto ordchar;
1067             case 'b': ch = 8; goto ordchar;
1068             case 'n': ch = '\n'; goto ordchar;
1069             case 'r': ch = '\r'; goto ordchar;
1070             case 't': ch = '\t'; goto ordchar;
1071             case 'f': ch = '\f'; goto ordchar;
1072             case 'e': ch = 0x1b; goto ordchar;
1073             case 'x': case 'u': case 'U': {
1074                 ScmDString xbuf;
1075                 Scm_DStringInit(&xbuf);
1076                 ScmObj mode = Scm_GetPortReaderLexicalMode(input);
1077                 ScmObj z = Scm_ReadXdigitsFromPort(input, ch, mode, FALSE, &xbuf);
1078                 if (SCM_STRINGP(z)) {
1079                     /* parse failure.  z contains the prefetched string */
1080                     Scm_DStringAdd(&buf, SCM_STRING(z));
1081                     goto err;
1082                 }
1083                 /* xbuf contains the character that was hex-encoded,
1084                    plus any hex digits that are prefetched. */
1085                 Scm_DStringPutc(&xbuf, '\0');
1086                 const char *cp = Scm_DStringPeek(&xbuf, NULL, NULL);
1087                 SCM_CHAR_GET(cp, ch);
1088                 cp += SCM_CHAR_NFOLLOWS(*cp)+1;
1089                 if (*cp != '\0') {
1090                     prefetched = cp;
1091                 }
1092                 goto ordchar;
1093             }
1094             case 'd':
1095                 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_DIGIT);
1096                 goto addset;
1097             case 'D':
1098                 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_DIGIT);
1099                 goto addset;
1100             case 's':
1101                 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WHITESPACE);
1102                 goto addset;
1103             case 'S':
1104                 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WHITESPACE);
1105                 goto addset;
1106             case 'w':
1107                 moreset = Scm_GetStandardCharSet(SCM_CHAR_SET_ASCII_WORD);
1108                 goto addset;
1109             case 'W':
1110                 moreset = Scm_GetStandardCharSet(-SCM_CHAR_SET_ASCII_WORD);
1111                 goto addset;
1112             case 'p': case 'P':
1113                 moreset = Scm_GetStandardCharSet(Scm_CharSetParseCategory(input, ch));
1114                 goto addset;
1115             default:
1116                 goto ordchar;
1117             }
1118         case '[':
1119             moreset = read_predef_charset(input, error_p);
1120             if (!SCM_CHAR_SET_P(moreset)) goto err;
1121         addset:
1122             Scm_CharSetAdd(set, SCM_CHAR_SET(moreset));
1123             begin = FALSE;
1124             continue;
1125         ordchar:
1126         default:
1127             if (inrange) {
1128                 if (lastchar < 0) {
1129                     Scm_CharSetAddRange(set, '-', '-');
1130                     Scm_CharSetAddRange(set, ch, ch);
1131                     lastchar = ch;
1132                 } else {
1133                     Scm_CharSetAddRange(set, lastchar, ch);
1134                     lastchar = -1;
1135                 }
1136                 inrange = FALSE;
1137             } else {
1138                 Scm_CharSetAddRange(set, ch, ch);
1139                 lastchar = ch;
1140             }
1141             begin = FALSE;
1142             continue;
1143         }
1144         break;
1145     }
1146 
1147     if (inrange) {
1148         Scm_CharSetAddRange(set, '-', '-');
1149         if (lastchar >= 0) Scm_CharSetAddRange(set, lastchar, lastchar);
1150     }
1151     if (complement_p) {
1152         *complement_p = complement;
1153         return SCM_OBJ(set);
1154     } else {
1155         if (complement) Scm_CharSetComplement(set);
1156         return SCM_OBJ(set);
1157     }
1158   err:
1159     if (error_p) {
1160         /* TODO: We should deal with the case when input contains \0 */
1161         Scm_Error("Invalid charset syntax [%s%s...",
1162                   complement? "^" : "",
1163                   Scm_DStringGetz(&buf));
1164     }
1165     return SCM_FALSE;
1166 }
1167 
1168 /* Predefined charset name table */
1169 struct predef_charset_posix_name_rec {
1170     const char *name;
1171     int cset;           /* default cset */
1172     int cset_unicode;   /* 'unicode' mode cset (:alnum: for unicode range)
1173                            not used yet. */
1174 };
1175 
1176 #define PREDEF_ENTRY(n, cs, csu) \
1177     { n, SCM_CPP_CAT(SCM_CHAR_SET_, cs), SCM_CPP_CAT(SCM_CHAR_SET_, csu) }
1178 
1179 static struct predef_charset_posix_name_rec predef_charset_posix_names[] = {
1180     PREDEF_ENTRY("alpha:", ASCII_LETTER, LETTER),
1181     PREDEF_ENTRY("alnum:", ASCII_LETTER_DIGIT, LETTER_DIGIT),
1182     PREDEF_ENTRY("blank:", ASCII_BLANK, BLANK),
1183     PREDEF_ENTRY("cntrl:", ASCII_CONTROL, ISO_CONTROL),
1184     PREDEF_ENTRY("digit:", ASCII_DIGIT, DIGIT),
1185     PREDEF_ENTRY("graph:", ASCII_GRAPHIC, GRAPHIC),
1186     PREDEF_ENTRY("lower:", ASCII_LOWER, LOWER),
1187     PREDEF_ENTRY("print:", ASCII_PRINTING, PRINTING),
1188     PREDEF_ENTRY("punct:", ASCII_PUNCTUATION, PUNCTUATION),
1189     PREDEF_ENTRY("space:", ASCII_WHITESPACE, WHITESPACE),
1190     PREDEF_ENTRY("upper:", ASCII_UPPER, UPPER),
1191     PREDEF_ENTRY("word:",  ASCII_WORD, WORD),
1192     PREDEF_ENTRY("xdigit:", HEX_DIGIT, HEX_DIGIT),
1193     PREDEF_ENTRY("ascii:", ASCII, ASCII), /* like Go */
1194 
1195     /* Gauche extension - explicitly unicode range */
1196     PREDEF_ENTRY("ALPHA:", LETTER, LETTER),
1197     PREDEF_ENTRY("ALNUM:", LETTER_DIGIT, LETTER_DIGIT),
1198     PREDEF_ENTRY("BLANK:", BLANK, BLANK),
1199     PREDEF_ENTRY("CNTRL:", ISO_CONTROL, ISO_CONTROL),
1200     PREDEF_ENTRY("DIGIT:", DIGIT, DIGIT),
1201     PREDEF_ENTRY("GRAPH:", GRAPHIC, GRAPHIC),
1202     PREDEF_ENTRY("LOWER:", LOWER, LOWER),
1203     PREDEF_ENTRY("PRINT:", PRINTING, PRINTING),
1204     PREDEF_ENTRY("PUNCT:", PUNCTUATION, PUNCTUATION),
1205     PREDEF_ENTRY("SPACE:", WHITESPACE, WHITESPACE),
1206     PREDEF_ENTRY("UPPER:", UPPER, UPPER),
1207     PREDEF_ENTRY("TITLE:", TITLE, TITLE),
1208     PREDEF_ENTRY("WORD:",  WORD, WORD),
1209     PREDEF_ENTRY("XDIGIT:", HEX_DIGIT, HEX_DIGIT),
1210 
1211     { NULL, 0, 0 }
1212 };
1213 
1214 /* Read posix [:alpha:] etc.  The first '[' is already read.
1215    Return #f on error if errorp is FALSE. */
read_predef_charset(ScmPort * input,int error_p)1216 static ScmObj read_predef_charset(ScmPort *input, int error_p)
1217 {
1218 #define MAX_CHARSET_NAME_LEN  11
1219     char name[MAX_CHARSET_NAME_LEN+1];
1220     int namecnt = 0;
1221     for (; namecnt < MAX_CHARSET_NAME_LEN; namecnt++) {
1222         int ch = Scm_Getc(input);
1223         if (ch == EOF && !SCM_CHAR_ASCII_P(ch)) {
1224             name[namecnt] = '\0';
1225             goto err;
1226         }
1227         if (ch == ']') break;
1228         name[namecnt] = (char)ch;
1229     }
1230     if (namecnt == MAX_CHARSET_NAME_LEN) goto err;
1231     name[namecnt] = '\0';
1232 
1233     int complement = FALSE;
1234     const char *start = name+1;
1235 
1236     if (*start == '^') {
1237         complement = TRUE;
1238         start++;
1239     }
1240 
1241     struct predef_charset_posix_name_rec *e = predef_charset_posix_names;
1242     while (e->name != NULL) {
1243         if (strcmp(start, e->name) == 0) {
1244             if (!complement) {
1245                 return Scm_GetStandardCharSet(e->cset);
1246             } else {
1247                 return Scm_GetStandardCharSet(-e->cset);
1248             }
1249         }
1250         e++;
1251     }
1252  err:
1253     /* here we got invalid charset name */
1254     if (error_p) {
1255         Scm_Error("invalid or unsupported POSIX charset '[%s]'", name);
1256     }
1257     return SCM_FALSE;
1258 }
1259 
1260 static struct predef_charset_category_name_rec {
1261     const char *cat;
1262     int cset;
1263 } predef_charset_category_name[] = {
1264     { "L",  SCM_CHAR_SET_L },
1265     { "LC", SCM_CHAR_SET_LC },
1266     { "Lu", SCM_CHAR_SET_Lu },
1267     { "Ll", SCM_CHAR_SET_Ll },
1268     { "Lt", SCM_CHAR_SET_Lt },
1269     { "Lm", SCM_CHAR_SET_Lm },
1270     { "M",  SCM_CHAR_SET_M },
1271     { "Mn", SCM_CHAR_SET_Mn },
1272     { "Mc", SCM_CHAR_SET_Mc },
1273     { "Me", SCM_CHAR_SET_Me },
1274     { "N",  SCM_CHAR_SET_N },
1275     { "Nd", SCM_CHAR_SET_Nd },
1276     { "Nl", SCM_CHAR_SET_Nl },
1277     { "No", SCM_CHAR_SET_No },
1278     { "P",  SCM_CHAR_SET_P },
1279     { "Pc", SCM_CHAR_SET_Pc },
1280     { "Pd", SCM_CHAR_SET_Pd },
1281     { "Ps", SCM_CHAR_SET_Ps },
1282     { "Pe", SCM_CHAR_SET_Pe },
1283     { "Pi", SCM_CHAR_SET_Pi },
1284     { "Pf", SCM_CHAR_SET_Pf },
1285     { "Po", SCM_CHAR_SET_Po },
1286     { "S",  SCM_CHAR_SET_S },
1287     { "Sm", SCM_CHAR_SET_Sm },
1288     { "Sc", SCM_CHAR_SET_Sc },
1289     { "Sk", SCM_CHAR_SET_Sk },
1290     { "So", SCM_CHAR_SET_So },
1291     { "Z",  SCM_CHAR_SET_Z },
1292     { "Zs", SCM_CHAR_SET_Zs },
1293     { "Zl", SCM_CHAR_SET_Zl },
1294     { "Zp", SCM_CHAR_SET_Zp },
1295     { "C",  SCM_CHAR_SET_C },
1296     { "Cc", SCM_CHAR_SET_Cc },
1297     { "Cf", SCM_CHAR_SET_Cf },
1298     { "Cs", SCM_CHAR_SET_Cs },
1299     { "Co", SCM_CHAR_SET_Co },
1300     { "Cn", SCM_CHAR_SET_Cn },
1301     { NULL, 0 }
1302 };
1303 
1304 /* Read \p{Category}, \P{Category}.   INPUT must point right after 'p' or
1305    'P'.  KEY is either 'p' or 'P'.  On successful reading,  Returns the
1306    charset number and update *cp to point right after the syntax.
1307    Otherwise, throws an error.
1308 */
Scm_CharSetParseCategory(ScmPort * input,char key)1309 int Scm_CharSetParseCategory(ScmPort *input, char key)
1310 {
1311     int ch = Scm_Getc(input);
1312     if (ch != '{') {
1313         Scm_Error("\\%c must followed by '{'", key);
1314     }
1315     char name[3];
1316 
1317     ch = Scm_Getc(input);
1318     if (ch == EOF || !SCM_CHAR_ASCII_P(ch)) {
1319         name[0] = '\0';
1320         goto bad;
1321     }
1322     name[0] = (char)ch;
1323 
1324     ch = Scm_Getc(input);
1325     if (ch == EOF || !SCM_CHAR_ASCII_P(ch)) {
1326         name[1] = '\0';
1327         goto bad;
1328     }
1329     if (ch == '}') {
1330         name[1] = '\0';
1331     } else {
1332         name[1] = (char)ch;
1333         ch = Scm_Getc(input);
1334         name[2] = '\0';
1335         if (ch != '}') {
1336             goto bad;
1337         }
1338     }
1339 
1340     for (int j=0; predef_charset_category_name[j].cat; j++) {
1341         if (strcmp(name, predef_charset_category_name[j].cat) == 0) {
1342             if (key == 'p') {
1343                 return predef_charset_category_name[j].cset;
1344             } else {
1345                 return -predef_charset_category_name[j].cset;
1346             }
1347         }
1348     }
1349  bad:
1350     Scm_Error("Bad charset category name near \\%c{%s...", key, name);
1351     return 0;                   /* dummy */
1352 }
1353 
1354 
1355 /*-----------------------------------------------------------------
1356  * Character attributes
1357  */
1358 
Scm_CharGeneralCategory(ScmChar ch)1359 int Scm_CharGeneralCategory(ScmChar ch)
1360 {
1361     return (int)(Scm__LookupCharCategory(ch) & SCM_CHAR_CATEGORY_MASK);
1362 }
1363 
Scm_CharAlphabeticP(ScmChar ch)1364 int Scm_CharAlphabeticP(ScmChar ch)
1365 {
1366     return (SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch)) != 0;
1367 }
1368 
Scm_CharUppercaseP(ScmChar ch)1369 int Scm_CharUppercaseP(ScmChar ch)
1370 {
1371     return ((SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch))
1372             == SCM_CHAR_UPPERCASE_BITS);
1373 }
1374 
Scm_CharLowercaseP(ScmChar ch)1375 int Scm_CharLowercaseP(ScmChar ch)
1376 {
1377     return ((SCM_CHAR_ALPHA_MASK & Scm__LookupCharCategory(ch))
1378             == SCM_CHAR_LOWERCASE_BITS);
1379 }
1380 
Scm_CharTitlecaseP(ScmChar ch)1381 int Scm_CharTitlecaseP(ScmChar ch)
1382 {
1383     return (Scm_CharGeneralCategory(ch) == SCM_CHAR_CATEGORY_Lt);
1384 }
1385 
Scm_CharNumericP(ScmChar ch)1386 int Scm_CharNumericP(ScmChar ch)
1387 {
1388     return (Scm_CharGeneralCategory(ch) == SCM_CHAR_CATEGORY_Nd);
1389 }
1390 
1391 /* An internal entry to extract case mapping info.
1392  * Internal table is compressed, so the caller must provide
1393  * the buffer for ScmCharCaseMap.
1394  * The function returns either the pointer to the given buffer
1395  * with information filled, or a pointer to a static read-only
1396  * data structure in the internal table.
1397  */
1398 static const ScmCharCaseMap casemap_identity = {
1399     0, 0, 0, {-1}, {-1}, {-1}
1400 };
1401 
Scm__CharCaseMap(ScmChar ch,ScmCharCaseMap * buf,int full)1402 const ScmCharCaseMap *Scm__CharCaseMap(ScmChar ch,
1403                                        ScmCharCaseMap *buf,
1404                                        int full)
1405 {
1406     if (ch < 0x10000) {
1407         int subtable = casemap_000[(ch >> 8) & 0xff];
1408         if (subtable == 255) return &casemap_identity;
1409 
1410         unsigned short cmap =
1411                 casemap_subtable[subtable][(unsigned char)(ch & 0xff)];
1412         if (cmap == SCM_CHAR_NO_CASE_MAPPING) return &casemap_identity;
1413         if (cmap & 0x8000) {
1414             /* mapping is extended. */
1415             return &(extended_casemaps[cmap & 0x7fff]);
1416         } else {
1417             /* mapping is simple */
1418             int off = (cmap & 0x2000)? (signed int)(cmap|~0x1fff) : cmap&0x1fff;
1419             if (cmap & 0x4000) {
1420                 buf->to_upper_simple = off;
1421                 buf->to_lower_simple = 0;
1422                 buf->to_title_simple = off;
1423             } else {
1424                 buf->to_upper_simple = 0;
1425                 buf->to_lower_simple = off;
1426                 buf->to_title_simple = 0;
1427             }
1428             if (full) {
1429                 /* indicate no special mappings */
1430                 buf->to_upper_full[0] = -1;
1431                 buf->to_lower_full[0] = -1;
1432                 buf->to_title_full[0] = -1;
1433             }
1434             return buf;
1435         }
1436     } else {
1437         /* TODO: 104xx*/
1438         return &casemap_identity;
1439     }
1440 }
1441 
1442 /*
1443  * Case conversion API.  For the time being, CharCaseMap works on Unicode
1444  * codepoints, so we have to convert from/to ScmChar if the internal encoding
1445  * is either EUC-JP or SJIS.
1446  */
1447 #define SIMPLE_CASE(code, buf, field) \
1448     (ScmChar)((code) + Scm__CharCaseMap((code), (buf), FALSE)->SCM_CPP_CAT3(to_, field, _simple))
1449 
1450 #define SIMPLE_CASE_CV(code, buf, field)    \
1451     ((code) = (ScmChar)Scm_CharToUcs((int)(code)), \
1452      (code) = SIMPLE_CASE(code, buf, field),       \
1453      Scm_UcsToChar((int)(code)))
1454 
Scm_CharUpcase(ScmChar ch)1455 ScmChar Scm_CharUpcase(ScmChar ch)
1456 {
1457     ScmCharCaseMap cm;
1458 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1459     if (ch < 0x80) return SIMPLE_CASE(ch, &cm, upper);
1460     else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, upper);
1461     else           return ch;
1462 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1463     return SIMPLE_CASE(ch, &cm, upper);
1464 #else
1465     /* Latin-1 mapping and Unicode mapping differ in U+00B5 (MICRO SIGN)
1466        and U+00FF (LATIN SMALL LETTER Y WITH DIAERESIS).  In Unicode
1467        they map to U+039C and U+0178, respectively.  In Latin-1 we don't
1468        have those characters, so we leave them alone. */
1469     if (ch == 0xb5 || ch == 0xff) return ch;
1470     else return SIMPLE_CASE(ch, &cm, upper);
1471 #endif
1472 }
1473 
Scm_CharDowncase(ScmChar ch)1474 ScmChar Scm_CharDowncase(ScmChar ch)
1475 {
1476     ScmCharCaseMap cm;
1477 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1478     if (ch < 0x80) return SIMPLE_CASE(ch, &cm, lower);
1479     else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, lower);
1480     else           return ch;
1481 #else
1482     return SIMPLE_CASE(ch, &cm, lower);
1483 #endif
1484 }
1485 
Scm_CharTitlecase(ScmChar ch)1486 ScmChar Scm_CharTitlecase(ScmChar ch)
1487 {
1488     ScmCharCaseMap cm;
1489 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1490     if (ch < 0x80) return SIMPLE_CASE(ch, &cm, title);
1491     else if (Scm__CharInUnicodeP(ch)) return SIMPLE_CASE_CV(ch, &cm, title);
1492     else           return ch;
1493 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1494     return SIMPLE_CASE(ch, &cm, title);
1495 #else
1496     /* In Latin-1, titlecase is the same as upcase. */
1497     return Scm_CharUpcase(ch);
1498 #endif
1499 }
1500 
Scm_CharFoldcase(ScmChar ch)1501 ScmChar Scm_CharFoldcase(ScmChar ch)
1502 {
1503     ScmCharCaseMap cm;
1504 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP) || defined(GAUCHE_CHAR_ENCODING_SJIS)
1505     if (Scm__CharInUnicodeP(ch)) {
1506         ScmChar ucs = (ScmChar)Scm_CharToUcs(ch);
1507         const ScmCharCaseMap *pcm = Scm__CharCaseMap(ucs, &cm, FALSE);
1508         if (pcm->to_lower_simple == 0 && pcm->to_upper_simple == 0) {
1509             /* we don't have case folding */
1510             return ch;
1511         }
1512         /* Otherwise, we do (char-downcase (char-upcase ch)) */
1513         if (pcm->to_upper_simple != 0) {
1514             ucs += pcm->to_upper_simple;
1515             pcm = Scm__CharCaseMap(ucs, &cm, FALSE);
1516         }
1517         return Scm_UcsToChar((int)(ucs + pcm->to_lower_simple));
1518     } else {
1519         return ch;
1520     }
1521 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1522     if (ch == 0x130 || ch == 0x131) {
1523         /* char-foldcase is identity for
1524            U+0130 Turkish I (LATIN CAPITAL LETTER I WITH DOT ABOVE) and
1525            U+0131 Turkish i (LATIN SMALL LETTER DOTLESS I) */
1526         return ch;
1527     }
1528     const ScmCharCaseMap *pcm = Scm__CharCaseMap(ch, &cm, FALSE);
1529     if (pcm->to_lower_simple == 0 && pcm->to_upper_simple == 0) {
1530         /* we don't have case folding */
1531         return ch;
1532     }
1533     /* Otherwise, we do (char-downcase (char-upcase ch)) */
1534     if (pcm->to_upper_simple != 0) {
1535         ch += pcm->to_upper_simple;
1536         pcm = Scm__CharCaseMap(ch, &cm, FALSE);
1537     }
1538     return ch + pcm->to_lower_simple;
1539 #else
1540     /* In Latin-1 range, foldcase is the same as donwcase. */
1541     return SIMPLE_CASE(ch, &cm, lower);
1542 #endif
1543 }
1544 
1545 /*-----------------------------------------------------------------
1546  * Pre-defined charset
1547  */
1548 
1549 /* Most predefined charset are pre-generated as static immutable data.
1550    See gen-unicode.scm for the generation code. */
1551 
Scm_GetStandardCharSet(int id)1552 ScmObj Scm_GetStandardCharSet(int id)
1553 {
1554     if (id == 0
1555         || id >= SCM_CHAR_SET_NUM_PREDEFINED_SETS
1556         || id <= -SCM_CHAR_SET_NUM_PREDEFINED_SETS) {
1557         Scm_Error("bad id for predefined charset index: %d", id);
1558     }
1559 
1560     if (id > 0) {
1561         return predef_sets[id];
1562     } else {
1563         if (!SCM_CHAR_SET_P(predef_sets_complement[-id])) {
1564             ScmObj cs = Scm_CharSetCopy(SCM_CHAR_SET(predef_sets[-id]));
1565             cs = Scm_CharSetComplement(SCM_CHAR_SET(cs));
1566             Scm_CharSetFreezeX(SCM_CHAR_SET(cs));
1567             predef_sets_complement[-id] = cs;
1568         }
1569         return predef_sets_complement[-id];
1570     }
1571 }
1572 
Scm__InitChar(void)1573 void Scm__InitChar(void)
1574 {
1575     ScmModule *mod = Scm_GaucheModule();
1576 
1577     init_predefined_charsets();
1578     predef_sets[SCM_CHAR_SET_FULL] = Scm_CharSetComplement(make_charset());
1579 
1580 #define DEFCS(name, id) \
1581     Scm_Define(mod, SCM_SYMBOL(SCM_INTERN("char-set:" name)), predef_sets[SCM_CPP_CAT(SCM_CHAR_SET_, id)])
1582 
1583     DEFCS("L",  L);
1584     DEFCS("LC", LC);
1585     DEFCS("Lu", Lu);
1586     DEFCS("Ll", Ll);
1587     DEFCS("Lt", Lt);
1588     DEFCS("Lm", Lm);
1589     DEFCS("Lo", Lo);
1590     DEFCS("M",  M);
1591     DEFCS("Mn", Mn);
1592     DEFCS("Mc", Mc);
1593     DEFCS("Me", Me);
1594     DEFCS("N",  N);
1595     DEFCS("Nd", Nd);
1596     DEFCS("Nl", Nl);
1597     DEFCS("No", No);
1598     DEFCS("P",  P);
1599     DEFCS("Pc", Pc);
1600     DEFCS("Pd", Pd);
1601     DEFCS("Ps", Ps);
1602     DEFCS("Pe", Pe);
1603     DEFCS("Pi", Pi);
1604     DEFCS("Pf", Pf);
1605     DEFCS("Po", Po);
1606     DEFCS("S",  S);
1607     DEFCS("Sm", Sm);
1608     DEFCS("Sc", Sc);
1609     DEFCS("Sk", Sk);
1610     DEFCS("So", So);
1611     DEFCS("Z",  Z);
1612     DEFCS("Zs", Zs);
1613     DEFCS("Zl", Zl);
1614     DEFCS("Zp", Zp);
1615     DEFCS("C",  C);
1616     DEFCS("Cc", Cc);
1617     DEFCS("Cf", Cf);
1618     DEFCS("Cs", Cs);
1619     DEFCS("Co", Co);
1620     DEFCS("Cn", Cn);
1621 
1622     DEFCS("lower-case", LOWER);
1623     DEFCS("ascii-lower-case", ASCII_LOWER);
1624     DEFCS("upper-case", UPPER);
1625     DEFCS("ascii-upper-case", ASCII_UPPER);
1626     DEFCS("title-case", TITLE);
1627     DEFCS("letter", LETTER);
1628     DEFCS("ascii-letter", ASCII_LETTER);
1629     DEFCS("digit", DIGIT);
1630     DEFCS("ascii-digit", ASCII_DIGIT);
1631     DEFCS("letter+digit", LETTER_DIGIT);
1632     DEFCS("ascii-letter+digit", ASCII_LETTER_DIGIT);
1633     DEFCS("graphic", GRAPHIC);
1634     DEFCS("ascii-graphic", ASCII_GRAPHIC);
1635     DEFCS("printing", PRINTING);
1636     DEFCS("ascii-printing", ASCII_PRINTING);
1637     DEFCS("whitespace", WHITESPACE);
1638     DEFCS("ascii-whitespace", ASCII_WHITESPACE);
1639     DEFCS("iso-control", ISO_CONTROL);
1640     DEFCS("ascii-control", ASCII_CONTROL);
1641     DEFCS("punctuation", PUNCTUATION);
1642     DEFCS("ascii-punctuation", ASCII_PUNCTUATION);
1643     DEFCS("symbol", SYMBOL);
1644     DEFCS("ascii-symbol", ASCII_SYMBOL);
1645     DEFCS("hex-digit", HEX_DIGIT);
1646     DEFCS("blank", BLANK);
1647     DEFCS("ascii-blank", ASCII_BLANK);
1648     DEFCS("ascii", ASCII);
1649     DEFCS("word", WORD);
1650     DEFCS("ascii-word", ASCII_WORD);
1651     DEFCS("empty", EMPTY);
1652     DEFCS("full", FULL);
1653 
1654     /* We initialize complement charset on demand, except EMPTY and FULL. */
1655     for (int i=0; i<SCM_CHAR_SET_NUM_PREDEFINED_SETS; i++) {
1656         predef_sets_complement[i] = SCM_FALSE;
1657     }
1658     predef_sets_complement[SCM_CHAR_SET_EMPTY]
1659         = predef_sets_complement[SCM_CHAR_SET_FULL];
1660     predef_sets_complement[SCM_CHAR_SET_FULL]
1661         = predef_sets_complement[SCM_CHAR_SET_EMPTY];
1662 
1663     /* Expose internal charset */
1664 #if defined(GAUCHE_CHAR_ENCODING_EUC_JP)
1665     Scm_AddFeature("gauche.ces.eucjp", NULL);
1666 #elif defined(GAUCHE_CHAR_ENCODING_SJIS)
1667     Scm_AddFeature("gauche.ces.sjis", NULL);
1668 #elif defined(GAUCHE_CHAR_ENCODING_UTF_8)
1669     Scm_AddFeature("gauche.ces.utf8", NULL);
1670 #else
1671     Scm_AddFeature("gauche.ces.none", NULL);
1672 #endif
1673 }
1674