1 /*
2  * char_attr.h - Various character attributes
3  *
4  *   Copyright (c) 2011-2020  Shiro Kawai  <shiro@acm.org>
5  *
6  *   Redistribution and use in source and binary forms, with or without
7  *   modification, are permitted provided that the following conditions
8  *   are met:
9  *
10  *   1. Redistributions of source code must retain the above copyright
11  *      notice, this list of conditions and the following disclaimer.
12  *
13  *   2. Redistributions in binary form must reproduce the above copyright
14  *      notice, this list of conditions and the following disclaimer in the
15  *      documentation and/or other materials provided with the distribution.
16  *
17  *   3. Neither the name of the authors nor the names of its contributors
18  *      may be used to endorse or promote products derived from this
19  *      software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
27  *   TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  *   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  *   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  *   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  *   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
/* This file is not part of the public Gauche C API.  It should be
   included only by Gauche's internal code. */
36 #ifndef GAUCHE_CHAR_ATTR_H
37 #define GAUCHE_CHAR_ATTR_H
38 
/* Unicode general categories.
   Each enumerator is named after the two-letter category abbreviation.
   The numeric values are spelled out so the 0..29 range is visible at a
   glance. */
enum {
    /* L - letters */
    SCM_CHAR_CATEGORY_Lu = 0,   /* Uppercase_Letter */
    SCM_CHAR_CATEGORY_Ll = 1,   /* Lowercase_Letter */
    SCM_CHAR_CATEGORY_Lt = 2,   /* Titlecase_Letter */
    SCM_CHAR_CATEGORY_Lm = 3,   /* Modifier_Letter */
    SCM_CHAR_CATEGORY_Lo = 4,   /* Other_Letter */

    /* M - marks */
    SCM_CHAR_CATEGORY_Mn = 5,   /* Nonspacing_Mark */
    SCM_CHAR_CATEGORY_Mc = 6,   /* Spacing_Mark */
    SCM_CHAR_CATEGORY_Me = 7,   /* Enclosing_Mark */

    /* N - numbers */
    SCM_CHAR_CATEGORY_Nd = 8,   /* Decimal_Number */
    SCM_CHAR_CATEGORY_Nl = 9,   /* Letter_Number */
    SCM_CHAR_CATEGORY_No = 10,  /* Other_Number */

    /* P - punctuation */
    SCM_CHAR_CATEGORY_Pc = 11,  /* Connector_Punctuation */
    SCM_CHAR_CATEGORY_Pd = 12,  /* Dash_Punctuation */
    SCM_CHAR_CATEGORY_Ps = 13,  /* Open_Punctuation */
    SCM_CHAR_CATEGORY_Pe = 14,  /* Close_Punctuation */
    SCM_CHAR_CATEGORY_Pi = 15,  /* Initial_Punctuation */
    SCM_CHAR_CATEGORY_Pf = 16,  /* Final_Punctuation */
    SCM_CHAR_CATEGORY_Po = 17,  /* Other_Punctuation */

    /* S - symbols */
    SCM_CHAR_CATEGORY_Sm = 18,  /* Math_Symbol */
    SCM_CHAR_CATEGORY_Sc = 19,  /* Currency_Symbol */
    SCM_CHAR_CATEGORY_Sk = 20,  /* Modifier_Symbol */
    SCM_CHAR_CATEGORY_So = 21,  /* Other_Symbol */

    /* Z - separators */
    SCM_CHAR_CATEGORY_Zs = 22,  /* Space_Separator */
    SCM_CHAR_CATEGORY_Zl = 23,  /* Line_Separator */
    SCM_CHAR_CATEGORY_Zp = 24,  /* Paragraph_Separator */

    /* C - other */
    SCM_CHAR_CATEGORY_Cc = 25,  /* Control */
    SCM_CHAR_CATEGORY_Cf = 26,  /* Format */
    SCM_CHAR_CATEGORY_Cs = 27,  /* Surrogate */
    SCM_CHAR_CATEGORY_Co = 28,  /* Private_Use */
    SCM_CHAR_CATEGORY_Cn = 29   /* Unassigned */
};
72 
/* Mask covering the low five bits.  The SCM_CHAR_CATEGORY_* values run
   from 0 to 29, so every category fits under this mask. */
#define SCM_CHAR_CATEGORY_MASK  ((1 << 5) - 1)
74 
/* The two most significant bits of a category byte carry the
   alphabetic/case flags:

     bits 7..6
        00      non-alphabetic char
        01      lowercase alphabetic char
        10      uppercase alphabetic char
        11      caseless or titlecase alphabetic char

   Each macro below is the two-bit pattern shifted into bits 7..6. */
#define SCM_CHAR_ALPHA_MASK      (3u << 6)
#define SCM_CHAR_ALPHABETIC_BITS (3u << 6)
#define SCM_CHAR_UPPERCASE_BITS  (2u << 6)
#define SCM_CHAR_LOWERCASE_BITS  (1u << 6)
85 
86 /* Case mappings */
87 
/* Number of slots reserved for one full case mapping.  As of Unicode 6.0
   the longest full case mapping is 3 characters; one spare slot is added
   so a future extension does not have to change the ABI. */
#define SCM_CHAR_FULL_CASE_MAPPING_SIZE (3 + 1)
91 
/* Case-mapping record for a single character.

   The to_*_simple members are signed offsets: per their comments, adding
   one to a character yields its simple upper/lower/title case mapping.
   The to_*_full members hold the full case mappings, one array of
   SCM_CHAR_FULL_CASE_MAPPING_SIZE characters per target case.
   NOTE(review): how a full mapping shorter than the array is terminated
   or padded is not visible in this header -- confirm in the
   implementation before relying on it. */
typedef struct {
    int to_upper_simple;  /* offset to add to produce uppercase */
    int to_lower_simple;  /* offset to add to produce lowercase */
    int to_title_simple;  /* offset to add to produce titlecase */
    ScmChar to_upper_full[SCM_CHAR_FULL_CASE_MAPPING_SIZE]; /* full uppercase mapping */
    ScmChar to_lower_full[SCM_CHAR_FULL_CASE_MAPPING_SIZE]; /* full lowercase mapping */
    ScmChar to_title_full[SCM_CHAR_FULL_CASE_MAPPING_SIZE]; /* full titlecase mapping */
} ScmCharCaseMap;
100 
/* Internal function to access case map info.

   Takes the character CH, a caller-supplied ScmCharCaseMap BUF and an
   integer flag FULL, and returns a pointer to a read-only ScmCharCaseMap.
   NOTE(review): presumably BUF is scratch storage the returned pointer
   may refer to, and FULL selects whether the to_*_full arrays are filled
   in -- neither is visible from this declaration; confirm against the
   definition. */
SCM_EXTERN const ScmCharCaseMap *Scm__CharCaseMap(ScmChar ch,
                                                  ScmCharCaseMap *buf,
                                                  int full);
105 
/* Casemap entry values.

   The three constructor macros produce an unsigned value that fits in 16
   bits.  The low 14 bits carry OFF, truncated, so a negative OFF is kept
   as its 14-bit two's complement.  Bits 15..14 tag the kind of entry:

       00  SCM_CHAR_CASEMAP_TOLOWER
       01  SCM_CHAR_CASEMAP_TOUPPER
       10  SCM_CHAR_CASEMAP_EXTENDED

   SCM_CHAR_NO_CASE_MAPPING has all 16 bits set, so none of the three
   constructors can produce it.
   NOTE(review): what OFF refers to for an EXTENDED entry is not visible
   in this header -- confirm where the entries are consumed. */
#define SCM_CHAR_NO_CASE_MAPPING 0xffff
#define SCM_CHAR_CASEMAP_TOLOWER(off)  ((unsigned int)(off) & 0x3fffu)
#define SCM_CHAR_CASEMAP_TOUPPER(off)  (((unsigned int)(off) & 0x3fffu) | (1u << 14))
#define SCM_CHAR_CASEMAP_EXTENDED(off) (((unsigned int)(off) & 0x3fffu) | (1u << 15))
111 
112 
113 #endif /*GAUCHE_CHAR_ATTR_H*/
114