1 /************************************************************************/
2 /*									*/
3 /*  UCD General Categories.						*/
4 /*									*/
5 /************************************************************************/
6 
/**
 * The major classes of the general categories.
 *
 * These numbers exist to build the UCDgc_?_MASK values: They are shifted
 * left by UCDgc_X_SHIFT bits to make the offset of a class.
 * Do not directly refer to these values.
 */
typedef enum UCDGeneralCategoryGroup
    {
    UCDgc_L=	1,	/*  Letter		*/
    UCDgc_M=	2,	/*  Mark		*/
    UCDgc_N=	3,	/*  Number		*/
    UCDgc_P=	4,	/*  Punctuation		*/
    UCDgc_S=	5,	/*  Symbol		*/
    UCDgc_Z=	6,	/*  Separator		*/
    UCDgc_C=	7,	/*  Other		*/

			/*  As the numbering starts at 1, this	*/
			/*  is one more than the highest class	*/
			/*  value: NOT the number of classes.	*/
    UCDgc__COUNT= 8
    } UCDGeneralCategoryGroup;
22 
/**
 * Bit layout of the UCDGeneralCategory values:
 *
 * UCDgc_X_SHIFT:  The number of bits that a class number from
 *		   UCDGeneralCategoryGroup is shifted to the left to make
 *		   the offset of the class.
 * UCDgc_X_MASK:   Selects the three bits that hold the class number.
 *		   (Value 0xe0)
 * UCDgc_XX_MASK:  The class bits plus the additional 0x10 bit that is
 *		   set in UCDgc_LC_MASK. (Value 0xf0)
 */
# define	UCDgc_X_SHIFT	5
# define	UCDgc_X_MASK	( 0x07 << UCDgc_X_SHIFT )
# define	UCDgc_XX_MASK	( UCDgc_X_MASK | 0x10 )
26 
27 typedef enum UCDGeneralCategory
28     {
29 	    /**********
30 	     * Offset for the letters. Do not directly refer to this value.
31 	     * Note that the order is different from that in the Unicode
32 	     * standard to reserve a bit for the LC mask.
33 	     */
34     UCDgc_L_MASK= ( UCDgc_L << UCDgc_X_SHIFT ),
35 	    /**
36 	     * Modifier_Letter: a modifier letter
37 	     */
38     UCDgc_Lm,
39 	    /**
40 	     * Other_Letter: other letters, including syllables and ideographs
41 	     */
42     UCDgc_Lo,
43 	    /**********
44 	     * Offset for the cased letters. Do not directly refer to
45 	     * this value.
46 	     */
47     UCDgc_LC_MASK= ( ( UCDgc_L << UCDgc_X_SHIFT ) | 0x10 ),
48 	    /**
49 	     * Uppercase_Letter: an uppercase letter
50 	     */
51     UCDgc_Lu,
52 	    /**
53 	     * Lowercase_Letter: a lowercase letter
54 	     */
55     UCDgc_Ll,
56 	    /**
57 	     * Titlecase_Letter:
58 	     * a digraphic character, with first part uppercase
59 	     */
60     UCDgc_Lt,
61 
62 	    /**********
63 	     * Offset for the marks. Do not directly refer to this value.
64 	     */
65     UCDgc_M_MASK= ( UCDgc_M << UCDgc_X_SHIFT ),
66 	    /**
67 	     * Nonspacing_Mark: a nonspacing combining mark (zero advance width)
68 	     */
69     UCDgc_Mn,
70 	    /**
71 	     * Spacing_Mark: a spacing combining mark (positive advance width)
72 	     */
73     UCDgc_Mc,
74 	    /**
75 	     * Enclosing_Mark: an enclosing combining mark
76 	     */
77     UCDgc_Me,
78 
79 	    /**********
80 	     * Offset for the numbers. Do not directly refer to this value.
81 	     */
82     UCDgc_N_MASK= ( UCDgc_N << UCDgc_X_SHIFT ),
83 	    /**
84 	     * Decimal_Number: a decimal digit
85 	     */
86     UCDgc_Nd,
87 	    /**
88 	     * Letter_Number: a letterlike numeric character
89 	     */
90     UCDgc_Nl,
91 	    /**
92 	     * Other_Number: a numeric character of other type
93 	     */
94     UCDgc_No,
95 
96 	    /**********
97 	     * Offset for the punctuation. Do not directly refer to this value.
98 	     */
99     UCDgc_P_MASK= ( UCDgc_P << UCDgc_X_SHIFT ),
100 	    /**
101 	     * Connector_Punctuation: a connecting punctuation mark, like a tie
102 	     */
103     UCDgc_Pc,
104 	    /**
105 	     * Dash_Punctuation: a dash or hyphen punctuation mark
106 	     */
107     UCDgc_Pd,
108 	    /**
109 	     * Open_Punctuation: an opening punctuation mark (of a pair)
110 	     */
111     UCDgc_Ps,
112 	    /**
113 	     * Close_Punctuation: a closing punctuation mark (of a pair)
114 	     */
115     UCDgc_Pe,
116 	    /**
117 	     * Initial_Punctuation: an initial quotation mark
118 	     */
119     UCDgc_Pi,
120 	    /**
121 	     * Final_Punctuation: a final quotation mark
122 	     */
123     UCDgc_Pf,
124 	    /**
125 	     * Other_Punctuation: a punctuation mark of other type
126 	     */
127     UCDgc_Po,
128 
129 	    /**********
130 	     * Offset for the signs. Do not directly refer to this value.
131 	     */
132     UCDgc_S_MASK= ( UCDgc_S << UCDgc_X_SHIFT ),
133 	    /**
134 	     * Math_Symbol: a symbol of mathematical use
135 	     */
136     UCDgc_Sm,
137 	    /**
138 	     * Currency_Symbol: a currency sign
139 	     */
140     UCDgc_Sc,
141 	    /**
142 	     * Modifier_Symbol: a non-letterlike modifier symbol
143 	     */
144     UCDgc_Sk,
145 	    /**
146 	     * Other_Symbol: a symbol of other type
147 	     */
148     UCDgc_So,
149 
150 	    /**********
151 	     * Offset for the spaces. Do not directly refer to this value.
152 	     */
153     UCDgc_Z_MASK= ( UCDgc_Z << UCDgc_X_SHIFT ),
154 	    /**
155 	     * Space_Separator: a space character (of various non-zero widths)
156 	     */
157     UCDgc_Zs,
158 	    /**
159 	     * Line_Separator: U+2028 LINE SEPARATOR only
160 	     */
161     UCDgc_Zl,
162 	    /**
163 	     * Paragraph_Separator: U+2029 PARAGRAPH SEPARATOR only
164 	     */
165     UCDgc_Zp,
166 
167 	    /**********
168 	     * Offset for the other categories. Do not directly refer to
169 	     * this value.
170 	     */
171     UCDgc_C_MASK= ( UCDgc_C << UCDgc_X_SHIFT ),
172 	    /**
173 	     * Control: a C0 or C1 control code
174 	     */
175     UCDgc_Cc,
176 	    /**
177 	     * Format: a format control character
178 	     */
179     UCDgc_Cf,
180 	    /**
181 	     * Surrogate: a surrogate code point
182 	     */
183     UCDgc_Cs,
184 	    /**
185 	     * Private_Use: a private-use character
186 	     */
187     UCDgc_Co,
188 	    /**
189 	     * Unassigned: a reserved unassigned code point or a noncharacter
190 	     */
191     UCDgc_Cn
192     } UCDGeneralCategory;
193 
194 /************************************************************************/
195 /*									*/
196 /*  Routine declarations						*/
197 /*									*/
198 /************************************************************************/
199 
/**
 * Determine the general category of a symbol.
 *
 * The probing macros in this file compare the return value, or the
 * return value masked with UCDgc_X_MASK/UCDgc_XX_MASK, to the values of
 * UCDGeneralCategory.
 *
 * NOTE(review): sym is presumably a Unicode code point. What is returned
 * for values that are not a valid code point is not visible from this
 * header -- confirm in the implementation.
 */
extern int ucdGeneralCategory( int sym );

/**
 * NOTE(review): Judging by its name and return type, this presumably
 * returns a string that names the general category of sym. Ownership of
 * the string and the result for invalid values of sym cannot be told
 * from this header -- confirm in the implementation.
 */
extern const char * ucdGeneralCategoryStr( int sym );
202 
203 /************************************************************************/
204 /*									*/
205 /*  Probing macros							*/
206 /*									*/
207 /************************************************************************/
208 
/**
 *  Helpers to implement the probing macros. Do not use them directly.
 *
 *  UCDgc_PROBE_GROUP:	The bits of the general category of c that are
 *			selected by mask are equal to group.
 *  UCDgc_PROBE_CAT:	The general category of c is exactly cat.
 *
 *  Both call ucdGeneralCategory() once, so the argument of a probing
 *  macro is evaluated once.
 */
# define UCDgc_PROBE_GROUP(c,mask,group) \
		( ( ucdGeneralCategory( (c) ) & (mask) ) == (group) )
# define UCDgc_PROBE_CAT(c,cat) \
		( ucdGeneralCategory( (c) ) == (cat) )

/*  Letters: ucdIsL() holds for all letters, ucdIsLC() only for the	*/
/*  cased ones: Lu, Ll and Lt.						*/
# define ucdIsL(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_L_MASK )
# define ucdIsLm(c)	UCDgc_PROBE_CAT( (c), UCDgc_Lm )
# define ucdIsLo(c)	UCDgc_PROBE_CAT( (c), UCDgc_Lo )
# define ucdIsLC(c)	UCDgc_PROBE_GROUP( (c), UCDgc_XX_MASK, UCDgc_LC_MASK )
# define ucdIsLu(c)	UCDgc_PROBE_CAT( (c), UCDgc_Lu )
# define ucdIsLl(c)	UCDgc_PROBE_CAT( (c), UCDgc_Ll )
# define ucdIsLt(c)	UCDgc_PROBE_CAT( (c), UCDgc_Lt )

/*  Marks								*/
# define ucdIsM(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_M_MASK )
# define ucdIsMn(c)	UCDgc_PROBE_CAT( (c), UCDgc_Mn )
# define ucdIsMc(c)	UCDgc_PROBE_CAT( (c), UCDgc_Mc )
# define ucdIsMe(c)	UCDgc_PROBE_CAT( (c), UCDgc_Me )

/*  Numbers								*/
# define ucdIsN(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_N_MASK )
# define ucdIsNd(c)	UCDgc_PROBE_CAT( (c), UCDgc_Nd )
# define ucdIsNl(c)	UCDgc_PROBE_CAT( (c), UCDgc_Nl )
# define ucdIsNo(c)	UCDgc_PROBE_CAT( (c), UCDgc_No )

/*  Punctuation								*/
# define ucdIsP(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_P_MASK )
# define ucdIsPc(c)	UCDgc_PROBE_CAT( (c), UCDgc_Pc )
# define ucdIsPd(c)	UCDgc_PROBE_CAT( (c), UCDgc_Pd )
# define ucdIsPs(c)	UCDgc_PROBE_CAT( (c), UCDgc_Ps )
# define ucdIsPe(c)	UCDgc_PROBE_CAT( (c), UCDgc_Pe )
# define ucdIsPi(c)	UCDgc_PROBE_CAT( (c), UCDgc_Pi )
# define ucdIsPf(c)	UCDgc_PROBE_CAT( (c), UCDgc_Pf )
# define ucdIsPo(c)	UCDgc_PROBE_CAT( (c), UCDgc_Po )

/*  Symbols								*/
# define ucdIsS(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_S_MASK )
# define ucdIsSm(c)	UCDgc_PROBE_CAT( (c), UCDgc_Sm )
# define ucdIsSc(c)	UCDgc_PROBE_CAT( (c), UCDgc_Sc )
# define ucdIsSk(c)	UCDgc_PROBE_CAT( (c), UCDgc_Sk )
# define ucdIsSo(c)	UCDgc_PROBE_CAT( (c), UCDgc_So )

/*  Separators								*/
# define ucdIsZ(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_Z_MASK )
# define ucdIsZs(c)	UCDgc_PROBE_CAT( (c), UCDgc_Zs )
# define ucdIsZl(c)	UCDgc_PROBE_CAT( (c), UCDgc_Zl )
# define ucdIsZp(c)	UCDgc_PROBE_CAT( (c), UCDgc_Zp )

/*  Other categories							*/
# define ucdIsC(c)	UCDgc_PROBE_GROUP( (c), UCDgc_X_MASK, UCDgc_C_MASK )
# define ucdIsCc(c)	UCDgc_PROBE_CAT( (c), UCDgc_Cc )
# define ucdIsCf(c)	UCDgc_PROBE_CAT( (c), UCDgc_Cf )
# define ucdIsCs(c)	UCDgc_PROBE_CAT( (c), UCDgc_Cs )
# define ucdIsCo(c)	UCDgc_PROBE_CAT( (c), UCDgc_Co )
# define ucdIsCn(c)	UCDgc_PROBE_CAT( (c), UCDgc_Cn )
253 
254