1 #ifndef _melder_kar_h_
2 #define _melder_kar_h_
3 /* melder_kar.h
4  *
5  * Copyright (C) 1992-2020 Paul Boersma
6  *
7  * This code is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or (at
10  * your option) any later version.
11  *
12  * This code is distributed in the hope that it will be useful, but
13  * WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15  * See the GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this work. If not, see <http://www.gnu.org/licenses/>.
19  */
20 
/*
	Limits and sentinel values used when consulting the Unicode database.
*/
#define kUCD_TOP_OF_ASCII   127       /* highest code point accepted by the "Ascii" predicates */
#define kUCD_TOP_OF_LIST    0x2FA1D   /* highest code point that has an entry in theUnicodeDatabase */
#define kUCD_UNASSIGNED     0
24 
/*
	Feature flags of a code point. Every elementary feature occupies one bit (bits 0 through 30);
	the remaining enumerators are unions of elementary features, meant to be used as masks
	against the `features` member of a database entry.
*/
enum {
	/*
		Letters (bits 0..4).
	*/
	mUCD_UPPERCASE_LETTER = 0x00000001,   // bit 0
	mUCD_LOWERCASE_LETTER = 0x00000002,   // bit 1
	mUCD_TITLECASE_LETTER = 0x00000004,   // bit 2
	mUCD_CASED_LETTER =
		mUCD_UPPERCASE_LETTER | mUCD_LOWERCASE_LETTER | mUCD_TITLECASE_LETTER,
	mUCD_MODIFIER_LETTER = 0x00000008,   // bit 3
	mUCD_OTHER_LETTER = 0x00000010,   // bit 4
	mUCD_LETTER =
		mUCD_CASED_LETTER | mUCD_MODIFIER_LETTER | mUCD_OTHER_LETTER,

	/*
		Marks (bits 5..7).
	*/
	mUCD_NONSPACING_MARK = 0x00000020,   // bit 5
	mUCD_SPACING_MARK = 0x00000040,   // bit 6
	mUCD_ENCLOSING_MARK = 0x00000080,   // bit 7
	mUCD_MARK =
		mUCD_NONSPACING_MARK | mUCD_SPACING_MARK | mUCD_ENCLOSING_MARK,

	/*
		Numbers (bits 8..10).
	*/
	mUCD_DECIMAL_NUMBER = 0x00000100,   // bit 8
	mUCD_LETTER_NUMBER = 0x00000200,   // bit 9
	mUCD_OTHER_NUMBER = 0x00000400,   // bit 10
	mUCD_NUMBER =
		mUCD_DECIMAL_NUMBER | mUCD_LETTER_NUMBER | mUCD_OTHER_NUMBER,

	/*
		Punctuation (bits 11..17).
	*/
	mUCD_CONNECTOR_PUNCTUATION = 0x00000800,   // bit 11
	mUCD_DASH_PUNCTUATION = 0x00001000,   // bit 12
	mUCD_OPEN_PUNCTUATION = 0x00002000,   // bit 13
	mUCD_CLOSE_PUNCTUATION = 0x00004000,   // bit 14
	mUCD_INITIAL_PUNCTUATION = 0x00008000,   // bit 15
	mUCD_FINAL_PUNCTUATION = 0x00010000,   // bit 16
	mUCD_OTHER_PUNCTUATION = 0x00020000,   // bit 17
	mUCD_PUNCTUATION =
		mUCD_CONNECTOR_PUNCTUATION | mUCD_DASH_PUNCTUATION |
		mUCD_OPEN_PUNCTUATION | mUCD_CLOSE_PUNCTUATION |
		mUCD_INITIAL_PUNCTUATION | mUCD_FINAL_PUNCTUATION |
		mUCD_OTHER_PUNCTUATION,

	/*
		Symbols (bits 18..21).
	*/
	mUCD_MATH_SYMBOL = 0x00040000,   // bit 18
	mUCD_CURRENCY_SYMBOL = 0x00080000,   // bit 19
	mUCD_MODIFIER_SYMBOL = 0x00100000,   // bit 20
	mUCD_OTHER_SYMBOL = 0x00200000,   // bit 21
	mUCD_SYMBOL =
		mUCD_MATH_SYMBOL | mUCD_CURRENCY_SYMBOL | mUCD_MODIFIER_SYMBOL | mUCD_OTHER_SYMBOL,

	/*
		Separators (bits 22..25).
	*/
	mUCD_BREAKING_SPACE = 0x00400000,   // bit 22
	mUCD_NON_BREAKING_SPACE = 0x00800000,   // bit 23; note: this keeps *lines* together; it still separates *words*, despite interpretations elsewhere
	mUCD_SPACE_SEPARATOR =
		mUCD_BREAKING_SPACE | mUCD_NON_BREAKING_SPACE,
	mUCD_LINE_SEPARATOR = 0x01000000,   // bit 24
	mUCD_PARAGRAPH_SEPARATOR = 0x02000000,   // bit 25
	mUCD_NEWLINE =
		mUCD_LINE_SEPARATOR | mUCD_PARAGRAPH_SEPARATOR,
	mUCD_SEPARATOR =
		mUCD_SPACE_SEPARATOR | mUCD_NEWLINE,

	/*
		Other elementary features (bits 26..30).
	*/
	mUCD_CONTROL = 0x04000000,   // bit 26
	mUCD_FORMAT = 0x08000000,   // bit 27
	mUCD_PRIVATE_USE = 0x10000000,   // bit 28

	mUCD_WORD_CHARACTER = 0x20000000,   // bit 29
	mUCD_NULL = 0x40000000,   // bit 30

	/*
		Masks that combine several of the groups above.
	*/
	mUCD_ALPHANUMERIC = mUCD_LETTER | mUCD_NUMBER,
	mUCD_END_OF_INK = mUCD_SEPARATOR | mUCD_NULL,
	mUCD_END_OF_LINE = mUCD_NEWLINE | mUCD_NULL,
};
78 
79 struct UCD_CodePointInfo {
80 	uint32 features;
81 	char32 upperCase, lowerCase, titleCase;
82 	conststring32 decomposed;
83 	char first, second;
84 };
85 extern UCD_CodePointInfo theUnicodeDatabase [1+kUCD_TOP_OF_LIST];
86 
87 /*
88 	Praat is an internationalized program, which means it has to work in the same way
89 	wherever on earth it is used. This means that Praat has to be blind to localized settings,
90 	such as what counts as a space and what combinations of characters
91 	count as pairs of lower case and upper case.
92 
93 	To be able to use Praat all over the world, we therefore define one single
94 	"international locale", which is simply based on the Unicode features of each code point.
95 */
96 
97 /*
98 	Internationalize std::isblank ():
99 */
100 inline bool Melder_isHorizontalSpace (char32 kar) {
101 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SPACE_SEPARATOR) != 0;
102 }
103 inline void Melder_skipHorizontalSpace (char32 **p_text) {
104 	while (Melder_isHorizontalSpace (**p_text)) (*p_text) ++;
105 }
106 inline char32 * Melder_findEndOfHorizontalSpace (char32 *p) {
107 	while (Melder_isHorizontalSpace (*p)) p ++;
108 	return p;
109 }
110 inline const char32 * Melder_findEndOfHorizontalSpace (const char32 *p) {
111 	while (Melder_isHorizontalSpace (*p)) p ++;
112 	return p;
113 }
114 
115 inline bool Melder_isAsciiHorizontalSpace (char32 kar) {
116 	return kar == U'\t' || kar == U' ';
117 }
118 
119 inline bool Melder_isVerticalSpace (char32 kar) {
120 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_NEWLINE) != 0;
121 }
122 inline bool Melder_isAsciiVerticalSpace (char32 kar) {
123 	return kar >= 10 && kar <= 13;   // \n, \v, \f, \r
124 }
125 
126 /*
127 	Internationalize std::isspace ():
128 */
129 inline bool Melder_isHorizontalOrVerticalSpace (char32 kar) {
130 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0;
131 }
132 inline bool Melder_isAsciiHorizontalOrVerticalSpace (char32 kar) {
133 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0;
134 }
135 inline void Melder_skipHorizontalOrVerticalSpace (char32 **p_text) {
136 	while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++;
137 }
138 inline void Melder_skipHorizontalOrVerticalSpace (const char32 **p_text) {
139 	while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++;
140 }
141 
142 inline bool Melder_isEndOfInk (char32 kar) {
143 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) != 0;
144 }
145 inline bool Melder_isEndOfLine (char32 kar) {
146 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) != 0;
147 }
148 inline bool Melder_isEndOfText (char32 kar) {
149 	return kar == U'\0';
150 }
151 inline bool Melder_staysWithinInk (char32 kar) {
152 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) == 0;
153 }
154 inline bool Melder_staysWithinLine (char32 kar) {
155 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) == 0;
156 }
157 inline void Melder_skipToEndOfLine (char32 **p_text) {
158 	while (Melder_staysWithinLine (**p_text)) (*p_text) ++;
159 }
160 inline char32 * Melder_findEndOfInk (char32 *p) {
161 	while (Melder_staysWithinInk (*p)) p ++;
162 	return p;
163 }
164 inline const char32 * Melder_findEndOfInk (const char32 *p) {
165 	while (Melder_staysWithinInk (*p)) p ++;
166 	return p;
167 }
168 inline char32 * Melder_findEndOfLine (char32 *p) {
169 	while (Melder_staysWithinLine (*p)) p ++;
170 	return p;
171 }
172 inline const char32 * Melder_findEndOfLine (const char32 *p) {
173 	while (Melder_staysWithinLine (*p)) p ++;
174 	return p;
175 }
176 
177 /*
178 	Internationalize std::isalpha ():
179 */
180 inline bool Melder_isLetter (char32 kar) {
181 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0;
182 }
183 inline bool Melder_isAsciiLetter (char32 kar) {
184 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0;
185 }
186 
187 /*
188 	Internationalize std::isupper ():
189 */
190 inline bool Melder_isUpperCaseLetter (char32 kar) {
191 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_UPPERCASE_LETTER) != 0;
192 }
193 inline bool Melder_isAsciiUpperCaseLetter (char32 kar) {
194 	return kar >= U'A' && kar <= U'Z';
195 }
196 
197 /*
198 	Internationalize std::islower ():
199 */
200 inline bool Melder_isLowerCaseLetter (char32 kar) {
201 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LOWERCASE_LETTER) != 0;
202 }
203 inline bool Melder_isAsciiLowerCaseLetter (char32 kar) {
204 	return kar >= U'a' && kar <= U'z';
205 }
206 
207 inline bool Melder_isTitleCaseLetter (char32 kar) {
208 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_TITLECASE_LETTER) != 0;
209 }
210 inline bool Melder_isAsciiTitleCaseLetter (char32 kar) {
211 	return kar >= U'A' && kar <= U'Z';
212 }
213 
214 /*
215 	Internationalize std::isdigit ():
216 */
217 inline bool Melder_isDecimalNumber (char32 kar) {
218 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_DECIMAL_NUMBER) != 0;
219 }
220 inline bool Melder_isAsciiDecimalNumber (char32 kar) {
221 	return kar >= U'0' && kar <= U'9';
222 }
223 
224 /*
225 	We cannot really internationalize std::isxdigit ():
226 */
227 inline bool Melder_isHexadecimalDigit (char32 kar) {
228 	return kar >= U'0' && kar <= U'9' || kar >= U'A' && kar <= U'Z' || kar >= U'a' && kar <= U'z';
229 }
230 
231 /*
232 	Internationalize std::isalnum ():
233 */
234 inline bool Melder_isAlphanumeric (char32 kar) {
235 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0;
236 }
237 inline bool Melder_isAsciiAlphanumeric (char32 kar) {
238 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0;
239 }
240 
241 inline bool Melder_isWordCharacter (char32 kar) {
242 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0;
243 }
244 inline bool Melder_isAsciiWordCharacter (char32 kar) {
245 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0;
246 }
247 
248 /*
249 	The standard library further contains std::ispunct (), std::iscntrl (), std::isprint (), std::isgraph ().
250 	These have very little use nowadays, so only for completeness do we include versions of them here,
251 	which are correct at least for ASCII arguments.
252 	Of these four functions, Melder_hasInk () is not yet correct for all Unicode points,
253 	as approximately one half of the mUCD_FORMAT points are inkless as well.
254 */
255 inline bool Melder_isPunctuationOrSymbol (char32 kar) {
256 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0;
257 }
258 inline bool Melder_isAsciiPunctuationOrSymbol (char32 kar) {   // same as std::ispunct() with default C locale
259 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0;
260 }
261 inline bool Melder_isControl (char32 kar) {
262 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0;
263 }
264 inline bool Melder_isAsciiControl (char32 kar) {   // same as std::iscntrl() with default C locale
265 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0;
266 }
267 inline bool Melder_isPrintable (char32 kar) {
268 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0;
269 }
270 inline bool Melder_isAsciiPrintable (char32 kar) {   // same as std::isprint() with default C locale
271 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0;
272 }
273 inline bool Melder_hasInk (char32 kar) {
274 	return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0;
275 }
276 inline bool Melder_hasAsciiInk (char32 kar) {   // same as std::isgraph() with default C locale
277 	return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0;
278 }
279 
280 /*
281 	Internationalize std::toupper () and std::tolower ():
282 */
283 inline char32 Melder_toUpperCase (char32 kar) {
284 	return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. upperCase : kar;
285 }
286 inline char32 Melder_toLowerCase (char32 kar) {
287 	return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. lowerCase : kar;
288 }
289 inline char32 Melder_toTitleCase (char32 kar) {
290 	return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. titleCase : kar;
291 }
292 
293 /*
294 	Search functions instantiating strspn() but much faster (CHECK).
295 */
296 inline const char32 * Melder_findInk (conststring32 str) noexcept {
297 	if (! str)
298 		return nullptr;
299 	const char32 *p = & str [0];
300 	for (; ! Melder_hasInk (*p); p ++) {
301 		if (*p == U'\0')
302 			return nullptr;   // not found
303 	}
304 	return p;
305 }
306 inline const char32 * Melder_findHorizontalOrVerticalSpace (conststring32 str) noexcept {
307 	if (! str)
308 		return nullptr;
309 	const char32 *p = & str [0];
310 	for (; ! Melder_isHorizontalOrVerticalSpace (*p); p ++)
311 		if (*p == U'\0')
312 			return nullptr;   // not found
313 	return p;
314 }
315 
316 /* End of file melder_kar.h */
317 #endif
318