1 #ifndef _melder_kar_h_ 2 #define _melder_kar_h_ 3 /* melder_kar.h 4 * 5 * Copyright (C) 1992-2020 Paul Boersma 6 * 7 * This code is free software; you can redistribute it and/or modify 8 * it under the terms of the GNU General Public License as published by 9 * the Free Software Foundation; either version 2 of the License, or (at 10 * your option) any later version. 11 * 12 * This code is distributed in the hope that it will be useful, but 13 * WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 15 * See the GNU General Public License for more details. 16 * 17 * You should have received a copy of the GNU General Public License 18 * along with this work. If not, see <http://www.gnu.org/licenses/>. print(raw_ostream & OS) const19 */ 20 21 #define kUCD_TOP_OF_ASCII 127 22 #define kUCD_TOP_OF_LIST 0x2FA1D 23 #define kUCD_UNASSIGNED 0 24 25 enum { 26 mUCD_UPPERCASE_LETTER = (1 << 0), 27 mUCD_LOWERCASE_LETTER = (1 << 1), 28 mUCD_TITLECASE_LETTER = (1 << 2), 29 mUCD_CASED_LETTER = (mUCD_UPPERCASE_LETTER | mUCD_LOWERCASE_LETTER | mUCD_TITLECASE_LETTER), 30 mUCD_MODIFIER_LETTER = (1 << 3), 31 mUCD_OTHER_LETTER = (1 << 4), 32 mUCD_LETTER = (mUCD_CASED_LETTER | mUCD_MODIFIER_LETTER | mUCD_OTHER_LETTER), 33 34 mUCD_NONSPACING_MARK = (1 << 5), 35 mUCD_SPACING_MARK = (1 << 6), 36 mUCD_ENCLOSING_MARK = (1 << 7), 37 mUCD_MARK = (mUCD_NONSPACING_MARK | mUCD_SPACING_MARK | mUCD_ENCLOSING_MARK), 38 39 mUCD_DECIMAL_NUMBER = (1 << 8), 40 mUCD_LETTER_NUMBER = (1 << 9), 41 mUCD_OTHER_NUMBER = (1 << 10), 42 mUCD_NUMBER = (mUCD_DECIMAL_NUMBER | mUCD_LETTER_NUMBER | mUCD_OTHER_NUMBER), 43 44 mUCD_CONNECTOR_PUNCTUATION = (1 << 11), 45 mUCD_DASH_PUNCTUATION = (1 << 12), 46 mUCD_OPEN_PUNCTUATION = (1 << 13), 47 mUCD_CLOSE_PUNCTUATION = (1 << 14), 48 mUCD_INITIAL_PUNCTUATION = (1 << 15), 49 mUCD_FINAL_PUNCTUATION = (1 << 16), 50 mUCD_OTHER_PUNCTUATION = (1 << 17), 51 mUCD_PUNCTUATION = (mUCD_CONNECTOR_PUNCTUATION | mUCD_DASH_PUNCTUATION | mUCD_OPEN_PUNCTUATION | mUCD_CLOSE_PUNCTUATION | mUCD_INITIAL_PUNCTUATION | mUCD_FINAL_PUNCTUATION | mUCD_OTHER_PUNCTUATION), 52 53 mUCD_MATH_SYMBOL = (1 << 18), 54 mUCD_CURRENCY_SYMBOL = (1 << 19), 55 mUCD_MODIFIER_SYMBOL = (1 << 20), 56 mUCD_OTHER_SYMBOL = (1 << 21), 57 mUCD_SYMBOL = (mUCD_MATH_SYMBOL | mUCD_CURRENCY_SYMBOL | mUCD_MODIFIER_SYMBOL | mUCD_OTHER_SYMBOL), 58 59 mUCD_BREAKING_SPACE = (1 << 22), 60 mUCD_NON_BREAKING_SPACE = (1 << 23), // note: this keeps *lines* together; it still separates *words*, despite interpretations elsewhere 61 mUCD_SPACE_SEPARATOR = (mUCD_BREAKING_SPACE | mUCD_NON_BREAKING_SPACE), 62 mUCD_LINE_SEPARATOR = (1 << 24), 63 mUCD_PARAGRAPH_SEPARATOR = (1 << 25), 64 mUCD_NEWLINE = (mUCD_LINE_SEPARATOR | mUCD_PARAGRAPH_SEPARATOR), 65 mUCD_SEPARATOR = (mUCD_SPACE_SEPARATOR | mUCD_NEWLINE), 66 67 mUCD_CONTROL = (1 << 26), 68 mUCD_FORMAT = (1 << 27), 69 mUCD_PRIVATE_USE = (1 << 28), 70 71 mUCD_WORD_CHARACTER = (1 << 29), 72 mUCD_NULL = (1 << 30), 73 74 mUCD_ALPHANUMERIC = (mUCD_LETTER | mUCD_NUMBER), 75 mUCD_END_OF_INK = (mUCD_SEPARATOR | mUCD_NULL), 76 mUCD_END_OF_LINE = (mUCD_NEWLINE | mUCD_NULL), 77 }; 78 79 struct UCD_CodePointInfo { 80 uint32 features; 81 char32 upperCase, lowerCase, titleCase; 82 conststring32 decomposed; 83 char first, second; 84 }; 85 extern UCD_CodePointInfo theUnicodeDatabase [1+kUCD_TOP_OF_LIST]; 86 87 /* 88 Praat is an internationalized program, which means it has to work in the same way 89 wherever on earth it is used. This means that Praat has to be blind to localized settings, 90 such as what counts as a space and what combinations of characters 91 count as pairs of lower case and upper case. 92 93 To be able to use Praat all over the world, we therefore define one single 94 "international locale", which is simply based on the Unicode features of each code point. 95 */ 96 97 /* 98 Internationalize std::isblank (): 99 */ 100 inline bool Melder_isHorizontalSpace (char32 kar) { 101 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SPACE_SEPARATOR) != 0; 102 } 103 inline void Melder_skipHorizontalSpace (char32 **p_text) { 104 while (Melder_isHorizontalSpace (**p_text)) (*p_text) ++; 105 } 106 inline char32 * Melder_findEndOfHorizontalSpace (char32 *p) { 107 while (Melder_isHorizontalSpace (*p)) p ++; 108 return p; 109 } 110 inline const char32 * Melder_findEndOfHorizontalSpace (const char32 *p) { 111 while (Melder_isHorizontalSpace (*p)) p ++; 112 return p; 113 } 114 115 inline bool Melder_isAsciiHorizontalSpace (char32 kar) { 116 return kar == U'\t' || kar == U' '; 117 } 118 119 inline bool Melder_isVerticalSpace (char32 kar) { 120 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_NEWLINE) != 0; 121 } 122 inline bool Melder_isAsciiVerticalSpace (char32 kar) { 123 return kar >= 10 && kar <= 13; // \n, \v, \f, \r 124 } 125 126 /* 127 Internationalize std::isspace (): 128 */ 129 inline bool Melder_isHorizontalOrVerticalSpace (char32 kar) { 130 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0; 131 } 132 inline bool Melder_isAsciiHorizontalOrVerticalSpace (char32 kar) { 133 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_SEPARATOR) != 0; 134 } 135 inline void Melder_skipHorizontalOrVerticalSpace (char32 **p_text) { 136 while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++; 137 } 138 inline void Melder_skipHorizontalOrVerticalSpace (const char32 **p_text) { 139 while (Melder_isHorizontalOrVerticalSpace (**p_text)) (*p_text) ++; 140 } 141 142 inline bool Melder_isEndOfInk (char32 kar) { 143 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) != 0; 144 } 145 inline bool Melder_isEndOfLine (char32 kar) { 146 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) != 0; 147 } 148 inline bool Melder_isEndOfText (char32 kar) { 149 return kar == U'\0'; 150 } 151 inline bool Melder_staysWithinInk (char32 kar) { 152 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_INK) == 0; 153 } 154 inline bool Melder_staysWithinLine (char32 kar) { 155 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_END_OF_LINE) == 0; 156 } 157 inline void Melder_skipToEndOfLine (char32 **p_text) { 158 while (Melder_staysWithinLine (**p_text)) (*p_text) ++; 159 } 160 inline char32 * Melder_findEndOfInk (char32 *p) { 161 while (Melder_staysWithinInk (*p)) p ++; 162 return p; 163 } 164 inline const char32 * Melder_findEndOfInk (const char32 *p) { 165 while (Melder_staysWithinInk (*p)) p ++; 166 return p; 167 } 168 inline char32 * Melder_findEndOfLine (char32 *p) { 169 while (Melder_staysWithinLine (*p)) p ++; 170 return p; 171 } 172 inline const char32 * Melder_findEndOfLine (const char32 *p) { 173 while (Melder_staysWithinLine (*p)) p ++; 174 return p; 175 } 176 177 /* 178 Internationalize std::isalpha (): 179 */ 180 inline bool Melder_isLetter (char32 kar) { 181 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0; 182 } 183 inline bool Melder_isAsciiLetter (char32 kar) { 184 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_LETTER) != 0; 185 } 186 187 /* 188 Internationalize std::isupper (): 189 */ 190 inline bool Melder_isUpperCaseLetter (char32 kar) { 191 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_UPPERCASE_LETTER) != 0; 192 } 193 inline bool Melder_isAsciiUpperCaseLetter (char32 kar) { 194 return kar >= U'A' && kar <= U'Z'; 195 } 196 197 /* 198 Internationalize std::islower (): 199 */ 200 inline bool Melder_isLowerCaseLetter (char32 kar) { 201 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_LOWERCASE_LETTER) != 0; 202 } 203 inline bool Melder_isAsciiLowerCaseLetter (char32 kar) { 204 return kar >= U'a' && kar <= U'z'; 205 } 206 207 inline bool Melder_isTitleCaseLetter (char32 kar) { 208 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_TITLECASE_LETTER) != 0; 209 } 210 inline bool Melder_isAsciiTitleCaseLetter (char32 kar) { 211 return kar >= U'A' && kar <= U'Z'; 212 } 213 214 /* 215 Internationalize std::isdigit (): 216 */ 217 inline bool Melder_isDecimalNumber (char32 kar) { 218 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_DECIMAL_NUMBER) != 0; 219 } 220 inline bool Melder_isAsciiDecimalNumber (char32 kar) { 221 return kar >= U'0' && kar <= U'9'; 222 } 223 224 /* 225 We cannot really internationalize std::isxdigit (): 226 */ 227 inline bool Melder_isHexadecimalDigit (char32 kar) { 228 return kar >= U'0' && kar <= U'9' || kar >= U'A' && kar <= U'Z' || kar >= U'a' && kar <= U'z'; 229 } 230 231 /* 232 Internationalize std::isalnum (): 233 */ 234 inline bool Melder_isAlphanumeric (char32 kar) { 235 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0; 236 } 237 inline bool Melder_isAsciiAlphanumeric (char32 kar) { 238 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_ALPHANUMERIC) != 0; 239 } 240 241 inline bool Melder_isWordCharacter (char32 kar) { 242 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0; 243 } 244 inline bool Melder_isAsciiWordCharacter (char32 kar) { 245 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_WORD_CHARACTER) != 0; 246 } 247 248 /* 249 The standard library further contains std::ispunct (), std::iscntrl (), std::isprint (), std::isgraph (). 250 These have very little use nowadays, so only for completeness do we include versions of them here, 251 which are correct at least for ASCII arguments. 252 Of these four functions, Melder_hasInk () is not yet correct for all Unicode points, 253 as approximately one half of the mUCD_FORMAT points are inkless as well. 254 */ 255 inline bool Melder_isPunctuationOrSymbol (char32 kar) { 256 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0; 257 } 258 inline bool Melder_isAsciiPunctuationOrSymbol (char32 kar) { // same as std::ispunct() with default C locale 259 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_PUNCTUATION | mUCD_SYMBOL)) != 0; 260 } 261 inline bool Melder_isControl (char32 kar) { 262 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0; 263 } 264 inline bool Melder_isAsciiControl (char32 kar) { // same as std::iscntrl() with default C locale 265 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) != 0; 266 } 267 inline bool Melder_isPrintable (char32 kar) { 268 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0; 269 } 270 inline bool Melder_isAsciiPrintable (char32 kar) { // same as std::isprint() with default C locale 271 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & mUCD_CONTROL) == 0; 272 } 273 inline bool Melder_hasInk (char32 kar) { 274 return kar <= kUCD_TOP_OF_LIST && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0; 275 } 276 inline bool Melder_hasAsciiInk (char32 kar) { // same as std::isgraph() with default C locale 277 return kar <= kUCD_TOP_OF_ASCII && (theUnicodeDatabase [kar]. features & (mUCD_CONTROL | mUCD_SEPARATOR)) == 0; 278 } 279 280 /* 281 Internationalize std::toupper () and std::tolower (): 282 */ 283 inline char32 Melder_toUpperCase (char32 kar) { 284 return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. upperCase : kar; 285 } 286 inline char32 Melder_toLowerCase (char32 kar) { 287 return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. lowerCase : kar; 288 } 289 inline char32 Melder_toTitleCase (char32 kar) { 290 return kar <= kUCD_TOP_OF_LIST ? theUnicodeDatabase [kar]. titleCase : kar; 291 } 292 293 /* 294 Search functions instantiating strspn() but much faster (CHECK). 295 */ 296 inline const char32 * Melder_findInk (conststring32 str) noexcept { 297 if (! str) 298 return nullptr; 299 const char32 *p = & str [0]; 300 for (; ! Melder_hasInk (*p); p ++) { 301 if (*p == U'\0') 302 return nullptr; // not found 303 } 304 return p; 305 } 306 inline const char32 * Melder_findHorizontalOrVerticalSpace (conststring32 str) noexcept { 307 if (! str) 308 return nullptr; 309 const char32 *p = & str [0]; 310 for (; ! Melder_isHorizontalOrVerticalSpace (*p); p ++) 311 if (*p == U'\0') 312 return nullptr; // not found 313 return p; 314 } 315 316 /* End of file melder_kar.h */ 317 #endif 318