1 /*============================================================================= 2 Copyright (c) 2001-2011 Joel de Guzman 3 4 Distributed under the Boost Software License, Version 1.0. (See accompanying 5 file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) 6 7 Autogenerated by MultiStageTable.py (Unicode multi-stage 8 table builder) (c) Peter Kankowski, 2008 9 ==============================================================================*/ 10 #if !defined(BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010) 11 #define BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010 12 13 #include <boost/cstdint.hpp> 14 15 # include "category_table.hpp" 16 # include "script_table.hpp" 17 # include "lowercase_table.hpp" 18 # include "uppercase_table.hpp" 19 20 namespace boost { namespace spirit { namespace ucd 21 { 22 // This header provides Basic (Level 1) Unicode Support 23 // See http://unicode.org/reports/tr18/ for details 24 25 struct properties 26 { 27 // bit pattern: xxMMMCCC 28 // MMM: major_category 29 // CCC: category 30 31 enum major_category 32 { 33 letter, 34 mark, 35 number, 36 separator, 37 other, 38 punctuation, 39 symbol 40 }; 41 42 enum category 43 { 44 uppercase_letter = 0, // [Lu] an uppercase letter 45 lowercase_letter, // [Ll] a lowercase letter 46 titlecase_letter, // [Lt] a digraphic character, with first part uppercase 47 modifier_letter, // [Lm] a modifier letter 48 other_letter, // [Lo] other letters, including syllables and ideographs 49 50 nonspacing_mark = 8, // [Mn] a nonspacing combining mark (zero advance width) 51 enclosing_mark, // [Me] an enclosing combining mark 52 spacing_mark, // [Mc] a spacing combining mark (positive advance width) 53 54 decimal_number = 16, // [Nd] a decimal digit 55 letter_number, // [Nl] a letterlike numeric character 56 other_number, // [No] a numeric character of other type 57 58 space_separator = 24, // [Zs] a space character (of various non-zero widths) 59 line_separator, // [Zl] U+2028 LINE SEPARATOR only 60 
paragraph_separator, // [Zp] U+2029 PARAGRAPH SEPARATOR only 61 62 control = 32, // [Cc] a C0 or C1 control code 63 format, // [Cf] a format control character 64 private_use, // [Co] a private-use character 65 surrogate, // [Cs] a surrogate code point 66 unassigned, // [Cn] a reserved unassigned code point or a noncharacter 67 68 dash_punctuation = 40, // [Pd] a dash or hyphen punctuation mark 69 open_punctuation, // [Ps] an opening punctuation mark (of a pair) 70 close_punctuation, // [Pe] a closing punctuation mark (of a pair) 71 connector_punctuation, // [Pc] a connecting punctuation mark, like a tie 72 other_punctuation, // [Po] a punctuation mark of other type 73 initial_punctuation, // [Pi] an initial quotation mark 74 final_punctuation, // [Pf] a final quotation mark 75 76 math_symbol = 48, // [Sm] a symbol of primarily mathematical use 77 currency_symbol, // [Sc] a currency sign 78 modifier_symbol, // [Sk] a non-letterlike modifier symbol 79 other_symbol // [So] a symbol of other type 80 }; 81 82 enum derived_properties 83 { 84 alphabetic = 64, 85 uppercase = 128, 86 lowercase = 256, 87 white_space = 512, 88 hex_digit = 1024, 89 noncharacter_code_point = 2048, 90 default_ignorable_code_point = 4096 91 }; 92 93 enum script 94 { 95 arabic = 0, 96 imperial_aramaic = 1, 97 armenian = 2, 98 avestan = 3, 99 balinese = 4, 100 bamum = 5, 101 bengali = 6, 102 bopomofo = 7, 103 braille = 8, 104 buginese = 9, 105 buhid = 10, 106 canadian_aboriginal = 11, 107 carian = 12, 108 cham = 13, 109 cherokee = 14, 110 coptic = 15, 111 cypriot = 16, 112 cyrillic = 17, 113 devanagari = 18, 114 deseret = 19, 115 egyptian_hieroglyphs = 20, 116 ethiopic = 21, 117 georgian = 22, 118 glagolitic = 23, 119 gothic = 24, 120 greek = 25, 121 gujarati = 26, 122 gurmukhi = 27, 123 hangul = 28, 124 han = 29, 125 hanunoo = 30, 126 hebrew = 31, 127 hiragana = 32, 128 katakana_or_hiragana = 33, 129 old_italic = 34, 130 javanese = 35, 131 kayah_li = 36, 132 katakana = 37, 133 kharoshthi = 38, 134 
khmer = 39, 135 kannada = 40, 136 kaithi = 41, 137 tai_tham = 42, 138 lao = 43, 139 latin = 44, 140 lepcha = 45, 141 limbu = 46, 142 linear_b = 47, 143 lisu = 48, 144 lycian = 49, 145 lydian = 50, 146 malayalam = 51, 147 mongolian = 52, 148 meetei_mayek = 53, 149 myanmar = 54, 150 nko = 55, 151 ogham = 56, 152 ol_chiki = 57, 153 old_turkic = 58, 154 oriya = 59, 155 osmanya = 60, 156 phags_pa = 61, 157 inscriptional_pahlavi = 62, 158 phoenician = 63, 159 inscriptional_parthian = 64, 160 rejang = 65, 161 runic = 66, 162 samaritan = 67, 163 old_south_arabian = 68, 164 saurashtra = 69, 165 shavian = 70, 166 sinhala = 71, 167 sundanese = 72, 168 syloti_nagri = 73, 169 syriac = 74, 170 tagbanwa = 75, 171 tai_le = 76, 172 new_tai_lue = 77, 173 tamil = 78, 174 tai_viet = 79, 175 telugu = 80, 176 tifinagh = 81, 177 tagalog = 82, 178 thaana = 83, 179 thai = 84, 180 tibetan = 85, 181 ugaritic = 86, 182 vai = 87, 183 old_persian = 88, 184 cuneiform = 89, 185 yi = 90, 186 inherited = 91, 187 common = 92, 188 unknown = 93 189 }; 190 }; 191 get_category(::boost::uint32_t ch)192 inline properties::category get_category(::boost::uint32_t ch) 193 { 194 return static_cast<properties::category>(detail::category_lookup(ch) & 0x3F); 195 } 196 get_major_category(::boost::uint32_t ch)197 inline properties::major_category get_major_category(::boost::uint32_t ch) 198 { 199 return static_cast<properties::major_category>(get_category(ch) >> 3); 200 } 201 is_punctuation(::boost::uint32_t ch)202 inline bool is_punctuation(::boost::uint32_t ch) 203 { 204 return get_major_category(ch) == properties::punctuation; 205 } 206 is_decimal_number(::boost::uint32_t ch)207 inline bool is_decimal_number(::boost::uint32_t ch) 208 { 209 return get_category(ch) == properties::decimal_number; 210 } 211 is_hex_digit(::boost::uint32_t ch)212 inline bool is_hex_digit(::boost::uint32_t ch) 213 { 214 return (detail::category_lookup(ch) & properties::hex_digit) != 0; 215 } 216 is_control(::boost::uint32_t ch)217 
// True when the category of `ch` is control [Cc].
inline bool is_control(::boost::uint32_t ch)
{
    return properties::control == get_category(ch);
}

// True when the alphabetic flag is set in the category table entry of `ch`.
inline bool is_alphabetic(::boost::uint32_t ch)
{
    return 0 != (detail::category_lookup(ch) & properties::alphabetic);
}

// True when `ch` is either a decimal number or alphabetic.
inline bool is_alphanumeric(::boost::uint32_t ch)
{
    return is_decimal_number(ch) || is_alphabetic(ch);
}

// True when the uppercase flag is set in the category table entry of `ch`.
inline bool is_uppercase(::boost::uint32_t ch)
{
    return 0 != (detail::category_lookup(ch) & properties::uppercase);
}

// True when the lowercase flag is set in the category table entry of `ch`.
inline bool is_lowercase(::boost::uint32_t ch)
{
    return 0 != (detail::category_lookup(ch) & properties::lowercase);
}

// True when the white_space flag is set in the category table entry of `ch`.
inline bool is_white_space(::boost::uint32_t ch)
{
    return 0 != (detail::category_lookup(ch) & properties::white_space);
}

// True when `ch` is "blank" (horizontal white space): white space that
// is neither a line-breaking control character nor a line/paragraph
// separator.
//
// UTS #18 (Annex C, Compatibility Properties) defines blank as
//   White_Space -- [LF VT FF CR NEL Line_Separator Paragraph_Separator]
// so U+0085 NEXT LINE (NEL) is rejected alongside the ASCII line breaks.
inline bool is_blank(::boost::uint32_t ch)
{
    switch (ch)
    {
        case '\n': case '\v': case '\f': case '\r':
        case 0x85: // U+0085 NEXT LINE (NEL)
            return false;
        default:
            break;
    }

    if (!is_white_space(ch))
        return false;

    // Look the category up once instead of once per comparison.
    properties::category const cat = get_category(ch);
    return cat != properties::line_separator
        && cat != properties::paragraph_separator;
}

// True when `ch` is not white space and its category is none of
// control, surrogate or unassigned.
inline bool is_graph(::boost::uint32_t ch)
{
    if (is_white_space(ch))
        return false;

    // Look the category up once instead of once per comparison.
    properties::category const cat = get_category(ch);
    return cat != properties::control
        && cat != properties::surrogate
        && cat != properties::unassigned;
}

// True when `ch` is graphic or blank, and is not a control character.
inline bool is_print(::boost::uint32_t ch)
{
    return (is_graph(ch) || is_blank(ch)) && !is_control(ch);
}

// True when the noncharacter_code_point flag is set in the category
// table entry of `ch`.
inline bool is_noncharacter_code_point(::boost::uint32_t ch)
{
    return 0 != (detail::category_lookup(ch) & properties::noncharacter_code_point);
}

is_default_ignorable_code_point(::boost::uint32_t ch)280 inline bool is_default_ignorable_code_point(::boost::uint32_t ch) 281 { 282 return (detail::category_lookup(ch) & properties::default_ignorable_code_point) != 0; 283 } 284 get_script(::boost::uint32_t ch)285 inline properties::script get_script(::boost::uint32_t ch) 286 { 287 return static_cast<properties::script>(detail::script_lookup(ch) & 0x7F); 288 } 289 to_lowercase(::boost::uint32_t ch)290 inline ::boost::uint32_t to_lowercase(::boost::uint32_t ch) 291 { 292 // The table returns 0 to signal that this code maps to itself 293 ::boost::uint32_t r = detail::lowercase_lookup(ch); 294 return (r == 0)? ch : r; 295 } 296 to_uppercase(::boost::uint32_t ch)297 inline ::boost::uint32_t to_uppercase(::boost::uint32_t ch) 298 { 299 // The table returns 0 to signal that this code maps to itself 300 ::boost::uint32_t r = detail::uppercase_lookup(ch); 301 return (r == 0)? ch : r; 302 } 303 }}} 304 305 #endif 306