1 // Copyright 2017 The UNIC Project Developers. 2 // 3 // See the COPYRIGHT file at the top-level directory of this distribution. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 use unic_char_property::TotalCharProperty; 12 13 char_property! { 14 /// Represents the Unicode Character 15 /// [`General_Category`](http://unicode.org/reports/tr44/#General_Category) property. 16 /// 17 /// This is a useful breakdown into various character types which can be used as a default 18 /// categorization in implementations. For the property values, see 19 /// [`General_Category Values`](http://unicode.org/reports/tr44/#General_Category_Values). 20 pub enum GeneralCategory { 21 abbr => "gc"; 22 long => "General_Category"; 23 human => "General Category"; 24 25 /// An uppercase letter 26 UppercaseLetter { 27 abbr => Lu, 28 long => Uppercase_Letter, 29 human => "Uppercase Letter", 30 } 31 32 /// A lowercase letter 33 LowercaseLetter { 34 abbr => Ll, 35 long => Lowercase_Letter, 36 human => "Lowercase Letter", 37 } 38 39 /// A digraphic character, with first part uppercase 40 TitlecaseLetter { 41 abbr => Lt, 42 long => Titlecase_Letter, 43 human => "Titlecase Letter", 44 } 45 46 /// A modifier letter 47 ModifierLetter { 48 abbr => Lm, 49 long => Modifier_Letter, 50 human => "Modifier Letter", 51 } 52 53 /// Other letters, including syllables and ideographs 54 OtherLetter { 55 abbr => Lo, 56 long => Other_Letter, 57 human => "Other Letter", 58 } 59 60 /// A nonspacing combining mark (zero advance width) 61 NonspacingMark { 62 abbr => Mn, 63 long => Nonspacing_Mark, 64 human => "Nonspacing Mark", 65 } 66 67 /// A spacing combining mark (positive advance width) 68 SpacingMark { 69 abbr => Mc, 70 long => Spacing_Mark, 71 human => "Spacing Mark", 72 } 73 74 /// An enclosing combining mark 75 EnclosingMark { 76 abbr => Me, 77 long => Enclosing_Mark, 78 human => "Enclosing Mark", 79 } 80 81 /// A decimal digit 82 DecimalNumber { 83 abbr => Nd, 84 long => Decimal_Number, 85 human => "Decimal Digit", 86 } 87 88 /// A letterlike numeric character 89 LetterNumber { 90 abbr => Nl, 91 long => Letter_Number, 92 human => "Letterlike Number", 93 } 94 95 /// A numeric character of other type 96 OtherNumber { 97 abbr => No, 98 long => Other_Number, 99 human => "Other Numeric", 100 } 101 102 /// A connecting punctuation mark, like a tie 103 ConnectorPunctuation { 104 abbr => Pc, 105 long => Connector_Punctuation, 106 human => "Connecting Punctuation", 107 } 108 109 /// A dash or hyphen punctuation mark 110 DashPunctuation { 111 abbr => Pd, 112 long => Dash_Punctuation, 113 human => "Dash Punctuation", 114 } 115 116 /// An opening punctuation mark (of a pair) 117 OpenPunctuation { 118 abbr => Ps, 119 long => Open_Punctuation, 120 human => "Opening Punctuation", 121 } 122 123 /// A closing punctuation mark (of a pair) 124 ClosePunctuation { 125 abbr => Pe, 126 long => Close_Punctuation, 127 human => "Closing Punctuation", 128 } 129 130 /// An initial quotation mark 131 InitialPunctuation { 132 abbr => Pi, 133 long => Initial_Punctuation, 134 human => "Initial Quotation", 135 } 136 137 /// A final quotation mark 138 FinalPunctuation { 139 abbr => Pf, 140 long => Final_Punctuation, 141 human => "Final Quotation", 142 } 143 144 /// A punctuation mark of other type 145 OtherPunctuation { 146 abbr => Po, 147 long => Other_Punctuation, 148 human => "Other Punctuation", 149 } 150 151 /// A symbol of mathematical use 152 MathSymbol { 153 abbr => Sm, 154 long => Math_Symbol, 155 human => "Math Symbol", 156 } 157 158 /// A currency sign 159 CurrencySymbol { 160 abbr => Sc, 161 long => Currency_Symbol, 162 human => "Currency Symbol", 163 } 164 165 /// A non-letterlike modifier symbol 166 ModifierSymbol { 167 abbr => Sk, 168 long => Modifier_Symbol, 169 human => "Modifier Symbol", 170 } 171 172 /// A symbol of other type 173 OtherSymbol { 174 abbr => So, 175 long => Other_Symbol, 176 human => "Other Symbol", 177 } 178 179 /// A space character (of various non-zero widths) 180 SpaceSeparator { 181 abbr => Zs, 182 long => Space_Separator, 183 human => "Space", 184 } 185 186 /// U+2028 LINE SEPARATOR only 187 LineSeparator { 188 abbr => Zl, 189 long => Line_Separator, 190 human => "Line Separator", 191 } 192 193 /// U+2029 PARAGRAPH SEPARATOR only 194 ParagraphSeparator { 195 abbr => Zp, 196 long => Paragraph_Separator, 197 human => "Paragraph Separator", 198 } 199 200 /// A C0 or C1 control code 201 Control { 202 abbr => Cc, 203 long => Control, 204 human => "Control", 205 } 206 207 /// A format control character 208 Format { 209 abbr => Cf, 210 long => Format, 211 human => "Formatting", 212 } 213 214 /// A surrogate code point 215 Surrogate { 216 abbr => Cs, 217 long => Surrogate, 218 human => "Surrogate", 219 } 220 221 /// A private-use character 222 PrivateUse { 223 abbr => Co, 224 long => Private_Use, 225 human => "Private-Use", 226 } 227 228 /// Unassigned 229 Unassigned { 230 abbr => Cn, 231 long => Unassigned, 232 human => "Unassigned", 233 } 234 } 235 236 pub mod abbr_names for abbr; 237 pub mod long_names for long; 238 } 239 240 impl TotalCharProperty for GeneralCategory { of(ch: char) -> Self241 fn of(ch: char) -> Self { 242 Self::of(ch) 243 } 244 } 245 246 impl Default for GeneralCategory { default() -> Self247 fn default() -> Self { 248 GeneralCategory::Unassigned 249 } 250 } 251 252 mod data { 253 use super::abbr_names::*; 254 use unic_char_property::tables::CharDataTable; 255 pub const GENERAL_CATEGORY_TABLE: CharDataTable<super::GeneralCategory> = 256 include!("../tables/general_category.rsv"); 257 } 258 259 impl GeneralCategory { 260 /// Find the `GeneralCategory` of a single char. of(ch: char) -> GeneralCategory261 pub fn of(ch: char) -> GeneralCategory { 262 data::GENERAL_CATEGORY_TABLE.find_or_default(ch) 263 } 264 } 265 266 impl GeneralCategory { 267 /// `Lu` | `Ll` | `Lt` (Short form: `LC`) is_cased_letter(&self) -> bool268 pub fn is_cased_letter(&self) -> bool { 269 use self::abbr_names::*; 270 matches!(*self, Lu | Ll | Lt) 271 } 272 273 /// `Lu` | `Ll` | `Lt` | `Lm` | `Lo` (Short form: `L`) is_letter(&self) -> bool274 pub fn is_letter(&self) -> bool { 275 use self::abbr_names::*; 276 matches!(*self, Lu | Ll | Lt | Lm | Lo) 277 } 278 279 /// `Mn` | `Mc` | `Me` (Short form: `M`) is_mark(&self) -> bool280 pub fn is_mark(&self) -> bool { 281 use self::abbr_names::*; 282 matches!(*self, Mn | Mc | Me) 283 } 284 285 /// `Nd` | `Nl` | `No` (Short form: `N`) is_number(&self) -> bool286 pub fn is_number(&self) -> bool { 287 use self::abbr_names::*; 288 matches!(*self, Nd | Nl | No) 289 } 290 291 /// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po` (Short form: `P`) is_punctuation(&self) -> bool292 pub fn is_punctuation(&self) -> bool { 293 use self::abbr_names::*; 294 matches!(*self, Pc | Pd | Ps | Pe | Pi | Pf | Po) 295 } 296 297 /// `Sm` | `Sc` | `Sk` | `So` (Short form: `S`) is_symbol(&self) -> bool298 pub fn is_symbol(&self) -> bool { 299 use self::abbr_names::*; 300 matches!(*self, Sm | Sc | Sk | So) 301 } 302 303 /// `Zs` | `Zl` | `Zp` (Short form: `Z`) is_separator(&self) -> bool304 pub fn is_separator(&self) -> bool { 305 use self::abbr_names::*; 306 matches!(*self, Zs | Zl | Zp) 307 } 308 309 /// `Cc` | `Cf` | `Cs` | `Co` | `Cn` (Short form: `C`) is_other(&self) -> bool310 pub fn is_other(&self) -> bool { 311 use self::abbr_names::*; 312 matches!(*self, Cc | Cf | Cs | Co | Cn) 313 } 314 } 315 316 #[cfg(test)] 317 mod tests { 318 use super::GeneralCategory as GC; 319 use core::char; 320 use unic_char_property::EnumeratedCharProperty; 321 322 #[test] test_ascii()323 fn test_ascii() { 324 for c in 0x00..(0x1F + 1) { 325 let c = char::from_u32(c).unwrap(); 326 assert_eq!(GC::of(c), GC::Control); 327 } 328 329 assert_eq!(GC::of(' '), GC::SpaceSeparator); 330 assert_eq!(GC::of('!'), GC::OtherPunctuation); 331 assert_eq!(GC::of('"'), GC::OtherPunctuation); 332 assert_eq!(GC::of('#'), GC::OtherPunctuation); 333 assert_eq!(GC::of('$'), GC::CurrencySymbol); 334 assert_eq!(GC::of('%'), GC::OtherPunctuation); 335 assert_eq!(GC::of('&'), GC::OtherPunctuation); 336 assert_eq!(GC::of('\''), GC::OtherPunctuation); 337 assert_eq!(GC::of('('), GC::OpenPunctuation); 338 assert_eq!(GC::of(')'), GC::ClosePunctuation); 339 assert_eq!(GC::of('*'), GC::OtherPunctuation); 340 assert_eq!(GC::of('+'), GC::MathSymbol); 341 assert_eq!(GC::of(','), GC::OtherPunctuation); 342 assert_eq!(GC::of('-'), GC::DashPunctuation); 343 assert_eq!(GC::of('.'), GC::OtherPunctuation); 344 assert_eq!(GC::of('/'), GC::OtherPunctuation); 345 346 for c in ('0' as u32)..('9' as u32 + 1) { 347 let c = char::from_u32(c).unwrap(); 348 assert_eq!(GC::of(c), GC::DecimalNumber); 349 } 350 351 assert_eq!(GC::of(':'), GC::OtherPunctuation); 352 assert_eq!(GC::of(';'), GC::OtherPunctuation); 353 assert_eq!(GC::of('<'), GC::MathSymbol); 354 assert_eq!(GC::of('='), GC::MathSymbol); 355 assert_eq!(GC::of('>'), GC::MathSymbol); 356 assert_eq!(GC::of('?'), GC::OtherPunctuation); 357 assert_eq!(GC::of('@'), GC::OtherPunctuation); 358 359 for c in ('A' as u32)..('Z' as u32 + 1) { 360 let c = char::from_u32(c).unwrap(); 361 assert_eq!(GC::of(c), GC::UppercaseLetter); 362 } 363 364 assert_eq!(GC::of('['), GC::OpenPunctuation); 365 assert_eq!(GC::of('\\'), GC::OtherPunctuation); 366 assert_eq!(GC::of(']'), GC::ClosePunctuation); 367 assert_eq!(GC::of('^'), GC::ModifierSymbol); 368 assert_eq!(GC::of('_'), GC::ConnectorPunctuation); 369 assert_eq!(GC::of('`'), GC::ModifierSymbol); 370 371 for c in ('a' as u32)..('z' as u32 + 1) { 372 let c = char::from_u32(c).unwrap(); 373 assert_eq!(GC::of(c), GC::LowercaseLetter); 374 } 375 376 assert_eq!(GC::of('{'), GC::OpenPunctuation); 377 assert_eq!(GC::of('|'), GC::MathSymbol); 378 assert_eq!(GC::of('}'), GC::ClosePunctuation); 379 assert_eq!(GC::of('~'), GC::MathSymbol); 380 } 381 382 #[test] test_bmp_edge()383 fn test_bmp_edge() { 384 // 0xFEFF ZERO WIDTH NO-BREAK SPACE (or) BYTE ORDER MARK 385 let bom = '\u{FEFF}'; 386 assert_eq!(GC::of(bom), GC::Format); 387 // 0xFFFC OBJECT REPLACEMENT CHARACTER 388 assert_eq!(GC::of(''), GC::OtherSymbol); 389 // 0xFFFD REPLACEMENT CHARACTER 390 assert_eq!(GC::of('�'), GC::OtherSymbol); 391 392 for &c in [0xFFEF, 0xFFFE, 0xFFFF].iter() { 393 let c = char::from_u32(c).unwrap(); 394 assert_eq!(GC::of(c), GC::Unassigned); 395 } 396 } 397 398 #[test] test_private_use()399 fn test_private_use() { 400 for c in 0xF_0000..(0xF_FFFD + 1) { 401 let c = char::from_u32(c).unwrap(); 402 assert_eq!(GC::of(c), GC::PrivateUse); 403 } 404 405 for c in 0x10_0000..(0x10_FFFD + 1) { 406 let c = char::from_u32(c).unwrap(); 407 assert_eq!(GC::of(c), GC::PrivateUse); 408 } 409 410 for &c in [0xF_FFFE, 0xF_FFFF, 0x10_FFFE, 0x10_FFFF].iter() { 411 let c = char::from_u32(c).unwrap(); 412 assert_eq!(GC::of(c), GC::Unassigned); 413 } 414 } 415 416 #[test] test_abbr_name()417 fn test_abbr_name() { 418 assert_eq!(GC::UppercaseLetter.abbr_name(), "Lu"); 419 assert_eq!(GC::Unassigned.abbr_name(), "Cn"); 420 } 421 422 #[test] test_long_name()423 fn test_long_name() { 424 assert_eq!(GC::UppercaseLetter.long_name(), "Uppercase_Letter"); 425 assert_eq!(GC::Unassigned.long_name(), "Unassigned"); 426 } 427 428 #[test] test_human_name()429 fn test_human_name() { 430 assert_eq!(GC::UppercaseLetter.human_name(), "Uppercase Letter"); 431 assert_eq!(GC::Unassigned.human_name(), "Unassigned"); 432 } 433 } 434