1 // Copyright 2017 The UNIC Project Developers. 2 // 3 // See the COPYRIGHT file at the top-level directory of this distribution. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 //! Unicode `Word_Break` Character Property. 12 //! 13 //! ## References 14 //! 15 //! * <https://www.unicode.org/reports/tr44/#Word_Break> 16 //! * <https://www.unicode.org/reports/tr29/#Word_Boundaries> 17 //! * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values> 18 19 use unic_char_property::TotalCharProperty; 20 21 char_property! { 22 /// Represents the Unicode character 23 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break) 24 /// property. 25 /// 26 /// ## References 27 /// 28 /// * <https://www.unicode.org/reports/tr44/#Word_Break> 29 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries> 30 /// * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values> 31 pub enum WordBreak { 32 abbr => "WB"; 33 long => "Word_Break"; 34 human => "Word Break"; 35 36 /// ```text 37 /// U+000D CARRIAGE RETURN (CR) 38 /// ``` 39 CR { 40 abbr => CR, 41 long => CR, 42 human => "Carriage Return", 43 } 44 45 /// ```text 46 /// U+000A LINE FEED (LF) 47 /// ``` 48 LF { 49 abbr => LF, 50 long => LF, 51 human => "Line Feed", 52 } 53 54 /// ```text 55 /// U+000B LINE TABULATION 56 /// U+000C FORM FEED (FF) 57 /// U+0085 NEXT LINE (NEL) 58 /// U+2028 LINE SEPARATOR 59 /// U+2029 PARAGRAPH SEPARATOR 60 /// ``` 61 Newline { 62 abbr => NL, 63 long => Newline, 64 human => "Newline", 65 } 66 67 /// ```text 68 /// Grapheme_Extend = Yes, or 69 /// General_Category = Spacing_Mark 70 /// and not U+200D ZERO WIDTH JOINER (ZWJ) 71 /// ``` 72 Extend { 73 abbr => Extend, 74 long => Extend, 75 human => "Extend", 76 } 77 78 /// ```text 79 /// U+200D ZERO WIDTH JOINER 80 /// ``` 81 ZWJ { 82 abbr => ZWJ, 83 long => ZWJ, 84 human => "Zero Width Joiner (ZWJ)", 85 } 86 87 /// ```text 88 /// Regional_Indicator = Yes 89 /// ``` 90 /// 91 /// This consists of the range: 92 /// 93 /// ```text 94 /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A 95 /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z 96 /// ``` 97 RegionalIndicator { 98 abbr => RI, 99 long => Regional_Indicator, 100 human => "Regional Indicator", 101 } 102 103 /// ```text 104 /// General_Category = Format 105 /// and not U+200B ZERO WIDTH SPACE (ZWSP) 106 /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ) 107 /// and not U+200D ZERO WIDTH JOINER (ZWJ) 108 /// ``` 109 Format { 110 abbr => FO, 111 long => Format, 112 human => "Format", 113 } 114 115 /// ```text 116 /// Script = KATAKANA, or 117 /// any of the following: 118 /// U+3031 ( 〱 ) VERTICAL KANA REPEAT MARK 119 /// U+3032 ( 〲 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK 120 /// U+3033 ( 〳 ) VERTICAL KANA REPEAT MARK UPPER HALF 121 /// U+3034 ( 〴 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF 122 /// U+3035 ( 〵 ) VERTICAL KANA REPEAT MARK LOWER HALF 123 /// U+309B ( ゛ ) KATAKANA-HIRAGANA VOICED SOUND MARK 124 /// U+309C ( ゜ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK 125 /// U+30A0 ( ゠ ) KATAKANA-HIRAGANA DOUBLE HYPHEN 126 /// U+30FC ( ー ) KATAKANA-HIRAGANA PROLONGED SOUND MARK 127 /// U+FF70 ( ー ) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK 128 /// ``` 129 Katakana { 130 abbr => KA, 131 long => Katakana, 132 human => "Katakana", 133 } 134 135 /// ```text 136 /// Script = Hebrew 137 /// and General_Category = Other_Letter 138 /// ``` 139 HebrewLetter { 140 abbr => HL, 141 long => Hebrew_Letter, 142 human => "Hebrew Letter", 143 } 144 145 /// ```text 146 /// Alphabetic = Yes, or 147 /// any of the following 36 characters: 148 /// U+02C2 ( ˂ ) MODIFIER LETTER LEFT ARROWHEAD 149 /// ..U+02C5 ( ˅ ) MODIFIER LETTER DOWN ARROWHEAD 150 /// U+02D2 ( ˒ ) MODIFIER LETTER CENTRED RIGHT HALF RING 151 /// ..U+02D7 ( ˗ ) MODIFIER LETTER MINUS SIGN 152 /// U+02DE ( ˞ ) MODIFIER LETTER RHOTIC HOOK 153 /// U+02DF ( ˟ ) MODIFIER LETTER CROSS ACCENT 154 /// U+02ED ( ˭ ) MODIFIER LETTER UNASPIRATED 155 /// U+02EF ( ˯ ) MODIFIER LETTER LOW DOWN ARROWHEAD 156 /// ..U+02FF ( ˿ ) MODIFIER LETTER LOW LEFT ARROW 157 /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH 158 /// U+A720 ( ꜠ ) MODIFIER LETTER STRESS AND HIGH TONE 159 /// U+A721 ( ꜡ ) MODIFIER LETTER STRESS AND LOW TONE 160 /// U+A789 ( ꞉ ) MODIFIER LETTER COLON 161 /// U+A78A ( ꞊ ) MODIFIER LETTER SHORT EQUALS SIGN 162 /// U+AB5B ( ꭛ ) MODIFIER BREVE WITH INVERTED BREVE 163 /// and Ideographic = No 164 /// and Word_Break ≠ Katakana 165 /// and Line_Break ≠ Complex_Context (SA) 166 /// and Script ≠ Hiragana 167 /// and Word_Break ≠ Extend 168 /// and Word_Break ≠ Hebrew_Letter 169 /// ``` 170 ALetter { 171 abbr => LE, 172 long => ALetter, 173 human => "Alphabetic Letter", 174 } 175 176 /// ```text 177 /// U+0027 ( ' ) APOSTROPHE 178 /// ``` 179 SingleQuote { 180 abbr => SQ, 181 long => Single_Quote, 182 human => "Single Quote", 183 } 184 185 /// ```text 186 /// U+0022 ( " ) QUOTATION MARK 187 /// ``` 188 DoubleQuote { 189 abbr => DQ, 190 long => Double_Quote, 191 human => "Double Quote", 192 } 193 194 /// ```text 195 /// U+002E ( . ) FULL STOP 196 /// U+2018 ( ‘ ) LEFT SINGLE QUOTATION MARK 197 /// U+2019 ( ’ ) RIGHT SINGLE QUOTATION MARK 198 /// U+2024 ( ․ ) ONE DOT LEADER 199 /// U+FE52 ( ﹒ ) SMALL FULL STOP 200 /// U+FF07 ( ' ) FULLWIDTH APOSTROPHE 201 /// U+FF0E ( . ) FULLWIDTH FULL STOP 202 /// ``` 203 MidNumLet { 204 abbr => MB, 205 long => MidNumLet, 206 human => "Middle of Numeric/Letter", 207 } 208 209 /// ```text 210 /// U+00B7 ( · ) MIDDLE DOT 211 /// U+0387 ( · ) GREEK ANO TELEIA 212 /// U+05F4 ( ״ ) HEBREW PUNCTUATION GERSHAYIM 213 /// U+2027 ( ‧ ) HYPHENATION POINT 214 /// U+003A ( : ) COLON (used in Swedish) 215 /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON 216 /// U+FE55 ( ﹕ ) SMALL COLON 217 /// U+FF1A ( : ) FULLWIDTH COLON 218 /// ``` 219 MidLetter { 220 abbr => ML, 221 long => MidLetter, 222 human => "Middle of Letter", 223 } 224 225 /// ```text 226 /// Line_Break = Infix_Numeric, or 227 /// any of the following: 228 /// U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR 229 /// U+FE50 ( ﹐ ) SMALL COMMA 230 /// U+FE54 ( ﹔ ) SMALL SEMICOLON 231 /// U+FF0C ( , ) FULLWIDTH COMMA 232 /// U+FF1B ( ; ) FULLWIDTH SEMICOLON 233 /// and not U+003A ( : ) COLON 234 /// and not U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON 235 /// and not U+002E ( . ) FULL STOP 236 /// ``` 237 MidNum { 238 abbr => MN, 239 long => MidNum, 240 human => "Middle of Numeric", 241 } 242 243 /// ```text 244 /// Line_Break = Numeric 245 /// and not U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR 246 /// ``` 247 Numeric { 248 abbr => NU, 249 long => Numeric, 250 human => "Numeric", 251 } 252 253 /// ```text 254 /// General_Category = Connector_Punctuation, or 255 /// U+202F NARROW NO-BREAK SPACE (NNBSP) 256 /// ``` 257 ExtendNumLet { 258 abbr => EX, 259 long => ExtendNumLet, 260 human => "Extend Numeric/Letter", 261 } 262 263 // Emoji 264 265 /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not 266 /// occur after ZWJ in `emoji-zwj-sequences.txt`. 267 /// 268 /// See <https://www.unicode.org/reports/tr51/>. 269 EBase { 270 abbr => EB, 271 long => E_Base, 272 human => "Emoji Base", 273 } 274 275 /// Emoji characters listed as `Emoji_Modifer=Yes` in `emoji-data.txt`. 276 /// 277 /// See <https://www.unicode.org/reports/tr51/>. 278 EModifier { 279 abbr => EM, 280 long => E_Modifier, 281 human => "Emoji Modifier", 282 } 283 284 /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence, 285 /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`. 286 /// 287 /// See <https://www.unicode.org/reports/tr51/>. 288 GlueAfterZwj { 289 abbr => GAZ, 290 long => Glue_After_Zwj, 291 human => "Glue After ZWJ", 292 } 293 294 /// Emoji characters listed as `Emoji_Modifer_Base=Yes` in `emoji_data.txt`, and also occur 295 /// after ZWJ in `emoji-zwj-sequences.txt`. 296 /// 297 /// See <https://www.unicode.org/reports/tr51/>. 298 EBaseGAZ { 299 abbr => EBG, 300 long => E_Base_GAZ, 301 human => "Emoji Base and Glue After ZWJ", 302 } 303 304 /// All other characters 305 Other { 306 abbr => XX, 307 long => Other, 308 human => "Other", 309 } 310 } 311 312 /// Abbreviated name aliases for the 313 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break) 314 /// property. 315 /// 316 /// ## See Also 317 /// 318 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries> 319 pub mod abbr_names for abbr; 320 321 /// Long name aliases for the 322 /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break) 323 /// property. 324 /// 325 /// ## See Also 326 /// 327 /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries> 328 pub mod long_names for long; 329 } 330 331 impl TotalCharProperty for WordBreak { of(ch: char) -> Self332 fn of(ch: char) -> Self { 333 Self::of(ch) 334 } 335 } 336 337 impl Default for WordBreak { default() -> Self338 fn default() -> Self { 339 WordBreak::Other 340 } 341 } 342 343 mod data { 344 use super::long_names as WB; 345 use unic_char_property::tables::CharDataTable; 346 pub const WORD_BREAK_TABLE: CharDataTable<super::WordBreak> = 347 include!("../tables/word_break.rsv"); 348 } 349 350 impl WordBreak { 351 /// Find the character `Word_Break` property value. of(ch: char) -> WordBreak352 pub fn of(ch: char) -> WordBreak { 353 data::WORD_BREAK_TABLE.find_or_default(ch) 354 } 355 } 356 357 #[cfg(test)] 358 mod tests { 359 use super::WordBreak as WB; 360 use unic_char_property::EnumeratedCharProperty; 361 362 #[test] test_ascii()363 fn test_ascii() { 364 assert_eq!(WB::of('\u{0000}'), WB::Other); 365 assert_eq!(WB::of('\u{0040}'), WB::Other); 366 assert_eq!(WB::of('\u{0041}'), WB::ALetter); 367 assert_eq!(WB::of('\u{0062}'), WB::ALetter); 368 assert_eq!(WB::of('\u{007F}'), WB::Other); 369 } 370 371 #[test] test_bmp()372 fn test_bmp() { 373 // Hebrew 374 assert_eq!(WB::of('\u{0590}'), WB::Other); 375 assert_eq!(WB::of('\u{05D0}'), WB::HebrewLetter); 376 assert_eq!(WB::of('\u{05D1}'), WB::HebrewLetter); 377 assert_eq!(WB::of('\u{05FF}'), WB::Other); 378 379 // Arabic 380 assert_eq!(WB::of('\u{0600}'), WB::Format); 381 assert_eq!(WB::of('\u{0627}'), WB::ALetter); 382 assert_eq!(WB::of('\u{07BF}'), WB::Other); 383 384 // Default R + Arabic Extras 385 assert_eq!(WB::of('\u{07C0}'), WB::Numeric); 386 assert_eq!(WB::of('\u{085F}'), WB::Other); 387 assert_eq!(WB::of('\u{0860}'), WB::ALetter); 388 assert_eq!(WB::of('\u{0870}'), WB::Other); 389 assert_eq!(WB::of('\u{089F}'), WB::Other); 390 assert_eq!(WB::of('\u{08A0}'), WB::ALetter); 391 assert_eq!(WB::of('\u{089F}'), WB::Other); 392 assert_eq!(WB::of('\u{08FF}'), WB::Extend); 393 394 // Default ET 395 assert_eq!(WB::of('\u{20A0}'), WB::Other); 396 assert_eq!(WB::of('\u{20CF}'), WB::Other); 397 398 // Arabic Presentation Forms 399 assert_eq!(WB::of('\u{FB1D}'), WB::HebrewLetter); 400 assert_eq!(WB::of('\u{FB4F}'), WB::HebrewLetter); 401 assert_eq!(WB::of('\u{FB50}'), WB::ALetter); 402 assert_eq!(WB::of('\u{FDCF}'), WB::Other); 403 assert_eq!(WB::of('\u{FDF0}'), WB::ALetter); 404 assert_eq!(WB::of('\u{FDFF}'), WB::Other); 405 assert_eq!(WB::of('\u{FE70}'), WB::ALetter); 406 assert_eq!(WB::of('\u{FEFE}'), WB::Other); 407 assert_eq!(WB::of('\u{FEFF}'), WB::Format); 408 409 // noncharacters 410 assert_eq!(WB::of('\u{FDD0}'), WB::Other); 411 assert_eq!(WB::of('\u{FDD1}'), WB::Other); 412 assert_eq!(WB::of('\u{FDEE}'), WB::Other); 413 assert_eq!(WB::of('\u{FDEF}'), WB::Other); 414 assert_eq!(WB::of('\u{FFFE}'), WB::Other); 415 assert_eq!(WB::of('\u{FFFF}'), WB::Other); 416 } 417 418 #[test] test_smp()419 fn test_smp() { 420 // Default AL + R 421 assert_eq!(WB::of('\u{10800}'), WB::ALetter); 422 assert_eq!(WB::of('\u{10FFF}'), WB::Other); 423 assert_eq!(WB::of('\u{1E800}'), WB::ALetter); 424 assert_eq!(WB::of('\u{1EDFF}'), WB::Other); 425 assert_eq!(WB::of('\u{1EE00}'), WB::ALetter); 426 assert_eq!(WB::of('\u{1EEFF}'), WB::Other); 427 assert_eq!(WB::of('\u{1EF00}'), WB::Other); 428 assert_eq!(WB::of('\u{1EFFF}'), WB::Other); 429 } 430 431 #[test] test_unassigned_planes()432 fn test_unassigned_planes() { 433 assert_eq!(WB::of('\u{30000}'), WB::Other); 434 assert_eq!(WB::of('\u{40000}'), WB::Other); 435 assert_eq!(WB::of('\u{50000}'), WB::Other); 436 assert_eq!(WB::of('\u{60000}'), WB::Other); 437 assert_eq!(WB::of('\u{70000}'), WB::Other); 438 assert_eq!(WB::of('\u{80000}'), WB::Other); 439 assert_eq!(WB::of('\u{90000}'), WB::Other); 440 assert_eq!(WB::of('\u{a0000}'), WB::Other); 441 } 442 443 #[test] test_abbr_name()444 fn test_abbr_name() { 445 assert_eq!(WB::CR.abbr_name(), "CR"); 446 } 447 448 #[test] test_long_name()449 fn test_long_name() { 450 assert_eq!(WB::CR.long_name(), "CR"); 451 } 452 453 #[test] test_human_name()454 fn test_human_name() { 455 assert_eq!(WB::CR.human_name(), "Carriage Return"); 456 } 457 } 458