1 // Copyright 2017 The UNIC Project Developers. 2 // 3 // See the COPYRIGHT file at the top-level directory of this distribution. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 //! Unicode `Sentence_Break` Character Property. 12 //! 13 //! ## References 14 //! 15 //! * <https://www.unicode.org/reports/tr44/#Sentence_Break> 16 //! * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries> 17 //! * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values> 18 19 use unic_char_property::TotalCharProperty; 20 21 char_property! { 22 /// Represents the Unicode character 23 /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break) 24 /// property. 25 /// 26 /// ## References 27 /// 28 /// * <https://www.unicode.org/reports/tr44/#Sentence_Break> 29 /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries> 30 /// * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values> 31 pub enum SentenceBreak { 32 abbr => "SB"; 33 long => "Sentence_Break"; 34 human => "Sentence Break"; 35 36 /// ```text 37 /// U+000D CARRIAGE RETURN (CR) 38 /// ``` 39 CR { 40 abbr => CR, 41 long => CR, 42 human => "Carriage Return", 43 } 44 45 /// ```text 46 /// U+000A LINE FEED (LF) 47 /// ``` 48 LF { 49 abbr => LF, 50 long => LF, 51 human => "Line Feed", 52 } 53 54 /// ```text 55 /// Grapheme_Extend = Yes, or 56 /// U+200D ZERO WIDTH JOINER (ZWJ), or 57 /// General_Category = Spacing_Mark 58 /// ``` 59 Extend { 60 abbr => Extend, 61 long => Extend, 62 human => "Extend", 63 } 64 65 /// ```text 66 /// U+0085 NEXT LINE (NEL) 67 /// U+2028 LINE SEPARATOR 68 /// U+2029 PARAGRAPH SEPARATOR 69 /// ``` 70 Sep { 71 abbr => SE, 72 long => Sep, 73 human => "Separator", 74 } 75 76 /// ```text 77 /// General_Category = Format 78 /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ) 79 /// and not U+200D ZERO WIDTH JOINER (ZWJ) 80 /// ``` 81 Format { 82 abbr => FO, 83 long => Format, 84 human => "Format", 85 } 86 87 /// ```text 88 /// White_Space = Yes 89 /// and Sentence_Break ≠ Sep 90 /// and Sentence_Break ≠ CR 91 /// and Sentence_Break ≠ LF 92 /// ``` 93 Sp { 94 abbr => SP, 95 long => Sp, 96 human => "Space", 97 } 98 99 /// ```text 100 /// Lowercase = Yes 101 /// and Grapheme_Extend = No 102 /// ``` 103 Lower { 104 abbr => LO, 105 long => Lower, 106 human => "Lowercase", 107 } 108 109 /// ```text 110 /// General_Category = Titlecase_Letter, or 111 /// Uppercase = Yes 112 /// ``` 113 Upper { 114 abbr => UP, 115 long => Upper, 116 human => "Uppercase", 117 } 118 119 /// ```text 120 /// Alphabetic = Yes, or 121 /// U+00A0 NO-BREAK SPACE (NBSP), or 122 /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH 123 /// and Lower = No 124 /// and Upper = No 125 /// and Sentence_Break ≠ Extend 126 /// ``` 127 OLetter { 128 abbr => LE, 129 long => OLetter, 130 human => "Other Letter", 131 } 132 133 /// ```text 134 /// Line_Break = Numeric 135 /// ``` 136 Numeric { 137 abbr => NU, 138 long => Numeric, 139 human => "Numeric", 140 } 141 142 /// ```text 143 /// U+002E ( . ) FULL STOP 144 /// U+2024 ( ․ ) ONE DOT LEADER 145 /// U+FE52 ( ﹒ ) SMALL FULL STOP 146 /// U+FF0E ( . ) FULLWIDTH FULL STOP 147 /// ``` 148 ATerm { 149 abbr => AT, 150 long => ATerm, 151 human => "ATerm", 152 } 153 154 /// ```text 155 /// U+002C ( , ) COMMA 156 /// U+002D ( - ) HYPHEN-MINUS 157 /// U+003A ( : ) COLON 158 /// U+055D ( ՝ ) ARMENIAN COMMA 159 /// U+060C ( ، ) ARABIC COMMA 160 /// U+060D ( ؍ ) ARABIC DATE SEPARATOR 161 /// U+07F8 ( ߸ ) NKO COMMA 162 /// U+1802 ( ᠂ ) MONGOLIAN COMMA 163 /// U+1808 ( ᠈ ) MONGOLIAN MANCHU COMMA 164 /// U+2013 ( – ) EN DASH 165 /// U+2014 ( — ) EM DASH 166 /// U+3001 ( 、 ) IDEOGRAPHIC COMMA 167 /// U+FE10 ( ︐ ) PRESENTATION FORM FOR VERTICAL COMMA 168 /// U+FE11 ( ︑ ) PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA 169 /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON 170 /// U+FE31 ( ︱ ) PRESENTATION FORM FOR VERTICAL EM DASH 171 /// U+FE32 ( ︲ ) PRESENTATION FORM FOR VERTICAL EN DASH 172 /// U+FE50 ( ﹐ ) SMALL COMMA 173 /// U+FE51 ( ﹑ ) SMALL IDEOGRAPHIC COMMA 174 /// U+FE55 ( ﹕ ) SMALL COLON 175 /// U+FE58 ( ﹘ ) SMALL EM DASH 176 /// U+FE63 ( ﹣ ) SMALL HYPHEN-MINUS 177 /// U+FF0C ( , ) FULLWIDTH COMMA 178 /// U+FF0D ( - ) FULLWIDTH HYPHEN-MINUS 179 /// U+FF1A ( : ) FULLWIDTH COLON 180 /// U+FF64 ( 、 ) HALFWIDTH IDEOGRAPHIC COMMA 181 /// ``` 182 SContinue { 183 abbr => SC, 184 long => SContinue, 185 human => "Sentence Continue", 186 } 187 188 /// ```text 189 /// Sentence_Terminal = Yes 190 /// ``` 191 STerm { 192 abbr => ST, 193 long => STerm, 194 human => "Sentence Terminal", 195 } 196 197 /// ```text 198 /// General_Category = Open_Punctuation, or 199 /// General_Category = Close_Punctuation, or 200 /// Line_Break = Quotation 201 /// and not U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH 202 /// and ATerm = No 203 /// and STerm = No 204 /// ``` 205 Close { 206 abbr => CL, 207 long => Close, 208 human => "Close", 209 } 210 211 /// All other characters 212 Other { 213 abbr => XX, 214 long => Other, 215 human => "Other", 216 } 217 } 218 219 /// Abbreviated name aliases for the 220 /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break) 221 /// property. 222 /// 223 /// ## See Also 224 /// 225 /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries> 226 pub mod abbr_names for abbr; 227 228 /// Long name aliases for the 229 /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break) 230 /// property. 231 /// 232 /// ## See Also 233 /// 234 /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries> 235 pub mod long_names for long; 236 } 237 238 impl TotalCharProperty for SentenceBreak { of(ch: char) -> Self239 fn of(ch: char) -> Self { 240 Self::of(ch) 241 } 242 } 243 244 impl Default for SentenceBreak { default() -> Self245 fn default() -> Self { 246 SentenceBreak::Other 247 } 248 } 249 250 mod data { 251 use super::long_names as SB; 252 use unic_char_property::tables::CharDataTable; 253 pub const SENTENCE_BREAK_TABLE: CharDataTable<super::SentenceBreak> = 254 include!("../tables/sentence_break.rsv"); 255 } 256 257 impl SentenceBreak { 258 /// Find the character `Sentence_Break` property value. of(ch: char) -> SentenceBreak259 pub fn of(ch: char) -> SentenceBreak { 260 data::SENTENCE_BREAK_TABLE.find_or_default(ch) 261 } 262 } 263 264 #[cfg(test)] 265 mod tests { 266 use super::SentenceBreak as SB; 267 use unic_char_property::EnumeratedCharProperty; 268 269 #[test] test_ascii()270 fn test_ascii() { 271 assert_eq!(SB::of('\u{0000}'), SB::Other); 272 assert_eq!(SB::of('\u{0040}'), SB::Other); 273 assert_eq!(SB::of('\u{0041}'), SB::Upper); 274 assert_eq!(SB::of('\u{0062}'), SB::Lower); 275 assert_eq!(SB::of('\u{007F}'), SB::Other); 276 } 277 278 #[test] test_bmp()279 fn test_bmp() { 280 // Hebrew 281 assert_eq!(SB::of('\u{0590}'), SB::Other); 282 assert_eq!(SB::of('\u{05D0}'), SB::OLetter); 283 assert_eq!(SB::of('\u{05D1}'), SB::OLetter); 284 assert_eq!(SB::of('\u{05FF}'), SB::Other); 285 286 // Arabic 287 assert_eq!(SB::of('\u{0600}'), SB::Format); 288 assert_eq!(SB::of('\u{0627}'), SB::OLetter); 289 assert_eq!(SB::of('\u{07BF}'), SB::Other); 290 291 // Default R + Arabic Extras 292 assert_eq!(SB::of('\u{07C0}'), SB::Numeric); 293 assert_eq!(SB::of('\u{085F}'), SB::Other); 294 assert_eq!(SB::of('\u{0860}'), SB::OLetter); 295 assert_eq!(SB::of('\u{0870}'), SB::Other); 296 assert_eq!(SB::of('\u{089F}'), SB::Other); 297 assert_eq!(SB::of('\u{08A0}'), SB::OLetter); 298 assert_eq!(SB::of('\u{089F}'), SB::Other); 299 assert_eq!(SB::of('\u{08FF}'), SB::Extend); 300 301 // Default ET 302 assert_eq!(SB::of('\u{20A0}'), SB::Other); 303 assert_eq!(SB::of('\u{20CF}'), SB::Other); 304 305 // Arabic Presentation Forms 306 assert_eq!(SB::of('\u{FB1D}'), SB::OLetter); 307 assert_eq!(SB::of('\u{FB4F}'), SB::OLetter); 308 assert_eq!(SB::of('\u{FB50}'), SB::OLetter); 309 assert_eq!(SB::of('\u{FDCF}'), SB::Other); 310 assert_eq!(SB::of('\u{FDF0}'), SB::OLetter); 311 assert_eq!(SB::of('\u{FDFF}'), SB::Other); 312 assert_eq!(SB::of('\u{FE70}'), SB::OLetter); 313 assert_eq!(SB::of('\u{FEFE}'), SB::Other); 314 assert_eq!(SB::of('\u{FEFF}'), SB::Format); 315 316 // noncharacters 317 assert_eq!(SB::of('\u{FDD0}'), SB::Other); 318 assert_eq!(SB::of('\u{FDD1}'), SB::Other); 319 assert_eq!(SB::of('\u{FDEE}'), SB::Other); 320 assert_eq!(SB::of('\u{FDEF}'), SB::Other); 321 assert_eq!(SB::of('\u{FFFE}'), SB::Other); 322 assert_eq!(SB::of('\u{FFFF}'), SB::Other); 323 } 324 325 #[test] test_smp()326 fn test_smp() { 327 // Default AL + R 328 assert_eq!(SB::of('\u{10800}'), SB::OLetter); 329 assert_eq!(SB::of('\u{10FFF}'), SB::Other); 330 assert_eq!(SB::of('\u{1E800}'), SB::OLetter); 331 assert_eq!(SB::of('\u{1EDFF}'), SB::Other); 332 assert_eq!(SB::of('\u{1EE00}'), SB::OLetter); 333 assert_eq!(SB::of('\u{1EEFF}'), SB::Other); 334 assert_eq!(SB::of('\u{1EF00}'), SB::Other); 335 assert_eq!(SB::of('\u{1EFFF}'), SB::Other); 336 } 337 338 #[test] test_unassigned_planes()339 fn test_unassigned_planes() { 340 assert_eq!(SB::of('\u{30000}'), SB::Other); 341 assert_eq!(SB::of('\u{40000}'), SB::Other); 342 assert_eq!(SB::of('\u{50000}'), SB::Other); 343 assert_eq!(SB::of('\u{60000}'), SB::Other); 344 assert_eq!(SB::of('\u{70000}'), SB::Other); 345 assert_eq!(SB::of('\u{80000}'), SB::Other); 346 assert_eq!(SB::of('\u{90000}'), SB::Other); 347 assert_eq!(SB::of('\u{a0000}'), SB::Other); 348 } 349 350 #[test] test_abbr_name()351 fn test_abbr_name() { 352 assert_eq!(SB::CR.abbr_name(), "CR"); 353 } 354 355 #[test] test_long_name()356 fn test_long_name() { 357 assert_eq!(SB::CR.long_name(), "CR"); 358 } 359 360 #[test] test_human_name()361 fn test_human_name() { 362 assert_eq!(SB::CR.human_name(), "Carriage Return"); 363 } 364 } 365