1 // Copyright 2017 The UNIC Project Developers.
2 //
3 // See the COPYRIGHT file at the top-level directory of this distribution.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 //! Unicode `Sentence_Break` Character Property.
12 //!
13 //! ## References
14 //!
15 //! * <https://www.unicode.org/reports/tr44/#Sentence_Break>
16 //! * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
17 //! * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
18 
19 use unic_char_property::TotalCharProperty;
20 
char_property! {
    /// Represents the Unicode character
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// Each variant's documentation gives the variant's long and abbreviated
    /// names, followed by a `text` block describing which characters carry
    /// that value. See the property value table linked below for the
    /// normative definitions.
    ///
    /// ## References
    ///
    /// * <https://www.unicode.org/reports/tr44/#Sentence_Break>
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    /// * <https://www.unicode.org/reports/tr29/#Table_Sentence_Break_Property_Values>
    pub enum SentenceBreak {
        // Names of the property itself: abbreviated, long, and human-readable.
        abbr => "SB";
        long => "Sentence_Break";
        human => "Sentence Break";

        /// Carriage Return (`CR`).
        ///
        /// ```text
        /// U+000D CARRIAGE RETURN (CR)
        /// ```
        CR {
            abbr => CR,
            long => CR,
            human => "Carriage Return",
        }

        /// Line Feed (`LF`).
        ///
        /// ```text
        /// U+000A LINE FEED (LF)
        /// ```
        LF {
            abbr => LF,
            long => LF,
            human => "Line Feed",
        }

        /// Extend (`Extend`).
        ///
        /// ```text
        /// Grapheme_Extend = Yes, or
        /// U+200D ZERO WIDTH JOINER (ZWJ), or
        /// General_Category = Spacing_Mark
        /// ```
        Extend {
            abbr => Extend,
            long => Extend,
            human => "Extend",
        }

        /// Separator (`Sep`, abbreviated `SE`).
        ///
        /// ```text
        /// U+0085 NEXT LINE (NEL)
        /// U+2028 LINE SEPARATOR
        /// U+2029 PARAGRAPH SEPARATOR
        /// ```
        Sep {
            abbr => SE,
            long => Sep,
            human => "Separator",
        }

        /// Format (`Format`, abbreviated `FO`).
        ///
        /// ```text
        /// General_Category = Format
        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Format {
            abbr => FO,
            long => Format,
            human => "Format",
        }

        /// Space (`Sp`, abbreviated `SP`).
        ///
        /// ```text
        /// White_Space = Yes
        /// and Sentence_Break ≠ Sep
        /// and Sentence_Break ≠ CR
        /// and Sentence_Break ≠ LF
        /// ```
        Sp {
            abbr => SP,
            long => Sp,
            human => "Space",
        }

        /// Lowercase (`Lower`, abbreviated `LO`).
        ///
        /// ```text
        /// Lowercase = Yes
        /// and Grapheme_Extend = No
        /// ```
        Lower {
            abbr => LO,
            long => Lower,
            human => "Lowercase",
        }

        /// Uppercase (`Upper`, abbreviated `UP`).
        ///
        /// ```text
        /// General_Category = Titlecase_Letter, or
        /// Uppercase = Yes
        /// ```
        Upper {
            abbr => UP,
            long => Upper,
            human => "Uppercase",
        }

        /// Other Letter (`OLetter`, abbreviated `LE`).
        ///
        /// ```text
        /// Alphabetic = Yes, or
        /// U+00A0 NO-BREAK SPACE (NBSP), or
        /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// and Lower = No
        /// and Upper = No
        /// and Sentence_Break ≠ Extend
        /// ```
        OLetter {
            abbr => LE,
            long => OLetter,
            human => "Other Letter",
        }

        /// Numeric (`Numeric`, abbreviated `NU`).
        ///
        /// ```text
        /// Line_Break = Numeric
        /// ```
        Numeric {
            abbr => NU,
            long => Numeric,
            human => "Numeric",
        }

        /// ATerm (`ATerm`, abbreviated `AT`).
        ///
        /// ```text
        /// U+002E ( . ) FULL STOP
        /// U+2024 ( ․ ) ONE DOT LEADER
        /// U+FE52 ( ﹒ ) SMALL FULL STOP
        /// U+FF0E ( ． ) FULLWIDTH FULL STOP
        /// ```
        ATerm {
            abbr => AT,
            long => ATerm,
            human => "ATerm",
        }

        /// Sentence Continue (`SContinue`, abbreviated `SC`).
        ///
        /// ```text
        /// U+002C ( , ) COMMA
        /// U+002D ( - ) HYPHEN-MINUS
        /// U+003A ( : ) COLON
        /// U+055D ( ՝ ) ARMENIAN COMMA
        /// U+060C ( ، ) ARABIC COMMA
        /// U+060D ( ؍ ) ARABIC DATE SEPARATOR
        /// U+07F8 ( ߸ ) NKO COMMA
        /// U+1802 ( ᠂ ) MONGOLIAN COMMA
        /// U+1808 ( ᠈ ) MONGOLIAN MANCHU COMMA
        /// U+2013 ( – ) EN DASH
        /// U+2014 ( — ) EM DASH
        /// U+3001 ( 、 ) IDEOGRAPHIC COMMA
        /// U+FE10 ( ︐ ) PRESENTATION FORM FOR VERTICAL COMMA
        /// U+FE11 ( ︑ ) PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
        /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// U+FE31 ( ︱ ) PRESENTATION FORM FOR VERTICAL EM DASH
        /// U+FE32 ( ︲ ) PRESENTATION FORM FOR VERTICAL EN DASH
        /// U+FE50 ( ﹐ ) SMALL COMMA
        /// U+FE51 ( ﹑ ) SMALL IDEOGRAPHIC COMMA
        /// U+FE55 ( ﹕ ) SMALL COLON
        /// U+FE58 ( ﹘ ) SMALL EM DASH
        /// U+FE63 ( ﹣ ) SMALL HYPHEN-MINUS
        /// U+FF0C ( ， ) FULLWIDTH COMMA
        /// U+FF0D ( － ) FULLWIDTH HYPHEN-MINUS
        /// U+FF1A ( ： ) FULLWIDTH COLON
        /// U+FF64 ( ､ ) HALFWIDTH IDEOGRAPHIC COMMA
        /// ```
        SContinue {
            abbr => SC,
            long => SContinue,
            human => "Sentence Continue",
        }

        /// Sentence Terminal (`STerm`, abbreviated `ST`).
        ///
        /// ```text
        /// Sentence_Terminal = Yes
        /// ```
        STerm {
            abbr => ST,
            long => STerm,
            human => "Sentence Terminal",
        }

        /// Close (`Close`, abbreviated `CL`).
        ///
        /// ```text
        /// General_Category = Open_Punctuation, or
        /// General_Category = Close_Punctuation, or
        /// Line_Break = Quotation
        /// and not U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// and ATerm = No
        /// and STerm = No
        /// ```
        Close {
            abbr => CL,
            long => Close,
            human => "Close",
        }

        /// Other (`Other`, abbreviated `XX`): all other characters.
        ///
        /// This is also the value produced by `SentenceBreak::default()`.
        Other {
            abbr => XX,
            long => Other,
            human => "Other",
        }
    }

    /// Abbreviated name aliases for the
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    pub mod abbr_names for abbr;

    /// Long name aliases for the
    /// [`Sentence_Break`](https://www.unicode.org/reports/tr44/#Sentence_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Sentence_Boundaries>
    pub mod long_names for long;
}
237 
238 impl TotalCharProperty for SentenceBreak {
of(ch: char) -> Self239     fn of(ch: char) -> Self {
240         Self::of(ch)
241     }
242 }
243 
244 impl Default for SentenceBreak {
default() -> Self245     fn default() -> Self {
246         SentenceBreak::Other
247     }
248 }
249 
// Private module holding the `Sentence_Break` lookup table.
mod data {
    // `SB` is not referenced by any code visible in this module; presumably the
    // included table names its values through this alias — verify before
    // renaming or removing it.
    use super::long_names as SB;
    use unic_char_property::tables::CharDataTable;
    /// Table associating characters with `SentenceBreak` values; its contents
    /// are pulled in at compile time from `../tables/sentence_break.rsv`.
    pub const SENTENCE_BREAK_TABLE: CharDataTable<super::SentenceBreak> =
        include!("../tables/sentence_break.rsv");
}
256 
257 impl SentenceBreak {
258     /// Find the character `Sentence_Break` property value.
of(ch: char) -> SentenceBreak259     pub fn of(ch: char) -> SentenceBreak {
260         data::SENTENCE_BREAK_TABLE.find_or_default(ch)
261     }
262 }
263 
#[cfg(test)]
mod tests {
    use super::SentenceBreak as SB;
    use unic_char_property::EnumeratedCharProperty;

    /// Spot checks at the edges of, and inside, the ASCII range.
    #[test]
    fn test_ascii() {
        assert_eq!(SB::of('\u{0000}'), SB::Other);
        assert_eq!(SB::of('\u{0040}'), SB::Other);
        assert_eq!(SB::of('\u{0041}'), SB::Upper);
        assert_eq!(SB::of('\u{0062}'), SB::Lower);
        assert_eq!(SB::of('\u{007F}'), SB::Other);
    }

    /// Code points that the variant documentation on `SentenceBreak` lists
    /// explicitly for a given value.
    #[test]
    fn test_documented_members() {
        assert_eq!(SB::of('\u{000D}'), SB::CR);
        assert_eq!(SB::of('\u{000A}'), SB::LF);
        assert_eq!(SB::of('\u{0085}'), SB::Sep);
        assert_eq!(SB::of('\u{2028}'), SB::Sep);
        assert_eq!(SB::of('\u{2029}'), SB::Sep);
        assert_eq!(SB::of('\u{002E}'), SB::ATerm);
        assert_eq!(SB::of('\u{002C}'), SB::SContinue);
        assert_eq!(SB::of('\u{003A}'), SB::SContinue);
    }

    /// Spot checks across the Basic Multilingual Plane.
    #[test]
    fn test_bmp() {
        // Hebrew
        assert_eq!(SB::of('\u{0590}'), SB::Other);
        assert_eq!(SB::of('\u{05D0}'), SB::OLetter);
        assert_eq!(SB::of('\u{05D1}'), SB::OLetter);
        assert_eq!(SB::of('\u{05FF}'), SB::Other);

        // Arabic through Thaana (U+0600..U+07BF)
        assert_eq!(SB::of('\u{0600}'), SB::Format);
        assert_eq!(SB::of('\u{0627}'), SB::OLetter);
        assert_eq!(SB::of('\u{07BF}'), SB::Other);

        // NKo through Arabic Extended-A (U+07C0..U+08FF)
        assert_eq!(SB::of('\u{07C0}'), SB::Numeric);
        assert_eq!(SB::of('\u{085F}'), SB::Other);
        assert_eq!(SB::of('\u{0860}'), SB::OLetter);
        assert_eq!(SB::of('\u{0870}'), SB::Other);
        assert_eq!(SB::of('\u{089F}'), SB::Other);
        assert_eq!(SB::of('\u{08A0}'), SB::OLetter);
        assert_eq!(SB::of('\u{08FF}'), SB::Extend);

        // Currency Symbols (U+20A0..U+20CF)
        assert_eq!(SB::of('\u{20A0}'), SB::Other);
        assert_eq!(SB::of('\u{20CF}'), SB::Other);

        // Alphabetic and Arabic Presentation Forms (U+FB1D..U+FEFF)
        assert_eq!(SB::of('\u{FB1D}'), SB::OLetter);
        assert_eq!(SB::of('\u{FB4F}'), SB::OLetter);
        assert_eq!(SB::of('\u{FB50}'), SB::OLetter);
        assert_eq!(SB::of('\u{FDCF}'), SB::Other);
        assert_eq!(SB::of('\u{FDF0}'), SB::OLetter);
        assert_eq!(SB::of('\u{FDFF}'), SB::Other);
        assert_eq!(SB::of('\u{FE70}'), SB::OLetter);
        assert_eq!(SB::of('\u{FEFE}'), SB::Other);
        assert_eq!(SB::of('\u{FEFF}'), SB::Format);

        // noncharacters
        assert_eq!(SB::of('\u{FDD0}'), SB::Other);
        assert_eq!(SB::of('\u{FDD1}'), SB::Other);
        assert_eq!(SB::of('\u{FDEE}'), SB::Other);
        assert_eq!(SB::of('\u{FDEF}'), SB::Other);
        assert_eq!(SB::of('\u{FFFE}'), SB::Other);
        assert_eq!(SB::of('\u{FFFF}'), SB::Other);
    }

    /// Spot checks in the Supplementary Multilingual Plane.
    #[test]
    fn test_smp() {
        // U+10800..U+10FFF
        assert_eq!(SB::of('\u{10800}'), SB::OLetter);
        assert_eq!(SB::of('\u{10FFF}'), SB::Other);

        // U+1E800..U+1EFFF
        assert_eq!(SB::of('\u{1E800}'), SB::OLetter);
        assert_eq!(SB::of('\u{1EDFF}'), SB::Other);
        assert_eq!(SB::of('\u{1EE00}'), SB::OLetter);
        assert_eq!(SB::of('\u{1EEFF}'), SB::Other);
        assert_eq!(SB::of('\u{1EF00}'), SB::Other);
        assert_eq!(SB::of('\u{1EFFF}'), SB::Other);
    }

    /// The first code point of each of planes 3 through 10 maps to `Other`.
    #[test]
    fn test_unassigned_planes() {
        assert_eq!(SB::of('\u{30000}'), SB::Other);
        assert_eq!(SB::of('\u{40000}'), SB::Other);
        assert_eq!(SB::of('\u{50000}'), SB::Other);
        assert_eq!(SB::of('\u{60000}'), SB::Other);
        assert_eq!(SB::of('\u{70000}'), SB::Other);
        assert_eq!(SB::of('\u{80000}'), SB::Other);
        assert_eq!(SB::of('\u{90000}'), SB::Other);
        assert_eq!(SB::of('\u{a0000}'), SB::Other);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(SB::CR.abbr_name(), "CR");
        // `Sep` has distinct abbreviated and long names, so this also catches
        // the two accessors being mixed up (which `CR` cannot).
        assert_eq!(SB::Sep.abbr_name(), "SE");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(SB::CR.long_name(), "CR");
        assert_eq!(SB::Sep.long_name(), "Sep");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(SB::CR.human_name(), "Carriage Return");
        assert_eq!(SB::Sep.human_name(), "Separator");
    }
}
365