1 // Copyright 2017 The UNIC Project Developers.
2 //
3 // See the COPYRIGHT file at the top-level directory of this distribution.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 //! Unicode `Word_Break` Character Property.
12 //!
13 //! ## References
14 //!
15 //! * <https://www.unicode.org/reports/tr44/#Word_Break>
16 //! * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
17 //! * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
18 
19 use unic_char_property::TotalCharProperty;
20 
char_property! {
    /// Represents the Unicode character
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// Each variant's documentation describes the characters that carry that value; the
    /// `abbr`, `long` and `human` entries of a variant give the value's names.
    ///
    /// ## References
    ///
    /// * <https://www.unicode.org/reports/tr44/#Word_Break>
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    /// * <https://www.unicode.org/reports/tr29/#Table_Word_Break_Property_Values>
    pub enum WordBreak {
        // Names of the property itself (the per-value names follow in each variant).
        abbr => "WB";
        long => "Word_Break";
        human => "Word Break";

        /// Carriage return (`CR`).
        ///
        /// ```text
        /// U+000D CARRIAGE RETURN (CR)
        /// ```
        CR {
            abbr => CR,
            long => CR,
            human => "Carriage Return",
        }

        /// Line feed (`LF`).
        ///
        /// ```text
        /// U+000A LINE FEED (LF)
        /// ```
        LF {
            abbr => LF,
            long => LF,
            human => "Line Feed",
        }

        /// Newline characters other than CR and LF (`NL`).
        ///
        /// ```text
        /// U+000B LINE TABULATION
        /// U+000C FORM FEED (FF)
        /// U+0085 NEXT LINE (NEL)
        /// U+2028 LINE SEPARATOR
        /// U+2029 PARAGRAPH SEPARATOR
        /// ```
        Newline {
            abbr => NL,
            long => Newline,
            human => "Newline",
        }

        /// Extending characters (`Extend`).
        ///
        /// ```text
        /// Grapheme_Extend = Yes, or
        /// General_Category = Spacing_Mark
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Extend {
            abbr => Extend,
            long => Extend,
            human => "Extend",
        }

        /// Zero width joiner (`ZWJ`).
        ///
        /// ```text
        /// U+200D ZERO WIDTH JOINER
        /// ```
        ZWJ {
            abbr => ZWJ,
            long => ZWJ,
            human => "Zero Width Joiner (ZWJ)",
        }

        /// Regional indicator symbols (`RI`).
        ///
        /// ```text
        /// Regional_Indicator = Yes
        /// ```
        ///
        /// This consists of the range:
        ///
        /// ```text
        /// U+1F1E6 REGIONAL INDICATOR SYMBOL LETTER A
        /// ..U+1F1FF REGIONAL INDICATOR SYMBOL LETTER Z
        /// ```
        RegionalIndicator {
            abbr => RI,
            long => Regional_Indicator,
            human => "Regional Indicator",
        }

        /// Format characters (`FO`).
        ///
        /// ```text
        /// General_Category = Format
        /// and not U+200B ZERO WIDTH SPACE (ZWSP)
        /// and not U+200C ZERO WIDTH NON-JOINER (ZWNJ)
        /// and not U+200D ZERO WIDTH JOINER (ZWJ)
        /// ```
        Format {
            abbr => FO,
            long => Format,
            human => "Format",
        }

        /// Katakana characters and related marks (`KA`).
        ///
        /// ```text
        /// Script = KATAKANA, or
        /// any of the following:
        /// U+3031 ( 〱 ) VERTICAL KANA REPEAT MARK
        /// U+3032 ( 〲 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK
        /// U+3033 ( 〳 ) VERTICAL KANA REPEAT MARK UPPER HALF
        /// U+3034 ( 〴 ) VERTICAL KANA REPEAT WITH VOICED SOUND MARK UPPER HALF
        /// U+3035 ( 〵 ) VERTICAL KANA REPEAT MARK LOWER HALF
        /// U+309B ( ゛ ) KATAKANA-HIRAGANA VOICED SOUND MARK
        /// U+309C ( ゜ ) KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
        /// U+30A0 ( ゠ ) KATAKANA-HIRAGANA DOUBLE HYPHEN
        /// U+30FC ( ー ) KATAKANA-HIRAGANA PROLONGED SOUND MARK
        /// U+FF70 ( ｰ ) HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
        /// ```
        Katakana {
            abbr => KA,
            long => Katakana,
            human => "Katakana",
        }

        /// Hebrew letters (`HL`).
        ///
        /// ```text
        /// Script = Hebrew
        /// and General_Category = Other_Letter
        /// ```
        HebrewLetter {
            abbr => HL,
            long => Hebrew_Letter,
            human => "Hebrew Letter",
        }

        /// Alphabetic letters (`LE`).
        ///
        /// ```text
        /// Alphabetic = Yes, or
        /// any of the following 36 characters:
        /// U+02C2 ( ˂ ) MODIFIER LETTER LEFT ARROWHEAD
        /// ..U+02C5 ( ˅ ) MODIFIER LETTER DOWN ARROWHEAD
        /// U+02D2 ( ˒ ) MODIFIER LETTER CENTRED RIGHT HALF RING
        /// ..U+02D7 ( ˗ ) MODIFIER LETTER MINUS SIGN
        /// U+02DE ( ˞ ) MODIFIER LETTER RHOTIC HOOK
        /// U+02DF ( ˟ ) MODIFIER LETTER CROSS ACCENT
        /// U+02ED ( ˭ ) MODIFIER LETTER UNASPIRATED
        /// U+02EF ( ˯ ) MODIFIER LETTER LOW DOWN ARROWHEAD
        /// ..U+02FF ( ˿ ) MODIFIER LETTER LOW LEFT ARROW
        /// U+05F3 ( ׳ ) HEBREW PUNCTUATION GERESH
        /// U+A720 ( ꜠ ) MODIFIER LETTER STRESS AND HIGH TONE
        /// U+A721 ( ꜡ ) MODIFIER LETTER STRESS AND LOW TONE
        /// U+A789 ( ꞉ ) MODIFIER LETTER COLON
        /// U+A78A ( ꞊ ) MODIFIER LETTER SHORT EQUALS SIGN
        /// U+AB5B ( ꭛ ) MODIFIER BREVE WITH INVERTED BREVE
        /// and Ideographic = No
        /// and Word_Break ≠ Katakana
        /// and Line_Break ≠ Complex_Context (SA)
        /// and Script ≠ Hiragana
        /// and Word_Break ≠ Extend
        /// and Word_Break ≠ Hebrew_Letter
        /// ```
        ALetter {
            abbr => LE,
            long => ALetter,
            human => "Alphabetic Letter",
        }

        /// Single quote (`SQ`).
        ///
        /// ```text
        /// U+0027 ( ' ) APOSTROPHE
        /// ```
        SingleQuote {
            abbr => SQ,
            long => Single_Quote,
            human => "Single Quote",
        }

        /// Double quote (`DQ`).
        ///
        /// ```text
        /// U+0022 ( " ) QUOTATION MARK
        /// ```
        DoubleQuote {
            abbr => DQ,
            long => Double_Quote,
            human => "Double Quote",
        }

        /// Middle of numeric or letter sequences (`MB`).
        ///
        /// ```text
        /// U+002E ( . ) FULL STOP
        /// U+2018 ( ‘ ) LEFT SINGLE QUOTATION MARK
        /// U+2019 ( ’ ) RIGHT SINGLE QUOTATION MARK
        /// U+2024 ( ․ ) ONE DOT LEADER
        /// U+FE52 ( ﹒ ) SMALL FULL STOP
        /// U+FF07 ( ＇ ) FULLWIDTH APOSTROPHE
        /// U+FF0E ( ． ) FULLWIDTH FULL STOP
        /// ```
        MidNumLet {
            abbr => MB,
            long => MidNumLet,
            human => "Middle of Numeric/Letter",
        }

        /// Middle of letter sequences (`ML`).
        ///
        /// ```text
        /// U+00B7 ( · ) MIDDLE DOT
        /// U+0387 ( · ) GREEK ANO TELEIA
        /// U+05F4 ( ״ ) HEBREW PUNCTUATION GERSHAYIM
        /// U+2027 ( ‧ ) HYPHENATION POINT
        /// U+003A ( : ) COLON (used in Swedish)
        /// U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// U+FE55 ( ﹕ ) SMALL COLON
        /// U+FF1A ( ： ) FULLWIDTH COLON
        /// ```
        MidLetter {
            abbr => ML,
            long => MidLetter,
            human => "Middle of Letter",
        }

        /// Middle of numeric sequences (`MN`).
        ///
        /// ```text
        /// Line_Break = Infix_Numeric, or
        /// any of the following:
        /// U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
        /// U+FE50 ( ﹐ ) SMALL COMMA
        /// U+FE54 ( ﹔ ) SMALL SEMICOLON
        /// U+FF0C ( ， ) FULLWIDTH COMMA
        /// U+FF1B ( ； ) FULLWIDTH SEMICOLON
        /// and not U+003A ( : ) COLON
        /// and not U+FE13 ( ︓ ) PRESENTATION FORM FOR VERTICAL COLON
        /// and not U+002E ( . ) FULL STOP
        /// ```
        MidNum {
            abbr => MN,
            long => MidNum,
            human => "Middle of Numeric",
        }

        /// Numeric characters (`NU`).
        ///
        /// ```text
        /// Line_Break = Numeric
        /// and not U+066C ( ٬ ) ARABIC THOUSANDS SEPARATOR
        /// ```
        Numeric {
            abbr => NU,
            long => Numeric,
            human => "Numeric",
        }

        /// Characters that extend numeric or letter sequences (`EX`).
        ///
        /// ```text
        /// General_Category = Connector_Punctuation, or
        /// U+202F NARROW NO-BREAK SPACE (NNBSP)
        /// ```
        ExtendNumLet {
            abbr => EX,
            long => ExtendNumLet,
            human => "Extend Numeric/Letter",
        }

        // Emoji-related values

        /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, which do not
        /// occur after ZWJ in `emoji-zwj-sequences.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EBase {
            abbr => EB,
            long => E_Base,
            human => "Emoji Base",
        }

        /// Emoji characters listed as `Emoji_Modifier=Yes` in `emoji-data.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EModifier {
            abbr => EM,
            long => E_Modifier,
            human => "Emoji Modifier",
        }

        /// Emoji characters that do not break from a previous ZWJ in a defined emoji ZWJ sequence,
        /// and are not listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        GlueAfterZwj {
            abbr => GAZ,
            long => Glue_After_Zwj,
            human => "Glue After ZWJ",
        }

        /// Emoji characters listed as `Emoji_Modifier_Base=Yes` in `emoji-data.txt`, and also occur
        /// after ZWJ in `emoji-zwj-sequences.txt`.
        ///
        /// See <https://www.unicode.org/reports/tr51/>.
        EBaseGAZ {
            abbr => EBG,
            long => E_Base_GAZ,
            human => "Emoji Base and Glue After ZWJ",
        }

        /// All other characters (`XX`).
        Other {
            abbr => XX,
            long => Other,
            human => "Other",
        }
    }

    /// Abbreviated name aliases for the
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    pub mod abbr_names for abbr;

    /// Long name aliases for the
    /// [`Word_Break`](https://www.unicode.org/reports/tr44/#Word_Break)
    /// property.
    ///
    /// ## See Also
    ///
    /// * <https://www.unicode.org/reports/tr29/#Word_Boundaries>
    pub mod long_names for long;
}
330 
331 impl TotalCharProperty for WordBreak {
of(ch: char) -> Self332     fn of(ch: char) -> Self {
333         Self::of(ch)
334     }
335 }
336 
337 impl Default for WordBreak {
default() -> Self338     fn default() -> Self {
339         WordBreak::Other
340     }
341 }
342 
/// Private module holding the lookup table consulted by `WordBreak::of`.
mod data {
    // NOTE(review): `WB` is not referenced by any code visible in this module; it is
    // presumably the alias the included table file uses to name values. Confirm against
    // `tables/word_break.rsv` before renaming or removing it.
    use super::long_names as WB;
    use unic_char_property::tables::CharDataTable;
    /// Table associating characters with their `Word_Break` value.
    ///
    /// The table expression is pulled in at compile time from `../tables/word_break.rsv`.
    pub const WORD_BREAK_TABLE: CharDataTable<super::WordBreak> =
        include!("../tables/word_break.rsv");
}
349 
350 impl WordBreak {
351     /// Find the character `Word_Break` property value.
of(ch: char) -> WordBreak352     pub fn of(ch: char) -> WordBreak {
353         data::WORD_BREAK_TABLE.find_or_default(ch)
354     }
355 }
356 
#[cfg(test)]
mod tests {
    // Spot checks of `WordBreak::of` across several code-point ranges, plus the name
    // accessors provided through `EnumeratedCharProperty`.
    use super::WordBreak as WB;
    use unic_char_property::EnumeratedCharProperty;

    #[test]
    fn test_ascii() {
        // Controls and punctuation are `Other`; Latin letters are `ALetter`.
        assert_eq!(WB::of('\u{0000}'), WB::Other);
        assert_eq!(WB::of('\u{0040}'), WB::Other);
        assert_eq!(WB::of('\u{0041}'), WB::ALetter);
        assert_eq!(WB::of('\u{0062}'), WB::ALetter);
        assert_eq!(WB::of('\u{007F}'), WB::Other);
    }

    #[test]
    fn test_bmp() {
        // Hebrew (U+0590..U+05FF): both ends of the range are `Other`, letters are
        // `HebrewLetter`.
        assert_eq!(WB::of('\u{0590}'), WB::Other);
        assert_eq!(WB::of('\u{05D0}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05D1}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{05FF}'), WB::Other);

        // Arabic and the following ranges up to U+07BF
        assert_eq!(WB::of('\u{0600}'), WB::Format);
        assert_eq!(WB::of('\u{0627}'), WB::ALetter);
        assert_eq!(WB::of('\u{07BF}'), WB::Other);

        // U+07C0..U+08FF, including the values on either side of U+0860 and U+08A0
        assert_eq!(WB::of('\u{07C0}'), WB::Numeric);
        assert_eq!(WB::of('\u{085F}'), WB::Other);
        assert_eq!(WB::of('\u{0860}'), WB::ALetter);
        assert_eq!(WB::of('\u{0870}'), WB::Other);
        assert_eq!(WB::of('\u{089F}'), WB::Other);
        assert_eq!(WB::of('\u{08A0}'), WB::ALetter);
        assert_eq!(WB::of('\u{08FF}'), WB::Extend);

        // Currency symbols (U+20A0..U+20CF)
        assert_eq!(WB::of('\u{20A0}'), WB::Other);
        assert_eq!(WB::of('\u{20CF}'), WB::Other);

        // Hebrew and Arabic presentation forms (U+FB1D..U+FEFF)
        assert_eq!(WB::of('\u{FB1D}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB4F}'), WB::HebrewLetter);
        assert_eq!(WB::of('\u{FB50}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDCF}'), WB::Other);
        assert_eq!(WB::of('\u{FDF0}'), WB::ALetter);
        assert_eq!(WB::of('\u{FDFF}'), WB::Other);
        assert_eq!(WB::of('\u{FE70}'), WB::ALetter);
        assert_eq!(WB::of('\u{FEFE}'), WB::Other);
        assert_eq!(WB::of('\u{FEFF}'), WB::Format);

        // Noncharacters
        assert_eq!(WB::of('\u{FDD0}'), WB::Other);
        assert_eq!(WB::of('\u{FDD1}'), WB::Other);
        assert_eq!(WB::of('\u{FDEE}'), WB::Other);
        assert_eq!(WB::of('\u{FDEF}'), WB::Other);
        assert_eq!(WB::of('\u{FFFE}'), WB::Other);
        assert_eq!(WB::of('\u{FFFF}'), WB::Other);
    }

    #[test]
    fn test_smp() {
        // Spot checks in U+10800..U+10FFF and U+1E800..U+1EFFF
        assert_eq!(WB::of('\u{10800}'), WB::ALetter);
        assert_eq!(WB::of('\u{10FFF}'), WB::Other);
        assert_eq!(WB::of('\u{1E800}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EDFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EE00}'), WB::ALetter);
        assert_eq!(WB::of('\u{1EEFF}'), WB::Other);
        assert_eq!(WB::of('\u{1EF00}'), WB::Other);
        assert_eq!(WB::of('\u{1EFFF}'), WB::Other);
    }

    #[test]
    fn test_unassigned_planes() {
        // The first code point of each of planes 3 through 10 is expected to be `Other`.
        assert_eq!(WB::of('\u{30000}'), WB::Other);
        assert_eq!(WB::of('\u{40000}'), WB::Other);
        assert_eq!(WB::of('\u{50000}'), WB::Other);
        assert_eq!(WB::of('\u{60000}'), WB::Other);
        assert_eq!(WB::of('\u{70000}'), WB::Other);
        assert_eq!(WB::of('\u{80000}'), WB::Other);
        assert_eq!(WB::of('\u{90000}'), WB::Other);
        assert_eq!(WB::of('\u{a0000}'), WB::Other);
    }

    #[test]
    fn test_abbr_name() {
        assert_eq!(WB::CR.abbr_name(), "CR");
    }

    #[test]
    fn test_long_name() {
        assert_eq!(WB::CR.long_name(), "CR");
    }

    #[test]
    fn test_human_name() {
        assert_eq!(WB::CR.human_name(), "Carriage Return");
    }
}
458