1 // Copyright 2017 The UNIC Project Developers.
2 //
3 // See the COPYRIGHT file at the top-level directory of this distribution.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use unic_char_property::TotalCharProperty;
12 
13 char_property! {
14     /// Represents the Unicode Character
15     /// [`General_Category`](http://unicode.org/reports/tr44/#General_Category) property.
16     ///
17     /// This is a useful breakdown into various character types which can be used as a default
18     /// categorization in implementations. For the property values, see
19     /// [`General_Category Values`](http://unicode.org/reports/tr44/#General_Category_Values).
20     pub enum GeneralCategory {
21         abbr => "gc";
22         long => "General_Category";
23         human => "General Category";
24 
25         /// An uppercase letter
26         UppercaseLetter {
27             abbr => Lu,
28             long => Uppercase_Letter,
29             human => "Uppercase Letter",
30         }
31 
32         /// A lowercase letter
33         LowercaseLetter {
34             abbr => Ll,
35             long => Lowercase_Letter,
36             human => "Lowercase Letter",
37         }
38 
39         /// A digraphic character, with first part uppercase
40         TitlecaseLetter {
41             abbr => Lt,
42             long => Titlecase_Letter,
43             human => "Titlecase Letter",
44         }
45 
46         /// A modifier letter
47         ModifierLetter {
48             abbr => Lm,
49             long => Modifier_Letter,
50             human => "Modifier Letter",
51         }
52 
53         /// Other letters, including syllables and ideographs
54         OtherLetter {
55             abbr => Lo,
56             long => Other_Letter,
57             human => "Other Letter",
58         }
59 
60         /// A nonspacing combining mark (zero advance width)
61         NonspacingMark {
62             abbr => Mn,
63             long => Nonspacing_Mark,
64             human => "Nonspacing Mark",
65         }
66 
67         /// A spacing combining mark (positive advance width)
68         SpacingMark {
69             abbr => Mc,
70             long => Spacing_Mark,
71             human => "Spacing Mark",
72         }
73 
74         /// An enclosing combining mark
75         EnclosingMark {
76             abbr => Me,
77             long => Enclosing_Mark,
78             human => "Enclosing Mark",
79         }
80 
81         /// A decimal digit
82         DecimalNumber {
83             abbr => Nd,
84             long => Decimal_Number,
85             human => "Decimal Digit",
86         }
87 
88         /// A letterlike numeric character
89         LetterNumber {
90             abbr => Nl,
91             long => Letter_Number,
92             human => "Letterlike Number",
93         }
94 
95         /// A numeric character of other type
96         OtherNumber {
97             abbr => No,
98             long => Other_Number,
99             human => "Other Numeric",
100         }
101 
102         /// A connecting punctuation mark, like a tie
103         ConnectorPunctuation {
104             abbr => Pc,
105             long => Connector_Punctuation,
106             human => "Connecting Punctuation",
107         }
108 
109         /// A dash or hyphen punctuation mark
110         DashPunctuation {
111             abbr => Pd,
112             long => Dash_Punctuation,
113             human => "Dash Punctuation",
114         }
115 
116         /// An opening punctuation mark (of a pair)
117         OpenPunctuation {
118             abbr => Ps,
119             long => Open_Punctuation,
120             human => "Opening Punctuation",
121         }
122 
123         /// A closing punctuation mark (of a pair)
124         ClosePunctuation {
125             abbr => Pe,
126             long => Close_Punctuation,
127             human => "Closing Punctuation",
128         }
129 
130         /// An initial quotation mark
131         InitialPunctuation {
132             abbr => Pi,
133             long => Initial_Punctuation,
134             human => "Initial Quotation",
135         }
136 
137         /// A final quotation mark
138         FinalPunctuation {
139             abbr => Pf,
140             long => Final_Punctuation,
141             human => "Final Quotation",
142         }
143 
144         /// A punctuation mark of other type
145         OtherPunctuation {
146             abbr => Po,
147             long => Other_Punctuation,
148             human => "Other Punctuation",
149         }
150 
151         /// A symbol of mathematical use
152         MathSymbol {
153             abbr => Sm,
154             long => Math_Symbol,
155             human => "Math Symbol",
156         }
157 
158         /// A currency sign
159         CurrencySymbol {
160             abbr => Sc,
161             long => Currency_Symbol,
162             human => "Currency Symbol",
163         }
164 
165         /// A non-letterlike modifier symbol
166         ModifierSymbol {
167             abbr => Sk,
168             long => Modifier_Symbol,
169             human => "Modifier Symbol",
170         }
171 
172         /// A symbol of other type
173         OtherSymbol {
174             abbr => So,
175             long => Other_Symbol,
176             human => "Other Symbol",
177         }
178 
179         /// A space character (of various non-zero widths)
180         SpaceSeparator {
181             abbr => Zs,
182             long => Space_Separator,
183             human => "Space",
184         }
185 
186         /// U+2028 LINE SEPARATOR only
187         LineSeparator {
188             abbr => Zl,
189             long => Line_Separator,
190             human => "Line Separator",
191         }
192 
193         /// U+2029 PARAGRAPH SEPARATOR only
194         ParagraphSeparator {
195             abbr => Zp,
196             long => Paragraph_Separator,
197             human => "Paragraph Separator",
198         }
199 
200         /// A C0 or C1 control code
201         Control {
202             abbr => Cc,
203             long => Control,
204             human => "Control",
205         }
206 
207         /// A format control character
208         Format {
209             abbr => Cf,
210             long => Format,
211             human => "Formatting",
212         }
213 
214         /// A surrogate code point
215         Surrogate {
216             abbr => Cs,
217             long => Surrogate,
218             human => "Surrogate",
219         }
220 
221         /// A private-use character
222         PrivateUse {
223             abbr => Co,
224             long => Private_Use,
225             human => "Private-Use",
226         }
227 
228         /// Unassigned
229         Unassigned {
230             abbr => Cn,
231             long => Unassigned,
232             human => "Unassigned",
233         }
234     }
235 
236     pub mod abbr_names for abbr;
237     pub mod long_names for long;
238 }
239 
240 impl TotalCharProperty for GeneralCategory {
of(ch: char) -> Self241     fn of(ch: char) -> Self {
242         Self::of(ch)
243     }
244 }
245 
246 impl Default for GeneralCategory {
default() -> Self247     fn default() -> Self {
248         GeneralCategory::Unassigned
249     }
250 }
251 
252 mod data {
253     use super::abbr_names::*;
254     use unic_char_property::tables::CharDataTable;
255     pub const GENERAL_CATEGORY_TABLE: CharDataTable<super::GeneralCategory> =
256         include!("../tables/general_category.rsv");
257 }
258 
259 impl GeneralCategory {
260     /// Find the `GeneralCategory` of a single char.
of(ch: char) -> GeneralCategory261     pub fn of(ch: char) -> GeneralCategory {
262         data::GENERAL_CATEGORY_TABLE.find_or_default(ch)
263     }
264 }
265 
266 impl GeneralCategory {
267     /// `Lu` | `Ll` | `Lt`  (Short form: `LC`)
is_cased_letter(&self) -> bool268     pub fn is_cased_letter(&self) -> bool {
269         use self::abbr_names::*;
270         matches!(*self, Lu | Ll | Lt)
271     }
272 
273     /// `Lu` | `Ll` | `Lt` | `Lm` | `Lo`  (Short form: `L`)
is_letter(&self) -> bool274     pub fn is_letter(&self) -> bool {
275         use self::abbr_names::*;
276         matches!(*self, Lu | Ll | Lt | Lm | Lo)
277     }
278 
279     /// `Mn` | `Mc` | `Me`  (Short form: `M`)
is_mark(&self) -> bool280     pub fn is_mark(&self) -> bool {
281         use self::abbr_names::*;
282         matches!(*self, Mn | Mc | Me)
283     }
284 
285     /// `Nd` | `Nl` | `No`  (Short form: `N`)
is_number(&self) -> bool286     pub fn is_number(&self) -> bool {
287         use self::abbr_names::*;
288         matches!(*self, Nd | Nl | No)
289     }
290 
291     /// `Pc` | `Pd` | `Ps` | `Pe` | `Pi` | `Pf` | `Po`  (Short form: `P`)
is_punctuation(&self) -> bool292     pub fn is_punctuation(&self) -> bool {
293         use self::abbr_names::*;
294         matches!(*self, Pc | Pd | Ps | Pe | Pi | Pf | Po)
295     }
296 
297     /// `Sm` | `Sc` | `Sk` | `So`  (Short form: `S`)
is_symbol(&self) -> bool298     pub fn is_symbol(&self) -> bool {
299         use self::abbr_names::*;
300         matches!(*self, Sm | Sc | Sk | So)
301     }
302 
303     /// `Zs` | `Zl` | `Zp`  (Short form: `Z`)
is_separator(&self) -> bool304     pub fn is_separator(&self) -> bool {
305         use self::abbr_names::*;
306         matches!(*self, Zs | Zl | Zp)
307     }
308 
309     /// `Cc` | `Cf` | `Cs` | `Co` | `Cn`  (Short form: `C`)
is_other(&self) -> bool310     pub fn is_other(&self) -> bool {
311         use self::abbr_names::*;
312         matches!(*self, Cc | Cf | Cs | Co | Cn)
313     }
314 }
315 
#[cfg(test)]
mod tests {
    use super::GeneralCategory as GC;
    use core::char;
    use unic_char_property::EnumeratedCharProperty;

    /// Checks the category of ASCII characters from U+0000 through U+007E.
    #[test]
    fn test_ascii() {
        // C0 control codes.
        for c in 0x00..(0x1F + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Control);
        }

        assert_eq!(GC::of(' '), GC::SpaceSeparator);
        assert_eq!(GC::of('!'), GC::OtherPunctuation);
        assert_eq!(GC::of('"'), GC::OtherPunctuation);
        assert_eq!(GC::of('#'), GC::OtherPunctuation);
        assert_eq!(GC::of('$'), GC::CurrencySymbol);
        assert_eq!(GC::of('%'), GC::OtherPunctuation);
        assert_eq!(GC::of('&'), GC::OtherPunctuation);
        assert_eq!(GC::of('\''), GC::OtherPunctuation);
        assert_eq!(GC::of('('), GC::OpenPunctuation);
        assert_eq!(GC::of(')'), GC::ClosePunctuation);
        assert_eq!(GC::of('*'), GC::OtherPunctuation);
        assert_eq!(GC::of('+'), GC::MathSymbol);
        assert_eq!(GC::of(','), GC::OtherPunctuation);
        assert_eq!(GC::of('-'), GC::DashPunctuation);
        assert_eq!(GC::of('.'), GC::OtherPunctuation);
        assert_eq!(GC::of('/'), GC::OtherPunctuation);

        for c in ('0' as u32)..('9' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::DecimalNumber);
        }

        assert_eq!(GC::of(':'), GC::OtherPunctuation);
        assert_eq!(GC::of(';'), GC::OtherPunctuation);
        assert_eq!(GC::of('<'), GC::MathSymbol);
        assert_eq!(GC::of('='), GC::MathSymbol);
        assert_eq!(GC::of('>'), GC::MathSymbol);
        assert_eq!(GC::of('?'), GC::OtherPunctuation);
        assert_eq!(GC::of('@'), GC::OtherPunctuation);

        for c in ('A' as u32)..('Z' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::UppercaseLetter);
        }

        assert_eq!(GC::of('['), GC::OpenPunctuation);
        assert_eq!(GC::of('\\'), GC::OtherPunctuation);
        assert_eq!(GC::of(']'), GC::ClosePunctuation);
        assert_eq!(GC::of('^'), GC::ModifierSymbol);
        assert_eq!(GC::of('_'), GC::ConnectorPunctuation);
        assert_eq!(GC::of('`'), GC::ModifierSymbol);

        for c in ('a' as u32)..('z' as u32 + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::LowercaseLetter);
        }

        assert_eq!(GC::of('{'), GC::OpenPunctuation);
        assert_eq!(GC::of('|'), GC::MathSymbol);
        assert_eq!(GC::of('}'), GC::ClosePunctuation);
        assert_eq!(GC::of('~'), GC::MathSymbol);
    }

    /// Checks code points near the end of the Basic Multilingual Plane.
    #[test]
    fn test_bmp_edge() {
        // 0xFEFF ZERO WIDTH NO-BREAK SPACE (or) BYTE ORDER MARK
        let bom = '\u{FEFF}';
        assert_eq!(GC::of(bom), GC::Format);
        // 0xFFFC OBJECT REPLACEMENT CHARACTER
        // Written as an escape: the raw character is invisible and easily lost by tooling.
        assert_eq!(GC::of('\u{FFFC}'), GC::OtherSymbol);
        // 0xFFFD REPLACEMENT CHARACTER
        // Written as an escape so it cannot be confused with decoding damage.
        assert_eq!(GC::of('\u{FFFD}'), GC::OtherSymbol);

        for &c in [0xFFEF, 0xFFFE, 0xFFFF].iter() {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }

    /// Checks planes 15 and 16: private use, except the last two code points of each plane.
    #[test]
    fn test_private_use() {
        for c in 0xF_0000..(0xF_FFFD + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }

        for c in 0x10_0000..(0x10_FFFD + 1) {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::PrivateUse);
        }

        for &c in [0xF_FFFE, 0xF_FFFF, 0x10_FFFE, 0x10_FFFF].iter() {
            let c = char::from_u32(c).unwrap();
            assert_eq!(GC::of(c), GC::Unassigned);
        }
    }

    /// Checks the abbreviated value names.
    #[test]
    fn test_abbr_name() {
        assert_eq!(GC::UppercaseLetter.abbr_name(), "Lu");
        assert_eq!(GC::Unassigned.abbr_name(), "Cn");
    }

    /// Checks the long value names.
    #[test]
    fn test_long_name() {
        assert_eq!(GC::UppercaseLetter.long_name(), "Uppercase_Letter");
        assert_eq!(GC::Unassigned.long_name(), "Unassigned");
    }

    /// Checks the human-readable value names.
    #[test]
    fn test_human_name() {
        assert_eq!(GC::UppercaseLetter.human_name(), "Uppercase Letter");
        assert_eq!(GC::Unassigned.human_name(), "Unassigned");
    }
}
434