1 use std::fmt;
2 use std::iter;
3 use std::ops::Range;
4 use std::path::Path;
5 use std::str::FromStr;
6 
7 use lazy_static::lazy_static;
8 use regex::Regex;
9 
10 use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint};
11 use crate::error::Error;
12 
/// Represents a single row in the `UnicodeData.txt` file.
///
/// These fields were taken from UAX44, Table 9, as part of the documentation
/// for the
/// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The codepoint corresponding to this row.
    pub codepoint: Codepoint,
    /// The name of this codepoint.
    ///
    /// Rows that delimit a codepoint range carry a bracketed placeholder
    /// such as `<Hangul Syllable, First>` here instead of a real name.
    pub name: String,
    /// The "general category" of this codepoint.
    pub general_category: String,
    /// The class of this codepoint used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this codepoint.
    ///
    /// Possible values are listed in
    /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this codepoint. This includes its
    /// formatting tag (if present).
    ///
    /// When a parsed row has an empty decomposition field, this holds a
    /// canonical (untagged) mapping consisting of the row's own codepoint.
    pub decomposition: UnicodeDataDecomposition,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this codepoint, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing codepoints, no new codepoints will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this codepoint, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<UnicodeDataNumeric>,
    /// A boolean indicating whether this codepoint is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This codepoint's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<Codepoint>,
    /// This codepoint's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<Codepoint>,
    /// This codepoint's simple titlecase mapping, if it exists.
    pub simple_titlecase_mapping: Option<Codepoint>,
}
67 
68 impl UcdFile for UnicodeData {
relative_file_path() -> &'static Path69     fn relative_file_path() -> &'static Path {
70         Path::new("UnicodeData.txt")
71     }
72 }
73 
74 impl UcdFileByCodepoint for UnicodeData {
codepoints(&self) -> CodepointIter75     fn codepoints(&self) -> CodepointIter {
76         self.codepoint.into_iter()
77     }
78 }
79 
80 impl UnicodeData {
81     /// Returns true if and only if this record corresponds to the start of a
82     /// range.
is_range_start(&self) -> bool83     pub fn is_range_start(&self) -> bool {
84         self.name.starts_with('<')
85             && self.name.ends_with('>')
86             && self.name.contains("First")
87     }
88 
89     /// Returns true if and only if this record corresponds to the end of a
90     /// range.
is_range_end(&self) -> bool91     pub fn is_range_end(&self) -> bool {
92         self.name.starts_with('<')
93             && self.name.ends_with('>')
94             && self.name.contains("Last")
95     }
96 }
97 
98 impl FromStr for UnicodeData {
99     type Err = Error;
100 
from_str(line: &str) -> Result<UnicodeData, Error>101     fn from_str(line: &str) -> Result<UnicodeData, Error> {
102         lazy_static! {
103             static ref PARTS: Regex = Regex::new(
104                 r"(?x)
105                 ^
106                 ([A-Z0-9]+);  #  1; codepoint
107                 ([^;]+);      #  2; name
108                 ([^;]+);      #  3; general category
109                 ([0-9]+);     #  4; canonical combining class
110                 ([^;]+);      #  5; bidi class
111                 ([^;]*);      #  6; decomposition
112                 ([0-9]*);     #  7; numeric type decimal
113                 ([0-9]*);     #  8; numeric type digit
114                 ([-0-9/]*);   #  9; numeric type numeric
115                 ([YN]);       # 10; bidi mirrored
116                 ([^;]*);      # 11; unicode1 name
117                 ([^;]*);      # 12; ISO comment
118                 ([^;]*);      # 13; simple uppercase mapping
119                 ([^;]*);      # 14; simple lowercase mapping
120                 ([^;]*)       # 15; simple titlecase mapping
121                 $
122                 "
123             )
124             .unwrap();
125         };
126         let caps = match PARTS.captures(line.trim()) {
127             Some(caps) => caps,
128             None => return err!("invalid UnicodeData line"),
129         };
130         let capget = |n| caps.get(n).unwrap().as_str();
131         let mut data = UnicodeData::default();
132 
133         data.codepoint = capget(1).parse()?;
134         data.name = capget(2).to_string();
135         data.general_category = capget(3).to_string();
136         data.canonical_combining_class = match capget(4).parse() {
137             Ok(n) => n,
138             Err(err) => {
139                 return err!(
140                     "failed to parse canonical combining class '{}': {}",
141                     capget(4),
142                     err
143                 )
144             }
145         };
146         data.bidi_class = capget(5).to_string();
147         if !caps[6].is_empty() {
148             data.decomposition = caps[6].parse()?;
149         } else {
150             data.decomposition.push(data.codepoint)?;
151         }
152         if !capget(7).is_empty() {
153             data.numeric_type_decimal = Some(match capget(7).parse() {
154                 Ok(n) => n,
155                 Err(err) => {
156                     return err!(
157                         "failed to parse numeric type decimal '{}': {}",
158                         capget(7),
159                         err
160                     )
161                 }
162             });
163         }
164         if !capget(8).is_empty() {
165             data.numeric_type_digit = Some(match capget(8).parse() {
166                 Ok(n) => n,
167                 Err(err) => {
168                     return err!(
169                         "failed to parse numeric type digit '{}': {}",
170                         capget(8),
171                         err
172                     )
173                 }
174             });
175         }
176         if !capget(9).is_empty() {
177             data.numeric_type_numeric = Some(capget(9).parse()?);
178         }
179         data.bidi_mirrored = capget(10) == "Y";
180         data.unicode1_name = capget(11).to_string();
181         data.iso_comment = capget(12).to_string();
182         if !capget(13).is_empty() {
183             data.simple_uppercase_mapping = Some(capget(13).parse()?);
184         }
185         if !capget(14).is_empty() {
186             data.simple_lowercase_mapping = Some(capget(14).parse()?);
187         }
188         if !capget(15).is_empty() {
189             data.simple_titlecase_mapping = Some(capget(15).parse()?);
190         }
191         Ok(data)
192     }
193 }
194 
195 impl fmt::Display for UnicodeData {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result196     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
197         write!(f, "{};", self.codepoint)?;
198         write!(f, "{};", self.name)?;
199         write!(f, "{};", self.general_category)?;
200         write!(f, "{};", self.canonical_combining_class)?;
201         write!(f, "{};", self.bidi_class)?;
202         if self.decomposition.is_canonical()
203             && self.decomposition.mapping() == &[self.codepoint]
204         {
205             write!(f, ";")?;
206         } else {
207             write!(f, "{};", self.decomposition)?;
208         }
209         if let Some(n) = self.numeric_type_decimal {
210             write!(f, "{};", n)?;
211         } else {
212             write!(f, ";")?;
213         }
214         if let Some(n) = self.numeric_type_digit {
215             write!(f, "{};", n)?;
216         } else {
217             write!(f, ";")?;
218         }
219         if let Some(n) = self.numeric_type_numeric {
220             write!(f, "{};", n)?;
221         } else {
222             write!(f, ";")?;
223         }
224         write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?;
225         write!(f, "{};", self.unicode1_name)?;
226         write!(f, "{};", self.iso_comment)?;
227         if let Some(cp) = self.simple_uppercase_mapping {
228             write!(f, "{};", cp)?;
229         } else {
230             write!(f, ";")?;
231         }
232         if let Some(cp) = self.simple_lowercase_mapping {
233             write!(f, "{};", cp)?;
234         } else {
235             write!(f, ";")?;
236         }
237         if let Some(cp) = self.simple_titlecase_mapping {
238             write!(f, "{}", cp)?;
239         }
240         Ok(())
241     }
242 }
243 
/// Represents a decomposition mapping of a single row in the
/// `UnicodeData.txt` file.
///
/// The codepoints are stored inline in a fixed-size array, so a mapping can
/// hold at most 18 codepoints.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeDataDecomposition {
    /// The formatting tag associated with this mapping, if present.
    ///
    /// A mapping without a tag is treated as canonical.
    pub tag: Option<UnicodeDataDecompositionTag>,
    /// The number of codepoints in this mapping.
    pub len: usize,
    /// The codepoints in the mapping. Entries beyond `len` in the mapping
    /// are always U+0000. If no mapping was present, then this always contains
    /// a single codepoint corresponding to this row's character.
    pub mapping: [Codepoint; 18],
}
257 
258 impl UnicodeDataDecomposition {
259     /// Create a new decomposition mapping with the given tag and codepoints.
260     ///
261     /// If there are too many codepoints, then an error is returned.
new( tag: Option<UnicodeDataDecompositionTag>, mapping: &[Codepoint], ) -> Result<UnicodeDataDecomposition, Error>262     pub fn new(
263         tag: Option<UnicodeDataDecompositionTag>,
264         mapping: &[Codepoint],
265     ) -> Result<UnicodeDataDecomposition, Error> {
266         let mut x = UnicodeDataDecomposition::default();
267         x.tag = tag;
268         for &cp in mapping {
269             x.push(cp)?;
270         }
271         Ok(x)
272     }
273 
274     /// Add a new codepoint to this decomposition's mapping.
275     ///
276     /// If the mapping is already full, then this returns an error.
push(&mut self, cp: Codepoint) -> Result<(), Error>277     pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> {
278         if self.len >= self.mapping.len() {
279             return err!(
280                 "invalid decomposition mapping (too many codepoints)"
281             );
282         }
283         self.mapping[self.len] = cp;
284         self.len += 1;
285         Ok(())
286     }
287 
288     /// Return the mapping as a slice of codepoints. The slice returned
289     /// has length equivalent to the number of codepoints in this mapping.
mapping(&self) -> &[Codepoint]290     pub fn mapping(&self) -> &[Codepoint] {
291         &self.mapping[..self.len]
292     }
293 
294     /// Returns true if and only if this decomposition mapping is canonical.
is_canonical(&self) -> bool295     pub fn is_canonical(&self) -> bool {
296         self.tag.is_none()
297     }
298 }
299 
300 impl FromStr for UnicodeDataDecomposition {
301     type Err = Error;
302 
from_str(s: &str) -> Result<UnicodeDataDecomposition, Error>303     fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> {
304         lazy_static! {
305             static ref WITH_TAG: Regex = Regex::new(
306                 r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$"
307             )
308             .unwrap();
309             static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap();
310         };
311         if s.is_empty() {
312             return err!(
313                 "expected non-empty string for \
314                  UnicodeDataDecomposition value"
315             );
316         }
317         let caps = match WITH_TAG.captures(s) {
318             Some(caps) => caps,
319             None => return err!("invalid decomposition value"),
320         };
321         let mut decomp = UnicodeDataDecomposition::default();
322         let mut codepoints = s;
323         if let Some(m) = caps.name("tag") {
324             decomp.tag = Some(m.as_str().parse()?);
325             codepoints = &caps["chars"];
326         }
327         for m in CHARS.find_iter(codepoints) {
328             let cp = m.as_str().parse()?;
329             decomp.push(cp)?;
330         }
331         Ok(decomp)
332     }
333 }
334 
335 impl fmt::Display for UnicodeDataDecomposition {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result336     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
337         if let Some(ref tag) = self.tag {
338             write!(f, "<{}> ", tag)?;
339         }
340         let mut first = true;
341         for cp in self.mapping() {
342             if !first {
343                 write!(f, " ")?;
344             }
345             first = false;
346             write!(f, "{}", cp)?;
347         }
348         Ok(())
349     }
350 }
351 
/// The formatting tag on a decomposition mapping.
///
/// This is taken from
/// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings).
///
/// Each variant corresponds to the tag text that appears between angle
/// brackets in the decomposition field.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum UnicodeDataDecompositionTag {
    /// The `<font>` tag.
    Font,
    /// The `<noBreak>` tag.
    NoBreak,
    /// The `<initial>` tag.
    Initial,
    /// The `<medial>` tag.
    Medial,
    /// The `<final>` tag.
    Final,
    /// The `<isolated>` tag.
    Isolated,
    /// The `<circle>` tag.
    Circle,
    /// The `<super>` tag.
    Super,
    /// The `<sub>` tag.
    Sub,
    /// The `<vertical>` tag.
    Vertical,
    /// The `<wide>` tag.
    Wide,
    /// The `<narrow>` tag.
    Narrow,
    /// The `<small>` tag.
    Small,
    /// The `<square>` tag.
    Square,
    /// The `<fraction>` tag.
    Fraction,
    /// The `<compat>` tag.
    Compat,
}
391 
392 impl FromStr for UnicodeDataDecompositionTag {
393     type Err = Error;
394 
from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error>395     fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> {
396         use self::UnicodeDataDecompositionTag::*;
397         Ok(match s {
398             "font" => Font,
399             "noBreak" => NoBreak,
400             "initial" => Initial,
401             "medial" => Medial,
402             "final" => Final,
403             "isolated" => Isolated,
404             "circle" => Circle,
405             "super" => Super,
406             "sub" => Sub,
407             "vertical" => Vertical,
408             "wide" => Wide,
409             "narrow" => Narrow,
410             "small" => Small,
411             "square" => Square,
412             "fraction" => Fraction,
413             "compat" => Compat,
414             _ => return err!("invalid decomposition formatting tag: {}", s),
415         })
416     }
417 }
418 
419 impl fmt::Display for UnicodeDataDecompositionTag {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result420     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
421         use self::UnicodeDataDecompositionTag::*;
422         let s = match *self {
423             Font => "font",
424             NoBreak => "noBreak",
425             Initial => "initial",
426             Medial => "medial",
427             Final => "final",
428             Isolated => "isolated",
429             Circle => "circle",
430             Super => "super",
431             Sub => "sub",
432             Vertical => "vertical",
433             Wide => "wide",
434             Narrow => "narrow",
435             Small => "small",
436             Square => "square",
437             Fraction => "fraction",
438             Compat => "compat",
439         };
440         write!(f, "{}", s)
441     }
442 }
443 
/// A numeric value corresponding to characters with `Numeric_Type=Numeric`.
///
/// A numeric value can either be a signed integer or a rational number.
/// In text form an integer is written as-is (e.g. `20`) and a rational as
/// `numerator/denominator` (e.g. `-1/2`).
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum UnicodeDataNumeric {
    /// An integer.
    Integer(i64),
    /// A rational number. The first value is the numerator and the second
    /// is the denominator.
    Rational(i64, i64),
}
455 
456 impl FromStr for UnicodeDataNumeric {
457     type Err = Error;
458 
from_str(s: &str) -> Result<UnicodeDataNumeric, Error>459     fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> {
460         if s.is_empty() {
461             return err!(
462                 "expected non-empty string for UnicodeDataNumeric value"
463             );
464         }
465         if let Some(pos) = s.find('/') {
466             let (snum, sden) = (&s[..pos], &s[pos + 1..]);
467             let num = match snum.parse() {
468                 Ok(num) => num,
469                 Err(err) => {
470                     return err!(
471                         "invalid integer numerator '{}': {}",
472                         snum,
473                         err
474                     );
475                 }
476             };
477             let den = match sden.parse() {
478                 Ok(den) => den,
479                 Err(err) => {
480                     return err!(
481                         "invalid integer denominator '{}': {}",
482                         sden,
483                         err
484                     );
485                 }
486             };
487             Ok(UnicodeDataNumeric::Rational(num, den))
488         } else {
489             match s.parse() {
490                 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)),
491                 Err(err) => {
492                     return err!(
493                         "invalid integer denominator '{}': {}",
494                         s,
495                         err
496                     );
497                 }
498             }
499         }
500     }
501 }
502 
503 impl fmt::Display for UnicodeDataNumeric {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result504     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
505         match *self {
506             UnicodeDataNumeric::Integer(n) => write!(f, "{}", n),
507             UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d),
508         }
509     }
510 }
511 
/// An iterator adapter that expands rows in `UnicodeData.txt`.
///
/// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly
/// represented. Instead, they are represented by a pair of rows, indicating
/// a range of codepoints with the same properties. For example, the Hangul
/// syllable codepoints are represented by these two rows:
///
/// ```ignore
/// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
/// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
/// ```
///
/// This iterator will wrap any iterator of `UnicodeData` and, when a range of
/// Unicode codepoints is found, it will be expanded to the appropriate
/// sequence of `UnicodeData` values. Note that all such expanded records will
/// have an empty name.
pub struct UnicodeDataExpander<I: Iterator> {
    /// The underlying iterator. It is peekable so that the row following a
    /// range-start row can be inspected before deciding to consume it.
    it: iter::Peekable<I>,
    /// The range of codepoints currently being emitted after a first/last
    /// pair has been found. When no expansion is in progress this is an
    /// empty range, which yields nothing.
    range: CodepointRange,
}
535 
/// The state of one in-progress range expansion: the codepoint values still
/// to be emitted together with the record used as a template for each of
/// them.
struct CodepointRange {
    /// The codepoint range, as raw `u32` values. The end is exclusive.
    range: Range<u32>,
    /// The start record. All subsequent records in this range are generated
    /// by cloning this and updating the codepoint/name.
    start_record: UnicodeData,
}
543 
544 impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> {
545     /// Create a new iterator that expands pairs of `UnicodeData` range
546     /// records. All other records are passed through as-is.
new<T>(it: T) -> UnicodeDataExpander<I> where T: IntoIterator<IntoIter = I, Item = I::Item>,547     pub fn new<T>(it: T) -> UnicodeDataExpander<I>
548     where
549         T: IntoIterator<IntoIter = I, Item = I::Item>,
550     {
551         UnicodeDataExpander {
552             it: it.into_iter().peekable(),
553             range: CodepointRange {
554                 range: 0..0,
555                 start_record: UnicodeData::default(),
556             },
557         }
558     }
559 }
560 
561 impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> {
562     type Item = UnicodeData;
563 
next(&mut self) -> Option<UnicodeData>564     fn next(&mut self) -> Option<UnicodeData> {
565         if let Some(udata) = self.range.next() {
566             return Some(udata);
567         }
568         let row1 = match self.it.next() {
569             None => return None,
570             Some(row1) => row1,
571         };
572         if !row1.is_range_start()
573             || !self.it.peek().map_or(false, |row2| row2.is_range_end())
574         {
575             return Some(row1);
576         }
577         let row2 = self.it.next().unwrap();
578         self.range = CodepointRange {
579             range: row1.codepoint.value()..(row2.codepoint.value() + 1),
580             start_record: row1,
581         };
582         self.next()
583     }
584 }
585 
586 impl Iterator for CodepointRange {
587     type Item = UnicodeData;
588 
next(&mut self) -> Option<UnicodeData>589     fn next(&mut self) -> Option<UnicodeData> {
590         let cp = match self.range.next() {
591             None => return None,
592             Some(cp) => cp,
593         };
594         Some(UnicodeData {
595             codepoint: Codepoint::from_u32(cp).unwrap(),
596             name: "".to_string(),
597             ..self.start_record.clone()
598         })
599     }
600 }
601 
// Unit tests for parsing individual `UnicodeData.txt` rows and for expanding
// first/last range pairs.
#[cfg(test)]
mod tests {
    use crate::common::Codepoint;

    use super::{
        UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag,
        UnicodeDataNumeric,
    };

    // Shorthand for building a `Codepoint` from a value known to be valid.
    fn codepoint(n: u32) -> Codepoint {
        Codepoint::from_u32(n).unwrap()
    }

    // Shorthand for building an owned `String`.
    fn s(string: &str) -> String {
        string.to_string()
    }

    // A row with a tagged (`<compat>`) multi-codepoint decomposition.
    #[test]
    fn parse1() {
        let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x249d),
                name: s("PARENTHESIZED LATIN SMALL LETTER B"),
                general_category: s("So"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Compat),
                    &[codepoint(0x28), codepoint(0x62), codepoint(0x29)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a bracketed name, an empty decomposition field (which
    // becomes a mapping to the codepoint itself) and a Unicode 1.0 name.
    #[test]
    fn parse2() {
        let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x000D),
                name: s("<control>"),
                general_category: s("Cc"),
                canonical_combining_class: 0,
                bidi_class: s("B"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x000D)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s("CARRIAGE RETURN (CR)"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with a `<fraction>` decomposition and a rational numeric value.
    #[test]
    fn parse3() {
        let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x00BC),
                name: s("VULGAR FRACTION ONE QUARTER"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("ON"),
                decomposition: UnicodeDataDecomposition::new(
                    Some(UnicodeDataDecompositionTag::Fraction),
                    &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)],
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)),
                bidi_mirrored: false,
                unicode1_name: s("FRACTION ONE QUARTER"),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row with only a simple lowercase mapping among the case mappings.
    #[test]
    fn parse4() {
        let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0041),
                name: s("LATIN CAPITAL LETTER A"),
                general_category: s("Lu"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0041)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: None,
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: Some(codepoint(0x0061)),
                simple_titlecase_mapping: None,
            }
        );
    }

    // A row whose numeric value is a negative rational.
    #[test]
    fn parse5() {
        let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n";
        let data: UnicodeData = line.parse().unwrap();
        assert_eq!(
            data,
            UnicodeData {
                codepoint: codepoint(0x0F33),
                name: s("TIBETAN DIGIT HALF ZERO"),
                general_category: s("No"),
                canonical_combining_class: 0,
                bidi_class: s("L"),
                decomposition: UnicodeDataDecomposition::new(
                    None,
                    &[codepoint(0x0F33)]
                )
                .unwrap(),
                numeric_type_decimal: None,
                numeric_type_digit: None,
                numeric_type_numeric: Some(UnicodeDataNumeric::Rational(
                    -1, 2
                )),
                bidi_mirrored: false,
                unicode1_name: s(""),
                iso_comment: s(""),
                simple_uppercase_mapping: None,
                simple_lowercase_mapping: None,
                simple_titlecase_mapping: None,
            }
        );
    }

    // Expanding the Hangul first/last pair between two ordinary rows.
    #[test]
    fn expander() {
        use super::UnicodeDataExpander;
        use crate::common::UcdLineParser;

        let data = "\
ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;;
AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;;
";
        let records = UcdLineParser::new(None, data.as_bytes())
            .collect::<Result<Vec<_>, _>>()
            .unwrap();
        // 2 ordinary rows + 11172 codepoints in U+AC00..=U+D7A3.
        assert_eq!(UnicodeDataExpander::new(records).count(), 11174);
    }
}
788