1 use std::fmt; 2 use std::iter; 3 use std::ops::Range; 4 use std::path::Path; 5 use std::str::FromStr; 6 7 use lazy_static::lazy_static; 8 use regex::Regex; 9 10 use crate::common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint}; 11 use crate::error::Error; 12 13 /// Represents a single row in the `UnicodeData.txt` file. 14 /// 15 /// These fields were taken from UAX44, Table 9, as part of the documentation 16 /// for the 17 /// [`UnicodeData.txt` file](http://www.unicode.org/reports/tr44/#UnicodeData.txt). 18 #[derive(Clone, Debug, Default, Eq, PartialEq)] 19 pub struct UnicodeData { 20 /// The codepoint corresponding to this row. 21 pub codepoint: Codepoint, 22 /// The name of this codepoint. 23 pub name: String, 24 /// The "general category" of this codepoint. 25 pub general_category: String, 26 /// The class of this codepoint used in the Canonical Ordering Algorithm. 27 /// 28 /// Note that some classes map to a particular symbol. See 29 /// [UAX44, Table 15](http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values). 30 pub canonical_combining_class: u8, 31 /// The bidirectional class of this codepoint. 32 /// 33 /// Possible values are listed in 34 /// [UAX44, Table 13](http://www.unicode.org/reports/tr44/#Bidi_Class_Values). 35 pub bidi_class: String, 36 /// The decomposition mapping for this codepoint. This includes its 37 /// formatting tag (if present). 38 pub decomposition: UnicodeDataDecomposition, 39 /// A decimal numeric representation of this codepoint, if it has the 40 /// property `Numeric_Type=Decimal`. 41 pub numeric_type_decimal: Option<u8>, 42 /// A decimal numeric representation of this codepoint, if it has the 43 /// property `Numeric_Type=Digit`. Note that while this field is still 44 /// populated for existing codepoints, no new codepoints will have this 45 /// field populated. 46 pub numeric_type_digit: Option<u8>, 47 /// A decimal or rational numeric representation of this codepoint, if it 48 /// has the property `Numeric_Type=Numeric`. 49 pub numeric_type_numeric: Option<UnicodeDataNumeric>, 50 /// A boolean indicating whether this codepoint is "mirrored" in 51 /// bidirectional text. 52 pub bidi_mirrored: bool, 53 /// The "old" Unicode 1.0 or ISO 6429 name of this codepoint. Note that 54 /// this field is empty unless it is significantly different from 55 /// the `name` field. 56 pub unicode1_name: String, 57 /// The ISO 10464 comment field. This no longer contains any non-NULL 58 /// values. 59 pub iso_comment: String, 60 /// This codepoint's simple uppercase mapping, if it exists. 61 pub simple_uppercase_mapping: Option<Codepoint>, 62 /// This codepoint's simple lowercase mapping, if it exists. 63 pub simple_lowercase_mapping: Option<Codepoint>, 64 /// This codepoint's simple titlecase mapping, if it exists. 65 pub simple_titlecase_mapping: Option<Codepoint>, 66 } 67 68 impl UcdFile for UnicodeData { relative_file_path() -> &'static Path69 fn relative_file_path() -> &'static Path { 70 Path::new("UnicodeData.txt") 71 } 72 } 73 74 impl UcdFileByCodepoint for UnicodeData { codepoints(&self) -> CodepointIter75 fn codepoints(&self) -> CodepointIter { 76 self.codepoint.into_iter() 77 } 78 } 79 80 impl UnicodeData { 81 /// Returns true if and only if this record corresponds to the start of a 82 /// range. is_range_start(&self) -> bool83 pub fn is_range_start(&self) -> bool { 84 self.name.starts_with('<') 85 && self.name.ends_with('>') 86 && self.name.contains("First") 87 } 88 89 /// Returns true if and only if this record corresponds to the end of a 90 /// range. is_range_end(&self) -> bool91 pub fn is_range_end(&self) -> bool { 92 self.name.starts_with('<') 93 && self.name.ends_with('>') 94 && self.name.contains("Last") 95 } 96 } 97 98 impl FromStr for UnicodeData { 99 type Err = Error; 100 from_str(line: &str) -> Result<UnicodeData, Error>101 fn from_str(line: &str) -> Result<UnicodeData, Error> { 102 lazy_static! { 103 static ref PARTS: Regex = Regex::new( 104 r"(?x) 105 ^ 106 ([A-Z0-9]+); # 1; codepoint 107 ([^;]+); # 2; name 108 ([^;]+); # 3; general category 109 ([0-9]+); # 4; canonical combining class 110 ([^;]+); # 5; bidi class 111 ([^;]*); # 6; decomposition 112 ([0-9]*); # 7; numeric type decimal 113 ([0-9]*); # 8; numeric type digit 114 ([-0-9/]*); # 9; numeric type numeric 115 ([YN]); # 10; bidi mirrored 116 ([^;]*); # 11; unicode1 name 117 ([^;]*); # 12; ISO comment 118 ([^;]*); # 13; simple uppercase mapping 119 ([^;]*); # 14; simple lowercase mapping 120 ([^;]*) # 15; simple titlecase mapping 121 $ 122 " 123 ) 124 .unwrap(); 125 }; 126 let caps = match PARTS.captures(line.trim()) { 127 Some(caps) => caps, 128 None => return err!("invalid UnicodeData line"), 129 }; 130 let capget = |n| caps.get(n).unwrap().as_str(); 131 let mut data = UnicodeData::default(); 132 133 data.codepoint = capget(1).parse()?; 134 data.name = capget(2).to_string(); 135 data.general_category = capget(3).to_string(); 136 data.canonical_combining_class = match capget(4).parse() { 137 Ok(n) => n, 138 Err(err) => { 139 return err!( 140 "failed to parse canonical combining class '{}': {}", 141 capget(4), 142 err 143 ) 144 } 145 }; 146 data.bidi_class = capget(5).to_string(); 147 if !caps[6].is_empty() { 148 data.decomposition = caps[6].parse()?; 149 } else { 150 data.decomposition.push(data.codepoint)?; 151 } 152 if !capget(7).is_empty() { 153 data.numeric_type_decimal = Some(match capget(7).parse() { 154 Ok(n) => n, 155 Err(err) => { 156 return err!( 157 "failed to parse numeric type decimal '{}': {}", 158 capget(7), 159 err 160 ) 161 } 162 }); 163 } 164 if !capget(8).is_empty() { 165 data.numeric_type_digit = Some(match capget(8).parse() { 166 Ok(n) => n, 167 Err(err) => { 168 return err!( 169 "failed to parse numeric type digit '{}': {}", 170 capget(8), 171 err 172 ) 173 } 174 }); 175 } 176 if !capget(9).is_empty() { 177 data.numeric_type_numeric = Some(capget(9).parse()?); 178 } 179 data.bidi_mirrored = capget(10) == "Y"; 180 data.unicode1_name = capget(11).to_string(); 181 data.iso_comment = capget(12).to_string(); 182 if !capget(13).is_empty() { 183 data.simple_uppercase_mapping = Some(capget(13).parse()?); 184 } 185 if !capget(14).is_empty() { 186 data.simple_lowercase_mapping = Some(capget(14).parse()?); 187 } 188 if !capget(15).is_empty() { 189 data.simple_titlecase_mapping = Some(capget(15).parse()?); 190 } 191 Ok(data) 192 } 193 } 194 195 impl fmt::Display for UnicodeData { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result196 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 197 write!(f, "{};", self.codepoint)?; 198 write!(f, "{};", self.name)?; 199 write!(f, "{};", self.general_category)?; 200 write!(f, "{};", self.canonical_combining_class)?; 201 write!(f, "{};", self.bidi_class)?; 202 if self.decomposition.is_canonical() 203 && self.decomposition.mapping() == &[self.codepoint] 204 { 205 write!(f, ";")?; 206 } else { 207 write!(f, "{};", self.decomposition)?; 208 } 209 if let Some(n) = self.numeric_type_decimal { 210 write!(f, "{};", n)?; 211 } else { 212 write!(f, ";")?; 213 } 214 if let Some(n) = self.numeric_type_digit { 215 write!(f, "{};", n)?; 216 } else { 217 write!(f, ";")?; 218 } 219 if let Some(n) = self.numeric_type_numeric { 220 write!(f, "{};", n)?; 221 } else { 222 write!(f, ";")?; 223 } 224 write!(f, "{};", if self.bidi_mirrored { "Y" } else { "N" })?; 225 write!(f, "{};", self.unicode1_name)?; 226 write!(f, "{};", self.iso_comment)?; 227 if let Some(cp) = self.simple_uppercase_mapping { 228 write!(f, "{};", cp)?; 229 } else { 230 write!(f, ";")?; 231 } 232 if let Some(cp) = self.simple_lowercase_mapping { 233 write!(f, "{};", cp)?; 234 } else { 235 write!(f, ";")?; 236 } 237 if let Some(cp) = self.simple_titlecase_mapping { 238 write!(f, "{}", cp)?; 239 } 240 Ok(()) 241 } 242 } 243 244 /// Represents a decomposition mapping of a single row in the 245 /// `UnicodeData.txt` file. 246 #[derive(Clone, Debug, Default, Eq, PartialEq)] 247 pub struct UnicodeDataDecomposition { 248 /// The formatting tag associated with this mapping, if present. 249 pub tag: Option<UnicodeDataDecompositionTag>, 250 /// The number of codepoints in this mapping. 251 pub len: usize, 252 /// The codepoints in the mapping. Entries beyond `len` in the mapping 253 /// are always U+0000. If no mapping was present, then this always contains 254 /// a single codepoint corresponding to this row's character. 255 pub mapping: [Codepoint; 18], 256 } 257 258 impl UnicodeDataDecomposition { 259 /// Create a new decomposition mapping with the given tag and codepoints. 260 /// 261 /// If there are too many codepoints, then an error is returned. new( tag: Option<UnicodeDataDecompositionTag>, mapping: &[Codepoint], ) -> Result<UnicodeDataDecomposition, Error>262 pub fn new( 263 tag: Option<UnicodeDataDecompositionTag>, 264 mapping: &[Codepoint], 265 ) -> Result<UnicodeDataDecomposition, Error> { 266 let mut x = UnicodeDataDecomposition::default(); 267 x.tag = tag; 268 for &cp in mapping { 269 x.push(cp)?; 270 } 271 Ok(x) 272 } 273 274 /// Add a new codepoint to this decomposition's mapping. 275 /// 276 /// If the mapping is already full, then this returns an error. push(&mut self, cp: Codepoint) -> Result<(), Error>277 pub fn push(&mut self, cp: Codepoint) -> Result<(), Error> { 278 if self.len >= self.mapping.len() { 279 return err!( 280 "invalid decomposition mapping (too many codepoints)" 281 ); 282 } 283 self.mapping[self.len] = cp; 284 self.len += 1; 285 Ok(()) 286 } 287 288 /// Return the mapping as a slice of codepoints. The slice returned 289 /// has length equivalent to the number of codepoints in this mapping. mapping(&self) -> &[Codepoint]290 pub fn mapping(&self) -> &[Codepoint] { 291 &self.mapping[..self.len] 292 } 293 294 /// Returns true if and only if this decomposition mapping is canonical. is_canonical(&self) -> bool295 pub fn is_canonical(&self) -> bool { 296 self.tag.is_none() 297 } 298 } 299 300 impl FromStr for UnicodeDataDecomposition { 301 type Err = Error; 302 from_str(s: &str) -> Result<UnicodeDataDecomposition, Error>303 fn from_str(s: &str) -> Result<UnicodeDataDecomposition, Error> { 304 lazy_static! { 305 static ref WITH_TAG: Regex = Regex::new( 306 r"^(?:<(?P<tag>[^>]+)>)?\s*(?P<chars>[\s0-9A-F]+)$" 307 ) 308 .unwrap(); 309 static ref CHARS: Regex = Regex::new(r"[0-9A-F]+").unwrap(); 310 }; 311 if s.is_empty() { 312 return err!( 313 "expected non-empty string for \ 314 UnicodeDataDecomposition value" 315 ); 316 } 317 let caps = match WITH_TAG.captures(s) { 318 Some(caps) => caps, 319 None => return err!("invalid decomposition value"), 320 }; 321 let mut decomp = UnicodeDataDecomposition::default(); 322 let mut codepoints = s; 323 if let Some(m) = caps.name("tag") { 324 decomp.tag = Some(m.as_str().parse()?); 325 codepoints = &caps["chars"]; 326 } 327 for m in CHARS.find_iter(codepoints) { 328 let cp = m.as_str().parse()?; 329 decomp.push(cp)?; 330 } 331 Ok(decomp) 332 } 333 } 334 335 impl fmt::Display for UnicodeDataDecomposition { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result336 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 337 if let Some(ref tag) = self.tag { 338 write!(f, "<{}> ", tag)?; 339 } 340 let mut first = true; 341 for cp in self.mapping() { 342 if !first { 343 write!(f, " ")?; 344 } 345 first = false; 346 write!(f, "{}", cp)?; 347 } 348 Ok(()) 349 } 350 } 351 352 /// The formatting tag on a decomposition mapping. 353 /// 354 /// This is taken from 355 /// [UAX44, Table 14](http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings). 356 #[derive(Clone, Debug, Eq, PartialEq)] 357 pub enum UnicodeDataDecompositionTag { 358 /// <font> 359 Font, 360 /// <noBreak> 361 NoBreak, 362 /// <initial> 363 Initial, 364 /// <medial> 365 Medial, 366 /// <final> 367 Final, 368 /// <isolated> 369 Isolated, 370 /// <circle> 371 Circle, 372 /// <super> 373 Super, 374 /// <sub> 375 Sub, 376 /// <vertical> 377 Vertical, 378 /// <wide> 379 Wide, 380 /// <narrow> 381 Narrow, 382 /// <small> 383 Small, 384 /// <square> 385 Square, 386 /// <fraction> 387 Fraction, 388 /// <compat> 389 Compat, 390 } 391 392 impl FromStr for UnicodeDataDecompositionTag { 393 type Err = Error; 394 from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error>395 fn from_str(s: &str) -> Result<UnicodeDataDecompositionTag, Error> { 396 use self::UnicodeDataDecompositionTag::*; 397 Ok(match s { 398 "font" => Font, 399 "noBreak" => NoBreak, 400 "initial" => Initial, 401 "medial" => Medial, 402 "final" => Final, 403 "isolated" => Isolated, 404 "circle" => Circle, 405 "super" => Super, 406 "sub" => Sub, 407 "vertical" => Vertical, 408 "wide" => Wide, 409 "narrow" => Narrow, 410 "small" => Small, 411 "square" => Square, 412 "fraction" => Fraction, 413 "compat" => Compat, 414 _ => return err!("invalid decomposition formatting tag: {}", s), 415 }) 416 } 417 } 418 419 impl fmt::Display for UnicodeDataDecompositionTag { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result420 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 421 use self::UnicodeDataDecompositionTag::*; 422 let s = match *self { 423 Font => "font", 424 NoBreak => "noBreak", 425 Initial => "initial", 426 Medial => "medial", 427 Final => "final", 428 Isolated => "isolated", 429 Circle => "circle", 430 Super => "super", 431 Sub => "sub", 432 Vertical => "vertical", 433 Wide => "wide", 434 Narrow => "narrow", 435 Small => "small", 436 Square => "square", 437 Fraction => "fraction", 438 Compat => "compat", 439 }; 440 write!(f, "{}", s) 441 } 442 } 443 444 /// A numeric value corresponding to characters with `Numeric_Type=Numeric`. 445 /// 446 /// A numeric value can either be a signed integer or a rational number. 447 #[derive(Clone, Copy, Debug, Eq, PartialEq)] 448 pub enum UnicodeDataNumeric { 449 /// An integer. 450 Integer(i64), 451 /// A rational number. The first is the numerator and the latter is the 452 /// denominator. 453 Rational(i64, i64), 454 } 455 456 impl FromStr for UnicodeDataNumeric { 457 type Err = Error; 458 from_str(s: &str) -> Result<UnicodeDataNumeric, Error>459 fn from_str(s: &str) -> Result<UnicodeDataNumeric, Error> { 460 if s.is_empty() { 461 return err!( 462 "expected non-empty string for UnicodeDataNumeric value" 463 ); 464 } 465 if let Some(pos) = s.find('/') { 466 let (snum, sden) = (&s[..pos], &s[pos + 1..]); 467 let num = match snum.parse() { 468 Ok(num) => num, 469 Err(err) => { 470 return err!( 471 "invalid integer numerator '{}': {}", 472 snum, 473 err 474 ); 475 } 476 }; 477 let den = match sden.parse() { 478 Ok(den) => den, 479 Err(err) => { 480 return err!( 481 "invalid integer denominator '{}': {}", 482 sden, 483 err 484 ); 485 } 486 }; 487 Ok(UnicodeDataNumeric::Rational(num, den)) 488 } else { 489 match s.parse() { 490 Ok(den) => Ok(UnicodeDataNumeric::Integer(den)), 491 Err(err) => { 492 return err!( 493 "invalid integer denominator '{}': {}", 494 s, 495 err 496 ); 497 } 498 } 499 } 500 } 501 } 502 503 impl fmt::Display for UnicodeDataNumeric { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result504 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 505 match *self { 506 UnicodeDataNumeric::Integer(n) => write!(f, "{}", n), 507 UnicodeDataNumeric::Rational(n, d) => write!(f, "{}/{}", n, d), 508 } 509 } 510 } 511 512 /// An iterator adapter that expands rows in `UnicodeData.txt`. 513 /// 514 /// Throughout `UnicodeData.txt`, some assigned codepoints are not explicitly 515 /// represented. Instead, they are represented by a pair of rows, indicating 516 /// a range of codepoints with the same properties. For example, the Hangul 517 /// syllable codepoints are represented by these two rows: 518 /// 519 /// ```ignore 520 /// AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 521 /// D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 522 /// ``` 523 /// 524 /// This iterator will wrap any iterator of `UnicodeData` and, when a range of 525 /// Unicode codepoints is found, it will be expanded to the appropriate 526 /// sequence of `UnicodeData` values. Note that all such expanded records will 527 /// have an empty name. 528 pub struct UnicodeDataExpander<I: Iterator> { 529 /// The underlying iterator. 530 it: iter::Peekable<I>, 531 /// A range of codepoints to emit when we've found a pair. Otherwise, 532 /// `None`. 533 range: CodepointRange, 534 } 535 536 struct CodepointRange { 537 /// The codepoint range. 538 range: Range<u32>, 539 /// The start record. All subsequent records in this range are generated 540 /// by cloning this and updating the codepoint/name. 541 start_record: UnicodeData, 542 } 543 544 impl<I: Iterator<Item = UnicodeData>> UnicodeDataExpander<I> { 545 /// Create a new iterator that expands pairs of `UnicodeData` range 546 /// records. All other records are passed through as-is. new<T>(it: T) -> UnicodeDataExpander<I> where T: IntoIterator<IntoIter = I, Item = I::Item>,547 pub fn new<T>(it: T) -> UnicodeDataExpander<I> 548 where 549 T: IntoIterator<IntoIter = I, Item = I::Item>, 550 { 551 UnicodeDataExpander { 552 it: it.into_iter().peekable(), 553 range: CodepointRange { 554 range: 0..0, 555 start_record: UnicodeData::default(), 556 }, 557 } 558 } 559 } 560 561 impl<I: Iterator<Item = UnicodeData>> Iterator for UnicodeDataExpander<I> { 562 type Item = UnicodeData; 563 next(&mut self) -> Option<UnicodeData>564 fn next(&mut self) -> Option<UnicodeData> { 565 if let Some(udata) = self.range.next() { 566 return Some(udata); 567 } 568 let row1 = match self.it.next() { 569 None => return None, 570 Some(row1) => row1, 571 }; 572 if !row1.is_range_start() 573 || !self.it.peek().map_or(false, |row2| row2.is_range_end()) 574 { 575 return Some(row1); 576 } 577 let row2 = self.it.next().unwrap(); 578 self.range = CodepointRange { 579 range: row1.codepoint.value()..(row2.codepoint.value() + 1), 580 start_record: row1, 581 }; 582 self.next() 583 } 584 } 585 586 impl Iterator for CodepointRange { 587 type Item = UnicodeData; 588 next(&mut self) -> Option<UnicodeData>589 fn next(&mut self) -> Option<UnicodeData> { 590 let cp = match self.range.next() { 591 None => return None, 592 Some(cp) => cp, 593 }; 594 Some(UnicodeData { 595 codepoint: Codepoint::from_u32(cp).unwrap(), 596 name: "".to_string(), 597 ..self.start_record.clone() 598 }) 599 } 600 } 601 602 #[cfg(test)] 603 mod tests { 604 use crate::common::Codepoint; 605 606 use super::{ 607 UnicodeData, UnicodeDataDecomposition, UnicodeDataDecompositionTag, 608 UnicodeDataNumeric, 609 }; 610 codepoint(n: u32) -> Codepoint611 fn codepoint(n: u32) -> Codepoint { 612 Codepoint::from_u32(n).unwrap() 613 } 614 s(string: &str) -> String615 fn s(string: &str) -> String { 616 string.to_string() 617 } 618 619 #[test] parse1()620 fn parse1() { 621 let line = "249D;PARENTHESIZED LATIN SMALL LETTER B;So;0;L;<compat> 0028 0062 0029;;;;N;;;;;\n"; 622 let data: UnicodeData = line.parse().unwrap(); 623 assert_eq!( 624 data, 625 UnicodeData { 626 codepoint: codepoint(0x249d), 627 name: s("PARENTHESIZED LATIN SMALL LETTER B"), 628 general_category: s("So"), 629 canonical_combining_class: 0, 630 bidi_class: s("L"), 631 decomposition: UnicodeDataDecomposition::new( 632 Some(UnicodeDataDecompositionTag::Compat), 633 &[codepoint(0x28), codepoint(0x62), codepoint(0x29)], 634 ) 635 .unwrap(), 636 numeric_type_decimal: None, 637 numeric_type_digit: None, 638 numeric_type_numeric: None, 639 bidi_mirrored: false, 640 unicode1_name: s(""), 641 iso_comment: s(""), 642 simple_uppercase_mapping: None, 643 simple_lowercase_mapping: None, 644 simple_titlecase_mapping: None, 645 } 646 ); 647 } 648 649 #[test] parse2()650 fn parse2() { 651 let line = "000D;<control>;Cc;0;B;;;;;N;CARRIAGE RETURN (CR);;;;\n"; 652 let data: UnicodeData = line.parse().unwrap(); 653 assert_eq!( 654 data, 655 UnicodeData { 656 codepoint: codepoint(0x000D), 657 name: s("<control>"), 658 general_category: s("Cc"), 659 canonical_combining_class: 0, 660 bidi_class: s("B"), 661 decomposition: UnicodeDataDecomposition::new( 662 None, 663 &[codepoint(0x000D)] 664 ) 665 .unwrap(), 666 numeric_type_decimal: None, 667 numeric_type_digit: None, 668 numeric_type_numeric: None, 669 bidi_mirrored: false, 670 unicode1_name: s("CARRIAGE RETURN (CR)"), 671 iso_comment: s(""), 672 simple_uppercase_mapping: None, 673 simple_lowercase_mapping: None, 674 simple_titlecase_mapping: None, 675 } 676 ); 677 } 678 679 #[test] parse3()680 fn parse3() { 681 let line = "00BC;VULGAR FRACTION ONE QUARTER;No;0;ON;<fraction> 0031 2044 0034;;;1/4;N;FRACTION ONE QUARTER;;;;\n"; 682 let data: UnicodeData = line.parse().unwrap(); 683 assert_eq!( 684 data, 685 UnicodeData { 686 codepoint: codepoint(0x00BC), 687 name: s("VULGAR FRACTION ONE QUARTER"), 688 general_category: s("No"), 689 canonical_combining_class: 0, 690 bidi_class: s("ON"), 691 decomposition: UnicodeDataDecomposition::new( 692 Some(UnicodeDataDecompositionTag::Fraction), 693 &[codepoint(0x31), codepoint(0x2044), codepoint(0x34)], 694 ) 695 .unwrap(), 696 numeric_type_decimal: None, 697 numeric_type_digit: None, 698 numeric_type_numeric: Some(UnicodeDataNumeric::Rational(1, 4)), 699 bidi_mirrored: false, 700 unicode1_name: s("FRACTION ONE QUARTER"), 701 iso_comment: s(""), 702 simple_uppercase_mapping: None, 703 simple_lowercase_mapping: None, 704 simple_titlecase_mapping: None, 705 } 706 ); 707 } 708 709 #[test] parse4()710 fn parse4() { 711 let line = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;\n"; 712 let data: UnicodeData = line.parse().unwrap(); 713 assert_eq!( 714 data, 715 UnicodeData { 716 codepoint: codepoint(0x0041), 717 name: s("LATIN CAPITAL LETTER A"), 718 general_category: s("Lu"), 719 canonical_combining_class: 0, 720 bidi_class: s("L"), 721 decomposition: UnicodeDataDecomposition::new( 722 None, 723 &[codepoint(0x0041)] 724 ) 725 .unwrap(), 726 numeric_type_decimal: None, 727 numeric_type_digit: None, 728 numeric_type_numeric: None, 729 bidi_mirrored: false, 730 unicode1_name: s(""), 731 iso_comment: s(""), 732 simple_uppercase_mapping: None, 733 simple_lowercase_mapping: Some(codepoint(0x0061)), 734 simple_titlecase_mapping: None, 735 } 736 ); 737 } 738 739 #[test] parse5()740 fn parse5() { 741 let line = "0F33;TIBETAN DIGIT HALF ZERO;No;0;L;;;;-1/2;N;;;;;\n"; 742 let data: UnicodeData = line.parse().unwrap(); 743 assert_eq!( 744 data, 745 UnicodeData { 746 codepoint: codepoint(0x0F33), 747 name: s("TIBETAN DIGIT HALF ZERO"), 748 general_category: s("No"), 749 canonical_combining_class: 0, 750 bidi_class: s("L"), 751 decomposition: UnicodeDataDecomposition::new( 752 None, 753 &[codepoint(0x0F33)] 754 ) 755 .unwrap(), 756 numeric_type_decimal: None, 757 numeric_type_digit: None, 758 numeric_type_numeric: Some(UnicodeDataNumeric::Rational( 759 -1, 2 760 )), 761 bidi_mirrored: false, 762 unicode1_name: s(""), 763 iso_comment: s(""), 764 simple_uppercase_mapping: None, 765 simple_lowercase_mapping: None, 766 simple_titlecase_mapping: None, 767 } 768 ); 769 } 770 771 #[test] expander()772 fn expander() { 773 use super::UnicodeDataExpander; 774 use crate::common::UcdLineParser; 775 776 let data = "\ 777 ABF9;MEETEI MAYEK DIGIT NINE;Nd;0;L;;9;9;9;N;;;;; 778 AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;; 779 D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;; 780 D7B0;HANGUL JUNGSEONG O-YEO;Lo;0;L;;;;;N;;;;; 781 "; 782 let records = UcdLineParser::new(None, data.as_bytes()) 783 .collect::<Result<Vec<_>, _>>() 784 .unwrap(); 785 assert_eq!(UnicodeDataExpander::new(records).count(), 11174); 786 } 787 } 788