//! Correct Unicode handling.
//!
//! Correct Unicode is hard; I'll spare you the gory details...
//! This module adds a slight runtime speed cost and a considerable binary size cost.
//! Uses the crates [`unicode-normalization`] and [`caseless`].
//!
//! In JavaScript, [`String.prototype.normalize()`] defaults to NFC, so we use it too.
//!
//! We use the default `String` order, which is probably not the correct Unicode order.
//!
//!
//! # Normalization
//!
//! Two Unicode strings with different bytes can be considered equal.
//! Normalizing makes them have the same bytes, as long as the same normalization form is used.
//!
//! We use [canonical equivalence] to ensure characters have the same visual appearance and behavior.
//!
//! There is a runtime space-time tradeoff between the two canonical [normalization forms]:
//! * NFD = more string space, less normalization time (decomposition)
//! * NFC = less string space, more normalization time (decomposition + composition)
//!
//!
//! # Partial strings
//!
//! A concatenated string must be normalized again, even if the original strings were normalized.
//!
//! I'm not sure, but UTF-8 substrings are probably normalized if the original was normalized.
//!
//!
//! # Ordering
//!
//! Correct ordering should depend on the locale.
//!
//!
//! # Case sensitive
//!
//! Compare normalized strings.
//!
//!
//! # Case insensitive
//!
//! Compare caseless normalized strings.
//!
//! To get a caseless string we must normalize, then get a caseless representation with
//! [case folding], and normalize again.
//!
//! Based on [(faq/casemap_charprop.html#2)], you can expect a lowercase result most of the time:
//! "Case folding in Unicode is primarily based on the lowercase mapping..."
//!
//!
//! [`caseless`]: https://crates.io/crates/caseless
[`unicode-normalization`]: https://crates.io/crates/unicode-normalization 53 //! [`String.prototype.normalize()`]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize 54 //! [canonical equivalence]: http://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence 55 //! [normalization forms]: http://www.unicode.org/reports/tr15/#Norm_Forms 56 //! [case folding]: https://www.w3.org/International/wiki/Case_folding 57 //! [(faq/casemap_charprop.html#2)]: http://unicode.org/faq/casemap_charprop.html#2 58 #![allow(dead_code)] 59 60 use std::fmt; 61 use std::ops; 62 63 use caseless::Caseless; 64 use unicode_normalization::{is_nfc, is_nfc_quick, IsNormalized, UnicodeNormalization}; 65 66 /// A unicode string normalized with NFC. 67 /// 68 /// Can be used transparently as a `&str`. 69 /// The inner `String` is private to ensure it remains normalized. 70 /// Uses default `String` order, which is probably not correct unicode order. 71 #[derive(Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] 72 pub struct Nfc { 73 inner: String, 74 } 75 76 impl Nfc { 77 /// Creates a normalized caseless string. caseless(s: &str) -> Self78 pub fn caseless(s: &str) -> Self { 79 let string: String = if s.is_ascii() { 80 s.to_ascii_lowercase() 81 } else { 82 s.chars().default_case_fold().collect() 83 }; 84 Self::from(string) 85 } 86 87 /// Creates a normalized path string. 88 /// Converts '\\' to '/'. path(s: &str) -> Self89 pub fn path(s: &str) -> Self { 90 let s = s.replace('\\', "/"); 91 Self::from(s) 92 } 93 94 /// Creates a normalized caseless path string. 95 /// Converts '\\' to '/'. caseless_path(s: &str) -> Self96 pub fn caseless_path(s: &str) -> Self { 97 let s = s.replace('\\', "/"); 98 Self::caseless(&s) 99 } 100 101 /// Match `String::as_str()`. 
as_str(&self) -> &str102 pub fn as_str(&self) -> &str { 103 &self.inner 104 } 105 } 106 107 impl AsRef<[u8]> for Nfc { as_ref(&self) -> &[u8]108 fn as_ref(&self) -> &[u8] { 109 self.inner.as_bytes() 110 } 111 } 112 113 impl AsRef<str> for Nfc { as_ref(&self) -> &str114 fn as_ref(&self) -> &str { 115 &self.inner 116 } 117 } 118 119 impl From<&str> for Nfc { from(s: &str) -> Self120 fn from(s: &str) -> Self { 121 Self::from(s.to_owned()) 122 } 123 } 124 125 /// Converts to a normalized string. 126 /// Consumes the original string. 127 impl From<String> for Nfc { from(string: String) -> Self128 fn from(string: String) -> Self { 129 match is_nfc_quick(string.chars()) { 130 IsNormalized::Yes => Self { inner: string }, 131 IsNormalized::Maybe if is_nfc(&string) => Self { inner: string }, 132 _ => { 133 let string = string.chars().nfc().collect(); 134 Self { inner: string } 135 } 136 } 137 } 138 } 139 140 /// Unwraps the inner string. 141 impl Into<String> for Nfc { into(self) -> String142 fn into(self) -> String { 143 self.inner 144 } 145 } 146 147 /// Inherits all the methods of `str`. 148 impl ops::Deref for Nfc { 149 type Target = str; 150 deref(&self) -> &str151 fn deref(&self) -> &str { 152 &self.inner 153 } 154 } 155 156 /// Adds a string. 157 impl ops::Add<&str> for Nfc { 158 type Output = Self; 159 add(self, other: &str) -> Self160 fn add(self, other: &str) -> Self { 161 let string = self.inner.chars().chain(other.chars()).nfc().collect(); 162 Self { inner: string } 163 } 164 } 165 166 /// Matches the inner string. 167 impl fmt::Display for Nfc { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result168 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 169 let display: &dyn fmt::Display = &self.inner; 170 display.fmt(f) 171 } 172 } 173 174 /// Matches the inner string. 
175 impl fmt::Debug for Nfc { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result176 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 177 let debug: &dyn fmt::Debug = &self.inner; 178 debug.fmt(f) 179 } 180 } 181