//! Correct Unicode handling.
//!
//! Correct Unicode is hard; I'll spare you the gory details...
//! This module adds a slight runtime speed cost and a considerable binary size cost.
//! Uses the crates [`unicode-normalization`] and [`caseless`].
//!
//! In JavaScript, [`String.prototype.normalize()`] defaults to NFC, so we use it too.
//!
//! We use the default `String` order, which is probably not the correct Unicode order.
//!
//!
//! # Normalization
//!
//! Two Unicode strings with different bytes can be considered equal.
//! Normalizing makes them have the same bytes, as long as the same normalization form is used.
//!
//! We use [canonical equivalence] to ensure characters have the same visual appearance and behavior.
//!
//! There is a runtime space-time tradeoff between the two canonical [normalization forms]:
//!  * NFD = more string space, less normalization time (decomposition)
//!  * NFC = less string space, more normalization time (decomposition + composition)
//!
//!
//! # Partial strings
//!
//! A concatenated string must be normalized again, even if the original strings were normalized.
//!
//! I'm not sure, but UTF-8 substrings are probably normalized if the original was normalized.
//!
//!
//! # Ordering
//!
//! Ordering should depend on the locale.
//!
//!
//! # Case sensitive
//!
//! Compare normalized strings.
//!
//!
//! # Case insensitive
//!
//! Compare caseless normalized strings.
//!
//! To get a caseless string, we must normalize, then get a caseless representation with [case folding], and normalize again.
//!
//! Based on [(faq/casemap_charprop.html#2)], you can expect a lowercase result most of the time:
//! "Case folding in Unicode is primarily based on the lowercase mapping..."
//!
//!
//! [`caseless`]: https://crates.io/crates/caseless
//! [`unicode-normalization`]: https://crates.io/crates/unicode-normalization
//! [`String.prototype.normalize()`]: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/normalize
//! [canonical equivalence]: http://www.unicode.org/reports/tr15/#Canon_Compat_Equivalence
//! [normalization forms]: http://www.unicode.org/reports/tr15/#Norm_Forms
//! [case folding]: https://www.w3.org/International/wiki/Case_folding
//! [(faq/casemap_charprop.html#2)]: http://unicode.org/faq/casemap_charprop.html#2
// NOTE(review): module-wide lint suppression; the reason is not recorded here.
// Confirm it is still required, or narrow it to the specific unused items.
#![allow(dead_code)]
59 
60 use std::fmt;
61 use std::ops;
62 
63 use caseless::Caseless;
64 use unicode_normalization::{is_nfc, is_nfc_quick, IsNormalized, UnicodeNormalization};
65 
/// A Unicode string that is kept in Normalization Form C (NFC).
///
/// Can be used transparently as a `&str`.
/// The inner `String` is private so that values can only be built through a
/// normalizing constructor and therefore remain normalized.
///
/// Equality, hashing and ordering are derived from the inner `String`; this default
/// `String` order is probably not the correct Unicode order.
#[derive(Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
pub struct Nfc {
    inner: String,
}
75 
76 impl Nfc {
77     /// Creates a normalized caseless string.
caseless(s: &str) -> Self78     pub fn caseless(s: &str) -> Self {
79         let string: String = if s.is_ascii() {
80             s.to_ascii_lowercase()
81         } else {
82             s.chars().default_case_fold().collect()
83         };
84         Self::from(string)
85     }
86 
87     /// Creates a normalized path string.
88     /// Converts '\\' to '/'.
path(s: &str) -> Self89     pub fn path(s: &str) -> Self {
90         let s = s.replace('\\', "/");
91         Self::from(s)
92     }
93 
94     /// Creates a normalized caseless path string.
95     /// Converts '\\' to '/'.
caseless_path(s: &str) -> Self96     pub fn caseless_path(s: &str) -> Self {
97         let s = s.replace('\\', "/");
98         Self::caseless(&s)
99     }
100 
101     /// Match `String::as_str()`.
as_str(&self) -> &str102     pub fn as_str(&self) -> &str {
103         &self.inner
104     }
105 }
106 
107 impl AsRef<[u8]> for Nfc {
as_ref(&self) -> &[u8]108     fn as_ref(&self) -> &[u8] {
109         self.inner.as_bytes()
110     }
111 }
112 
113 impl AsRef<str> for Nfc {
as_ref(&self) -> &str114     fn as_ref(&self) -> &str {
115         &self.inner
116     }
117 }
118 
119 impl From<&str> for Nfc {
from(s: &str) -> Self120     fn from(s: &str) -> Self {
121         Self::from(s.to_owned())
122     }
123 }
124 
125 /// Converts to a normalized string.
126 /// Consumes the original string.
127 impl From<String> for Nfc {
from(string: String) -> Self128     fn from(string: String) -> Self {
129         match is_nfc_quick(string.chars()) {
130             IsNormalized::Yes => Self { inner: string },
131             IsNormalized::Maybe if is_nfc(&string) => Self { inner: string },
132             _ => {
133                 let string = string.chars().nfc().collect();
134                 Self { inner: string }
135             }
136         }
137     }
138 }
139 
140 /// Unwraps the inner string.
141 impl Into<String> for Nfc {
into(self) -> String142     fn into(self) -> String {
143         self.inner
144     }
145 }
146 
147 /// Inherits all the methods of `str`.
148 impl ops::Deref for Nfc {
149     type Target = str;
150 
deref(&self) -> &str151     fn deref(&self) -> &str {
152         &self.inner
153     }
154 }
155 
156 /// Adds a string.
157 impl ops::Add<&str> for Nfc {
158     type Output = Self;
159 
add(self, other: &str) -> Self160     fn add(self, other: &str) -> Self {
161         let string = self.inner.chars().chain(other.chars()).nfc().collect();
162         Self { inner: string }
163     }
164 }
165 
166 /// Matches the inner string.
167 impl fmt::Display for Nfc {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result168     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
169         let display: &dyn fmt::Display = &self.inner;
170         display.fmt(f)
171     }
172 }
173 
174 /// Matches the inner string.
175 impl fmt::Debug for Nfc {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result176     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
177         let debug: &dyn fmt::Debug = &self.inner;
178         debug.fmt(f)
179     }
180 }
181