1 /* Copyright 2016 The encode_unicode Developers
2  *
3  * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4  * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5  * http://opensource.org/licenses/MIT>, at your option. This file may not be
6  * copied, modified, or distributed except according to those terms.
7  */
8 
9 //! Test that every method gives the correct result for valid values.
10 //! Except iterators, which are stateful.
11 
12 use std::char;
13 use std::str::{self,FromStr};
14 use std::cmp::Ordering;
15 use std::hash::{Hash,Hasher};
16 use std::collections::hash_map::DefaultHasher;
17 #[allow(deprecated,unused)]
18 use std::ascii::AsciiExt;
19 use std::iter::FromIterator;
20 extern crate encode_unicode;
21 use encode_unicode::*;
22 
23 
/// The default value of both encoded character types must decode to the
/// default `char`.
#[test]
fn equal_defaults() {
    let utf8_default = Utf8Char::default();
    assert_eq!(utf8_default.to_char(), char::default());

    let utf16_default = Utf16Char::default();
    assert_eq!(utf16_default.to_char(), char::default());
}
29 
/// `Utf8Char` and `Utf16Char` are both expected to take up exactly as many
/// bytes as a `char`.
#[test]
fn same_size_as_char() {
    use std::mem;

    let utf8_size = mem::size_of::<Utf8Char>();
    assert_eq!(utf8_size, mem::size_of::<char>());

    let utf16_size = mem::size_of::<Utf16Char>();
    assert_eq!(utf16_size, mem::size_of::<char>());
}
36 
/// A `String` can be collected from, and extended with, `Utf16Char`s taken
/// either by reference or by value.
#[test]
fn utf16chars_to_string() {
    let text = "aå\u{10ffff}‽\u{100000}\u{fee1}";
    let units: Vec<Utf16Char> = text.chars().map(Utf16Char::from).collect();

    // Items by reference: collect once, then append the same sequence again.
    let mut by_ref = units.iter().collect::<String>();
    assert_eq!(&by_ref, text);
    by_ref.extend(units.iter());
    assert_eq!(&by_ref[text.len()..], text);

    // Items by value: same procedure; the final `extend` consumes the vector.
    let mut by_val = units.iter().cloned().collect::<String>();
    assert_eq!(&by_val, text);
    by_val.extend(units.into_iter());
    assert_eq!(&by_val[text.len()..], text);
}
52 
53 
/// Code points sitting on the boundaries between encoded lengths (and on the
/// edges of the ASCII letter ranges), plus a few values from inside each range.
const EDGES_AND_BETWEEN: [char; 19] = [
    '\0',         // U+0000: lowest code point
    ';',          // U+003B: ASCII, between the extremes
    'A',          // first ASCII uppercase letter
    'N',          // an ASCII uppercase letter between 'A' and 'Z'
    'Z',          // last ASCII uppercase letter
    'a',          // first ASCII lowercase letter
    'm',          // an ASCII lowercase letter between 'a' and 'z'
    'z',          // last ASCII lowercase letter
    '\x7f',       // U+007F: last ASCII value and last 1-byte UTF-8 value
    '\u{0080}',   // first 2-byte UTF-8 value
    '\u{0111}',   // inside the 2-byte UTF-8 range
    '\u{07ff}',   // last 2-byte UTF-8 value
    '\u{0800}',   // first 3-byte UTF-8 value
    '\u{d7ff}',   // last code point before the reserved (surrogate) range
    '\u{e000}',   // first code point after the reserved (surrogate) range
    '\u{ffff}',   // last single-unit UTF-16 value and last 3-byte UTF-8 value
    '\u{010000}', // first value needing a UTF-16 surrogate pair and 4 UTF-8 bytes
    '\u{0abcde}', // inside the 4-byte UTF-8 range
    '\u{10ffff}', // highest code point
];
75 
eq_cmp_hash(c: char) -> (Utf8Char, Utf16Char)76 fn eq_cmp_hash(c: char) -> (Utf8Char, Utf16Char) {
77     fn hash<T:Hash>(v: T) -> u64 {
78         #[allow(deprecated)]
79         let mut hasher = DefaultHasher::new();
80         v.hash(&mut hasher);
81         hasher.finish()
82     }
83     let u8c = c.to_utf8();
84     assert_eq!(u8c.to_char(), c);
85     assert_eq!(u8c, u8c);
86     assert_eq!(hash(u8c), hash(u8c));
87     assert_eq!(u8c.cmp(&u8c), Ordering::Equal);
88     assert!(u8c.eq_ignore_ascii_case(&u8c));
89     let u16c = c.to_utf16();
90     assert_eq!(u16c.to_char(), c);
91     assert_eq!(u16c, u16c);
92     assert_eq!(hash(u16c), hash(c));
93     assert_eq!(u16c.cmp(&u16c), Ordering::Equal);
94     assert!(u16c.eq_ignore_ascii_case(&u16c));
95 
96     assert_eq!(u8c, c);
97     assert_eq!(c, u8c);
98     assert_eq!(u16c, c);
99     assert_eq!(c, u16c);
100     assert_eq!(u8c, u16c);
101     assert_eq!(u16c, u8c);
102     assert_eq!(u8c == c as u8,  c <= '\u{7F}');
103     assert_eq!(u16c == c as u8,  c <= '\u{FF}');
104     assert_eq!(u16c == c as u16,  c <= '\u{FFFF}');
105 
106     assert_eq!(u8c.partial_cmp(&c), Some(Ordering::Equal));
107     assert_eq!(c.partial_cmp(&u8c), Some(Ordering::Equal));
108     assert_eq!(u16c.partial_cmp(&c), Some(Ordering::Equal));
109     assert_eq!(c.partial_cmp(&u16c), Some(Ordering::Equal));
110     assert_eq!(u8c.partial_cmp(&u16c), Some(Ordering::Equal));
111     assert_eq!(u16c.partial_cmp(&u8c), Some(Ordering::Equal));
112 
113 
114     for &other in &EDGES_AND_BETWEEN {
115         let u8other = other.to_utf8();
116         assert_eq!(u8c == u8other,  c == other);
117         assert_eq!(hash(u8c)==hash(u8other),  hash(c)==hash(other));
118         assert_eq!(u8c.cmp(&u8other), c.cmp(&other));
119         assert_eq!(u8c.eq_ignore_ascii_case(&u8other), c.eq_ignore_ascii_case(&other));
120         assert_eq!(u8c.partial_cmp(&other), c.partial_cmp(&other));
121         assert_eq!(c.partial_cmp(&u8other), c.partial_cmp(&other));
122         assert_eq!(u8other.partial_cmp(&c), other.partial_cmp(&c));
123         assert_eq!(other.partial_cmp(&u8c), other.partial_cmp(&c));
124         assert_eq!(u8c == other as u8,  other as u8 <= 127 && c == other as u8 as char);
125 
126         let u16other = other.to_utf16();
127         assert_eq!(u16c == u16other,  c == other);
128         assert_eq!(hash(u16c)==hash(u16other),  hash(c)==hash(other));
129         assert_eq!(u16c.cmp(&u16other), c.cmp(&other));
130         assert_eq!(u16c.eq_ignore_ascii_case(&u16other), c.eq_ignore_ascii_case(&other));
131         assert_eq!(u16c.partial_cmp(&other), c.partial_cmp(&other));
132         assert_eq!(c.partial_cmp(&u16other), c.partial_cmp(&other));
133         assert_eq!(u16other.partial_cmp(&c), other.partial_cmp(&c));
134         assert_eq!(other.partial_cmp(&u16c), other.partial_cmp(&c));
135         assert_eq!(u16c == other as u8,  c == other as u8 as char);
136         assert_eq!(u16c == other as u16,  c as u32 == other as u16 as u32);
137 
138         assert_eq!(u8c == u16other,  c == other);
139         assert_eq!(u16c == u8other,  c == other);
140         assert_eq!(u8c.partial_cmp(&u16other),  c.partial_cmp(&other));
141         assert_eq!(u16c.partial_cmp(&u8other),  c.partial_cmp(&other));
142         assert_eq!(u8other.partial_cmp(&u16c),  other.partial_cmp(&c));
143         assert_eq!(u16other.partial_cmp(&u8c),  other.partial_cmp(&c));
144     }
145     (u8c, u16c)
146 }
147 
iterators(c: char)148 fn iterators(c: char) {
149     let mut iter = c.iter_utf8_bytes();
150     let mut buf = [0; 4];
151     let mut iter_ref = c.encode_utf8(&mut buf[..]).as_bytes().iter();
152     for _ in 0..6 {
153         assert_eq!(iter.size_hint(), iter_ref.size_hint());
154         assert_eq!(format!("{:?}", iter), format!("{:?}", iter_ref.as_slice()));
155         assert_eq!(iter.next(), iter_ref.next().cloned());
156     }
157 
158     let mut iter = c.iter_utf16_units();
159     let mut buf = [0; 2];
160     let mut iter_ref = c.encode_utf16(&mut buf[..]).iter();
161     for _ in 0..4 {
162         assert_eq!(iter.size_hint(), iter_ref.size_hint());
163         assert_eq!(format!("{:?}", iter), format!("{:?}", iter_ref.as_slice()));
164         assert_eq!(iter.next(), iter_ref.next().cloned());
165     }
166 }
167 
test(c: char)168 fn test(c: char) {
169     assert_eq!(char::from_u32(c as u32), Some(c));
170     assert_eq!(char::from_u32_detailed(c as u32), Ok(c));
171     assert_eq!(unsafe{ char::from_u32_unchecked(c as u32) }, c);
172     let (u8c, u16c) = eq_cmp_hash(c);
173     iterators(c);
174     assert_eq!(Utf16Char::from(u8c), u16c);
175     assert_eq!(Utf8Char::from(u16c), u8c);
176     let utf8_len = c.len_utf8();
177     let utf16_len = c.len_utf16();
178     let mut as_str = c.to_string();
179 
180     // UTF-8
181     let mut buf = [0; 4];
182     let reference = c.encode_utf8(&mut buf[..]).as_bytes();
183     let len = reference.len(); // short name because it is used in many places.
184     assert_eq!(len, utf8_len);
185     assert_eq!(reference[0].extra_utf8_bytes(), Ok(len-1));
186     assert_eq!(reference[0].extra_utf8_bytes_unchecked(), len-1);
187     assert_eq!(AsRef::<[u8]>::as_ref(&u8c), reference);
188 
189     let (arr,arrlen) = u8c.to_array();
190     assert_eq!(arrlen, len);
191     assert_eq!(Utf8Char::from_array(arr), Ok(u8c));
192     assert_eq!(c.to_utf8_array(),  (arr, len));
193 
194     let str_ = str::from_utf8(reference).unwrap();
195     let ustr = Utf8Char::from_str(str_).unwrap();
196     assert_eq!(ustr.to_array().0, arr);// bitwise equality
197     assert_eq!(char::from_utf8_array(arr), Ok(c));
198     let mut longer = [0xff; 5]; // 0xff is never valid
199     longer[..len].copy_from_slice(reference);
200     assert_eq!(char::from_utf8_slice_start(reference), Ok((c,len)));
201     assert_eq!(char::from_utf8_slice_start(&longer), Ok((c,len)));
202     assert_eq!(Utf8Char::from_slice_start(reference), Ok((u8c,len)));
203     assert_eq!(Utf8Char::from_slice_start(&longer), Ok((u8c,len)));
204     for other in &mut longer[len..] {*other = b'?'}
205     assert_eq!(Utf8Char::from_str(str_), Ok(u8c));
206     assert_eq!(Utf8Char::from_str_start(str_), Ok((u8c,len)));
207     assert_eq!(Utf8Char::from_str_start(str::from_utf8(&longer).unwrap()), Ok((u8c,len)));
208     unsafe {
209         // Hopefully make bugs easier to catch by making reads into unallocated memory by filling
210         // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
211         // I have no idea whether this works.
212         let mut boxed = Box::new([0xffu8; 16]);
213         let start = boxed.len()-len; // reach the end
214         boxed[start..].copy_from_slice(reference);
215         let slice = &boxed[start..start]; // length of slice should be ignored.
216         assert_eq!(Utf8Char::from_slice_start_unchecked(slice), (u8c,len));
217     }
218     assert_eq!(&Vec::<u8>::from_iter(Some(u8c))[..], reference);
219     assert_eq!(&String::from_iter(Some(u8c))[..], str_);
220     assert_eq!(format!("{:?}", u8c), format!("{:?}", c));
221     assert_eq!(format!("{}", u8c), format!("{}", c));
222     assert_eq!(u8c.is_ascii(), c.is_ascii());
223     assert_eq!(u8c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
224     assert_eq!(u8c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
225 
226     // UTF-16
227     let mut buf = [0; 2];
228     let reference = c.encode_utf16(&mut buf[..]);
229     let len = reference.len();
230     assert_eq!(len, utf16_len);
231     assert_eq!(reference[0].utf16_needs_extra_unit(), Ok(len==2));
232     assert_eq!(reference[0].is_utf16_leading_surrogate(), len==2);
233     assert_eq!(u16c.as_ref(), reference);
234     let mut longer = [0; 3];
235     longer[..len].copy_from_slice(reference);
236     assert_eq!(char::from_utf16_slice_start(reference), Ok((c,len)));
237     assert_eq!(char::from_utf16_slice_start(&longer), Ok((c,len)));
238     assert_eq!(Utf16Char::from_slice_start(reference), Ok((u16c,len)));
239     assert_eq!(Utf16Char::from_slice_start(&longer), Ok((u16c,len)));
240     assert_eq!(Utf16Char::from_str(&as_str), Ok(u16c));
241     as_str.push(c);
242     assert_eq!(Utf16Char::from_str_start(&as_str), Ok((u16c,utf8_len)));
243     unsafe {
244         // Hopefully make bugs easier to catch by making reads into unallocated memory by filling
245         // a jemalloc bin. See table on http://jemalloc.net/jemalloc.3.html for bin sizes.
246         // I have no idea whether this works.
247         let mut boxed = Box::new([0u16; 8]);
248         let start = boxed.len()-len; // reach the end
249         boxed[start..].copy_from_slice(reference);
250         let slice = &boxed[start..start]; // length of slice should be ignored.
251         assert_eq!(Utf16Char::from_slice_start_unchecked(slice), (u16c,len));
252     }
253     let array = c.to_utf16_array();
254     let tuple = c.to_utf16_tuple();
255     assert_eq!(&array[..reference.len()], reference);
256     assert_eq!(tuple, (reference[0],reference.get(1).cloned()));
257     assert_eq!(char::from_utf16_array(array), Ok(c));
258     assert_eq!(char::from_utf16_tuple(tuple), Ok(c));
259     assert_eq!(c.to_utf16().to_char(), c);
260     assert_eq!(&Vec::<u16>::from_iter(Some(u16c))[..], reference);
261     assert_eq!(format!("{:?}", u16c), format!("{:?}", c));
262     assert_eq!(format!("{}", u16c), format!("{}", c));
263     assert_eq!(u16c.is_ascii(), c.is_ascii());
264     assert_eq!(u16c.to_ascii_lowercase().to_char(), c.to_ascii_lowercase());
265     assert_eq!(u16c.to_ascii_uppercase().to_char(), c.to_ascii_uppercase());
266 }
267 
268 
/// Runs `test()` on every code point listed in `EDGES_AND_BETWEEN`.
#[test]
fn edges_middle() {
    for edge in EDGES_AND_BETWEEN.iter().cloned() {
        test(edge);
    }
}
275 
276 
/// Runs `test()` on every code point outside the 0xd800..0xe000 gap.
///
/// Marked `#[ignore]`, so it only runs when ignored tests are requested
/// (presumably because covering every code point is slow).
#[test]
#[ignore]
fn all() {
    let below_gap = 0..0xd800;
    let above_gap = 0xe000..0x110000;
    for code_point in below_gap.chain(above_gap) {
        let valid = char::from_u32(code_point).expect("not a valid char");
        test(valid);
    }
}
285