1 /* Copyright 2016 The encode_unicode Developers
2  *
3  * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4  * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5  * http://opensource.org/licenses/MIT>, at your option. This file may not be
6  * copied, modified, or distributed except according to those terms.
7  */
8 
9 use utf8_char::Utf8Char;
10 use errors::EmptyStrError;
11 extern crate core;
12 use self::core::{mem, u32, u64};
13 use self::core::ops::Not;
14 use self::core::fmt;
15 use self::core::borrow::Borrow;
16 #[cfg(feature="std")]
17 use std::io::{Read, Error as ioError};
18 
19 
20 
/// Read or iterate over the bytes of the UTF-8 representation of a codepoint.
///
/// Can be created from a `Utf8Char` or a `char` via `From`.
// Layout of the wrapped u32: the bytes that have not been returned yet, with
// the next one in the least significant position. Byte positions that hold no
// data are filled with 0xff, which `next()` treats as the end marker.
#[derive(Clone)]
pub struct Utf8Iterator (u32);
24 
25 impl From<Utf8Char> for Utf8Iterator {
from(uc: Utf8Char) -> Self26     fn from(uc: Utf8Char) -> Self {
27         let used = u32::from_le(unsafe{ mem::transmute(uc.to_array().0) });
28         // uses u64 because shifting an u32 by 32 bits is a no-op.
29         let unused_set = (u64::MAX  <<  uc.len() as u64*8) as u32;
30         Utf8Iterator(used | unused_set)
31     }
32 }
33 impl From<char> for Utf8Iterator {
from(c: char) -> Self34     fn from(c: char) -> Self {
35         Self::from(Utf8Char::from(c))
36     }
37 }
38 impl Iterator for Utf8Iterator {
39     type Item=u8;
next(&mut self) -> Option<u8>40     fn next(&mut self) -> Option<u8> {
41         let next = self.0 as u8;
42         if next == 0xff {
43             None
44         } else {
45             self.0 = (self.0 >> 8)  |  0xff_00_00_00;
46             Some(next)
47         }
48     }
size_hint(&self) -> (usize, Option<usize>)49     fn size_hint(&self) -> (usize, Option<usize>) {
50         (self.len(),  Some(self.len()))
51     }
52 }
53 impl ExactSizeIterator for Utf8Iterator {
len(&self) -> usize54     fn len(&self) -> usize {// not straightforward, but possible
55         let unused_bytes = self.0.not().leading_zeros() / 8;
56         4 - unused_bytes as usize
57     }
58 }
#[cfg(feature="std")]
impl Read for Utf8Iterator {
    /// Copies as many of the remaining bytes as fit into `buf` and returns
    /// how many were copied.
    ///
    /// Always returns Ok
    fn read(&mut self,  buf: &mut[u8]) -> Result<usize, ioError> {
        let mut copied = 0;
        // Walk the destination slots so that a byte is only taken out of the
        // iterator when there is room to store it.
        for slot in buf.iter_mut() {
            match self.next() {
                Some(byte) => {
                    *slot = byte;
                    copied += 1;
                },
                None => break,
            }
        }
        Ok(copied)
    }
}
73 impl fmt::Debug for Utf8Iterator {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result74     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
75         let mut content = [0; 4];
76         let mut i = 0;
77         for b in self.clone() {
78             content[i] = b;
79             i += 1;
80         }
81         write!(fmtr, "{:?}", &content[..i])
82     }
83 }
84 
85 
86 
/// Converts an iterator of `Utf8Char` (or `&Utf8Char`)
/// to an iterator of `u8`s.
/// Is equivalent to calling `.flat_map()` on the original iterator,
/// but the returned iterator is ~40% faster.
///
/// The iterator also implements `Read` (if the `std` feature isn't disabled).
/// Reading will never produce an error, and calls to `.read()` and `.next()`
/// can be mixed.
///
/// The exact number of bytes cannot be known in advance, but `size_hint()`
/// gives the possible range.
/// (min: all remaining characters are ASCII, max: all require four bytes)
///
/// # Examples
///
/// From iterator of values:
///
/// ```
/// use encode_unicode::{iter_bytes, CharExt};
///
/// let iterator = "foo".chars().map(|c| c.to_utf8() );
/// let mut bytes = [0; 4];
/// for (u,dst) in iter_bytes(iterator).zip(&mut bytes) {*dst=u;}
/// assert_eq!(&bytes, b"foo\0");
/// ```
///
/// From iterator of references:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_bytes, CharExt, Utf8Char};
///
/// let chars: Vec<Utf8Char> = "💣 bomb 💣".chars().map(|c| c.to_utf8() ).collect();
/// let bytes: Vec<u8> = iter_bytes(&chars).collect();
/// let flat_map: Vec<u8> = chars.iter().flat_map(|u8c| *u8c ).collect();
/// assert_eq!(bytes, flat_map);
/// ```
///
/// `Read`ing from it:
///
#[cfg_attr(feature="std", doc=" ```")]
#[cfg_attr(not(feature="std"), doc=" ```no_compile")]
/// use encode_unicode::{iter_bytes, CharExt};
/// use std::io::Read;
///
/// let s = "Ååh‽";
/// assert_eq!(s.len(), 8);
/// let mut buf = [b'E'; 9];
/// let mut reader = iter_bytes(s.chars().map(|c| c.to_utf8() ));
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 8);
/// assert_eq!(reader.read(&mut buf[..]).unwrap(), 0);
/// assert_eq!(&buf[..8], s.as_bytes());
/// assert_eq!(buf[8], b'E');
/// ```
pub fn iter_bytes<U:Borrow<Utf8Char>, I:IntoIterator<Item=U>>
(iterable: I) -> Utf8CharSplitter<U, I::IntoIter> {
    // `prev: 0` means that no bytes of a partially returned codepoint are pending.
    Utf8CharSplitter{ inner: iterable.into_iter(),  prev: 0 }
}
145 
/// The iterator type returned by `iter_bytes()`
///
/// See the documentation of `iter_bytes()` for details.
#[derive(Clone)]
pub struct Utf8CharSplitter<U:Borrow<Utf8Char>, I:Iterator<Item=U>> {
    /// The iterator that produces the codepoints to split into bytes.
    inner: I,
    /// The not yet returned bytes of the most recently fetched codepoint,
    /// with the next byte in the least significant position.
    /// Is zero when no bytes are pending.
    prev: u32,
}
154 impl<I:Iterator<Item=Utf8Char>> From<I> for Utf8CharSplitter<Utf8Char,I> {
155     /// A less generic constructor than `iter_bytes()`
from(iter: I) -> Self156     fn from(iter: I) -> Self {
157         iter_bytes(iter)
158     }
159 }
160 impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Utf8CharSplitter<U,I> {
161     /// Extracts the source iterator.
162     ///
163     /// Note that `iter_bytes(iter.into_inner())` is not a no-op:
164     /// If the last returned byte from `next()` was not an ASCII by,
165     /// the remaining bytes of that codepoint is lost.
into_inner(self) -> I166     pub fn into_inner(self) -> I {
167         self.inner
168     }
169 }
170 impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Iterator for Utf8CharSplitter<U,I> {
171     type Item = u8;
next(&mut self) -> Option<Self::Item>172     fn next(&mut self) -> Option<Self::Item> {
173         if self.prev == 0 {
174             self.inner.next().map(|u8c| {
175                 let array = u8c.borrow().to_array().0;
176                 self.prev = unsafe{ u32::from_le(mem::transmute(array)) } >> 8;
177                 array[0]
178             })
179         } else {
180             let next = self.prev as u8;
181             self.prev >>= 8;
182             Some(next)
183         }
184     }
size_hint(&self) -> (usize,Option<usize>)185     fn size_hint(&self) -> (usize,Option<usize>) {
186         // Doesn't need to handle unlikely overflows correctly because
187         // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
188         let (min, max) = self.inner.size_hint();
189         let add = 4 - (self.prev.leading_zeros() / 8) as usize;
190         (min.wrapping_add(add), max.map(|max| max.wrapping_mul(4).wrapping_add(add) ))
191     }
192 }
#[cfg(feature="std")]
impl<U:Borrow<Utf8Char>, I:Iterator<Item=U>> Read for Utf8CharSplitter<U,I> {
    /// Fills `buf` with as many bytes as are available and fit, and returns
    /// the number of bytes written.
    ///
    /// If `buf` becomes full in the middle of a codepoint, the rest of that
    /// codepoint is kept and returned by the next call to `read()` or `next()`.
    ///
    /// Always returns `Ok`
    fn read(&mut self,  buf: &mut[u8]) -> Result<usize, ioError> {
        let mut filled = 0;
        // First write the bytes left over from the previous codepoint.
        while self.prev != 0  &&  filled < buf.len() {
            buf[filled] = self.prev as u8;
            self.prev >>= 8;
            filled += 1;
        }
        // Then write whole codepoints for as long as there is room.
        while filled < buf.len() {
            let bytes = match self.inner.next() {
                Some(u8c) => u8c.borrow().to_array().0,
                None => break
            };
            buf[filled] = bytes[0];
            filled += 1;
            if bytes[1] == 0 {
                // Single-byte codepoint; nothing more to write.
                continue;
            }
            // The number of leading one bits in the first byte of a multi-byte
            // sequence is the length of the sequence.
            let len = (!bytes[0]).leading_zeros() as usize;
            let mut written = 1;
            while written < len {
                if filled >= buf.len() {
                    // Out of room: keep the unwritten bytes for later, with
                    // the next byte in the least significant position.
                    // Composed by hand instead of transmuting, so that no
                    // `unsafe` is needed.
                    let bytes_as_u32 = (bytes[0] as u32)
                                     | ((bytes[1] as u32) << 8)
                                     | ((bytes[2] as u32) << 16)
                                     | ((bytes[3] as u32) << 24);
                    self.prev = bytes_as_u32 >> (8*written);
                    return Ok(filled);
                }
                buf[filled] = bytes[written];
                filled += 1;
                written += 1;
            }
        }
        Ok(filled)
    }
}
231 
232 
233 
/// An iterator over the `Utf8Char` of a string slice, and their positions.
///
/// This struct is created by the `utf8char_indices()` method from the [`StrExt`] trait.
/// See its documentation for more.
#[derive(Clone)]
pub struct Utf8CharIndices<'a>{
    /// The source string slice; `next_back()` shortens it from the end.
    str: &'a str,
    /// Byte offset into `str` of the next codepoint returned by `next()`.
    index: usize,
}
242 impl<'a> From<&'a str> for Utf8CharIndices<'a> {
from(s: &str) -> Utf8CharIndices243     fn from(s: &str) -> Utf8CharIndices {
244         Utf8CharIndices{str: s, index: 0}
245     }
246 }
247 impl<'a> Utf8CharIndices<'a> {
248     /// Extract the remainder of the source `str`.
249     ///
250     /// # Examples
251     ///
252     /// ```
253     /// use encode_unicode::{StrExt, Utf8Char};
254     /// let mut iter = "abc".utf8char_indices();
255     /// assert_eq!(iter.next_back(), Some((2, Utf8Char::from('c'))));
256     /// assert_eq!(iter.next(), Some((0, Utf8Char::from('a'))));
257     /// assert_eq!(iter.as_str(), "b");
258     /// ```
as_str(&self) -> &'a str259     pub fn as_str(&self) -> &'a str {
260         &self.str[self.index..]
261     }
262 }
263 impl<'a> Iterator for Utf8CharIndices<'a> {
264     type Item = (usize,Utf8Char);
next(&mut self) -> Option<(usize,Utf8Char)>265     fn next(&mut self) -> Option<(usize,Utf8Char)> {
266         match Utf8Char::from_str_start(&self.str[self.index..]) {
267             Ok((u8c, len)) => {
268                 let item = (self.index, u8c);
269                 self.index += len;
270                 Some(item)
271             },
272             Err(EmptyStrError) => None
273         }
274     }
size_hint(&self) -> (usize,Option<usize>)275     fn size_hint(&self) -> (usize,Option<usize>) {
276         let len = self.str.len() - self.index;
277         // For len+3 to overflow, the slice must fill all but two bytes of
278         // addressable memory, and size_hint() doesn't need to be correct.
279         (len.wrapping_add(3)/4, Some(len))
280     }
281 }
282 impl<'a> DoubleEndedIterator for Utf8CharIndices<'a> {
next_back(&mut self) -> Option<(usize,Utf8Char)>283     fn next_back(&mut self) -> Option<(usize,Utf8Char)> {
284         // Cannot refactor out the unwrap without switching to ::from_slice()
285         // since slicing the str panics if not on a boundary.
286         if self.index < self.str.len() {
287             let rev = self.str.bytes().rev();
288             let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
289             let starts = self.str.len() - len;
290             let (u8c,_) = Utf8Char::from_str_start(&self.str[starts..]).unwrap();
291             self.str = &self.str[..starts];
292             Some((starts, u8c))
293         } else {
294             None
295         }
296     }
297 }
298 impl<'a> fmt::Debug for Utf8CharIndices<'a> {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result299     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
300         fmtr.debug_tuple("Utf8CharIndices")
301             .field(&self.index)
302             .field(&self.as_str())
303             .finish()
304     }
305 }
306 
307 
/// An iterator over the codepoints in a `str` represented as `Utf8Char`.
///
/// Wraps a `Utf8CharIndices` and leaves out the positions.
#[derive(Clone)]
pub struct Utf8Chars<'a>(Utf8CharIndices<'a>);
311 impl<'a> From<&'a str> for Utf8Chars<'a> {
from(s: &str) -> Utf8Chars312     fn from(s: &str) -> Utf8Chars {
313         Utf8Chars(Utf8CharIndices::from(s))
314     }
315 }
316 impl<'a> Utf8Chars<'a> {
317     /// Extract the remainder of the source `str`.
318     ///
319     /// # Examples
320     ///
321     /// ```
322     /// use encode_unicode::{StrExt, Utf8Char};
323     /// let mut iter = "abc".utf8chars();
324     /// assert_eq!(iter.next(), Some(Utf8Char::from('a')));
325     /// assert_eq!(iter.next_back(), Some(Utf8Char::from('c')));
326     /// assert_eq!(iter.as_str(), "b");
327     /// ```
as_str(&self) -> &'a str328     pub fn as_str(&self) -> &'a str {
329         self.0.as_str()
330     }
331 }
332 impl<'a> Iterator for Utf8Chars<'a> {
333     type Item = Utf8Char;
next(&mut self) -> Option<Utf8Char>334     fn next(&mut self) -> Option<Utf8Char> {
335         self.0.next().map(|(_,u8c)| u8c )
336     }
size_hint(&self) -> (usize,Option<usize>)337     fn size_hint(&self) -> (usize,Option<usize>) {
338         self.0.size_hint()
339     }
340 }
341 impl<'a> DoubleEndedIterator for Utf8Chars<'a> {
next_back(&mut self) -> Option<Utf8Char>342     fn next_back(&mut self) -> Option<Utf8Char> {
343         self.0.next_back().map(|(_,u8c)| u8c )
344     }
345 }
346 impl<'a> fmt::Debug for Utf8Chars<'a> {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result347     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
348         fmtr.debug_tuple("Utf8CharIndices")
349             .field(&self.as_str())
350             .finish()
351     }
352 }
353