1 /* Copyright 2016 The encode_unicode Developers
2  *
3  * Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4  * http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5  * http://opensource.org/licenses/MIT>, at your option. This file may not be
6  * copied, modified, or distributed except according to those terms.
7  */
8 
9 use traits::CharExt;
10 use utf16_char::Utf16Char;
11 use errors::EmptyStrError;
12 extern crate core;
13 use self::core::fmt;
14 use self::core::borrow::Borrow;
15 
// Sentinel values marking a field of `Utf16Iterator` as consumed or empty.
// Neither can collide with a real unit stored in that field: 0xDC00 is a
// trailing surrogate and is therefore never a valid first unit, and a second
// unit is always a trailing surrogate, so it can never be zero.
const FIRST_USED: u16 = 0xDC00;
const SECOND_USED: u16 = 0x0000;

/// Iterate over the units of the UTF-16 representation of a codepoint.
#[derive(Clone)]
pub struct Utf16Iterator {
    /// The first unit, or `FIRST_USED` once it has been returned.
    first: u16,
    /// The second unit, or `SECOND_USED` if it is absent or already returned.
    second: u16,
}
26 impl From<char> for Utf16Iterator {
from(c: char) -> Self27     fn from(c: char) -> Self {
28         let (first, second) = c.to_utf16_tuple();
29         Utf16Iterator{ first: first,  second: second.unwrap_or(SECOND_USED) }
30     }
31 }
32 impl From<Utf16Char> for Utf16Iterator {
from(uc: Utf16Char) -> Self33     fn from(uc: Utf16Char) -> Self {
34         let (first, second) = uc.to_tuple();
35         Utf16Iterator{ first: first,  second: second.unwrap_or(SECOND_USED) }
36     }
37 }
38 impl Iterator for Utf16Iterator {
39     type Item=u16;
next(&mut self) -> Option<u16>40     fn next(&mut self) -> Option<u16> {
41         match (self.first, self.second) {
42             (FIRST_USED, SECOND_USED)  =>  {                            None        },
43             (FIRST_USED, second     )  =>  {self.second = SECOND_USED;  Some(second)},
44             (first     ,      _     )  =>  {self.first = FIRST_USED;    Some(first )},
45         }
46     }
size_hint(&self) -> (usize, Option<usize>)47     fn size_hint(&self) -> (usize, Option<usize>) {
48         (self.len(), Some(self.len()))
49     }
50 }
51 impl ExactSizeIterator for Utf16Iterator {
len(&self) -> usize52     fn len(&self) -> usize {
53         (if self.first == FIRST_USED {0} else {1}) +
54         (if self.second == SECOND_USED {0} else {1})
55     }
56 }
57 impl fmt::Debug for Utf16Iterator {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result58     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
59         let mut clone = self.clone();
60         match (clone.next(), clone.next()) {
61             (Some(one), None)  => write!(fmtr, "[{}]", one),
62             (Some(a), Some(b)) => write!(fmtr, "[{}, {}]", a, b),
63             (None,  _)         => write!(fmtr, "[]"),
64         }
65     }
66 }
67 
68 
69 
70 /// Converts an iterator of `Utf16Char` (or `&Utf16Char`)
71 /// to an iterator of `u16`s.
72 /// Is equivalent to calling `.flat_map()` on the original iterator,
73 /// but the returned iterator is about twice as fast.
74 ///
75 /// The exact number of units cannot be known in advance, but `size_hint()`
76 /// gives the possible range.
77 ///
78 /// # Examples
79 ///
80 /// From iterator of values:
81 ///
82 /// ```
83 /// use encode_unicode::{iter_units, CharExt};
84 ///
85 /// let iterator = "foo".chars().map(|c| c.to_utf16() );
86 /// let mut units = [0; 4];
87 /// for (u,dst) in iter_units(iterator).zip(&mut units) {*dst=u;}
88 /// assert_eq!(units, ['f' as u16, 'o' as u16, 'o' as u16, 0]);
89 /// ```
90 ///
91 /// From iterator of references:
92 ///
93 #[cfg_attr(feature="std", doc=" ```")]
94 #[cfg_attr(not(feature="std"), doc=" ```no_compile")]
95 /// use encode_unicode::{iter_units, CharExt, Utf16Char};
96 ///
/// // (💣 takes two units)
/// let chars: Vec<Utf16Char> = "💣 bomb 💣".chars().map(|c| c.to_utf16() ).collect();
99 /// let units: Vec<u16> = iter_units(&chars).collect();
100 /// let flat_map: Vec<u16> = chars.iter().flat_map(|u16c| *u16c ).collect();
101 /// assert_eq!(units, flat_map);
102 /// ```
iter_units<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>> (iterable: I) -> Utf16CharSplitter<U, I::IntoIter>103 pub fn iter_units<U:Borrow<Utf16Char>, I:IntoIterator<Item=U>>
104 (iterable: I) -> Utf16CharSplitter<U, I::IntoIter> {
105     Utf16CharSplitter{ inner: iterable.into_iter(),  prev_second: 0 }
106 }
107 
108 /// The iterator type returned by `iter_units()`
109 #[derive(Clone)]
110 pub struct Utf16CharSplitter<U:Borrow<Utf16Char>, I:Iterator<Item=U>> {
111     inner: I,
112     prev_second: u16,
113 }
114 impl<I:Iterator<Item=Utf16Char>> From<I> for Utf16CharSplitter<Utf16Char,I> {
115     /// A less generic constructor than `iter_units()`
from(iter: I) -> Self116     fn from(iter: I) -> Self {
117         iter_units(iter)
118     }
119 }
120 impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Utf16CharSplitter<U,I> {
121     /// Extracts the source iterator.
122     ///
123     /// Note that `iter_units(iter.into_inner())` is not a no-op:
124     /// If the last returned unit from `next()` was a leading surrogate,
125     /// the trailing surrogate is lost.
into_inner(self) -> I126     pub fn into_inner(self) -> I {
127         self.inner
128     }
129 }
130 impl<U:Borrow<Utf16Char>, I:Iterator<Item=U>> Iterator for Utf16CharSplitter<U,I> {
131     type Item = u16;
next(&mut self) -> Option<Self::Item>132     fn next(&mut self) -> Option<Self::Item> {
133         if self.prev_second == 0 {
134             self.inner.next().map(|u16c| {
135                 let units = u16c.borrow().to_array();
136                 self.prev_second = units[1];
137                 units[0]
138             })
139         } else {
140             let prev_second = self.prev_second;
141             self.prev_second = 0;
142             Some(prev_second)
143         }
144     }
size_hint(&self) -> (usize,Option<usize>)145     fn size_hint(&self) -> (usize,Option<usize>) {
146         // Doesn't need to handle unlikely overflows correctly because
147         // size_hint() cannot be relied upon anyway. (the trait isn't unsafe)
148         let (min, max) = self.inner.size_hint();
149         let add = if self.prev_second == 0 {0} else {1};
150         (min.wrapping_add(add), max.map(|max| max.wrapping_mul(2).wrapping_add(add) ))
151     }
152 }
153 
154 
155 
/// An iterator over the codepoints in a `str` represented as `Utf16Char`,
/// together with the byte offset at which each codepoint starts.
#[derive(Clone)]
pub struct Utf16CharIndices<'a>{
    // The source string; `next_back()` shortens it from the end.
    str: &'a str,
    // Byte offset into `str` of the next codepoint returned by `next()`.
    index: usize,
}
162 impl<'a> From<&'a str> for Utf16CharIndices<'a> {
from(s: &str) -> Utf16CharIndices163     fn from(s: &str) -> Utf16CharIndices {
164         Utf16CharIndices{str: s, index: 0}
165     }
166 }
167 impl<'a> Utf16CharIndices<'a> {
168     /// Extract the remainder of the source `str`.
169     ///
170     /// # Examples
171     ///
172     /// ```
173     /// use encode_unicode::{StrExt, Utf16Char};
174     /// let mut iter = "abc".utf16char_indices();
175     /// assert_eq!(iter.next_back(), Some((2, Utf16Char::from('c'))));
176     /// assert_eq!(iter.next(), Some((0, Utf16Char::from('a'))));
177     /// assert_eq!(iter.as_str(), "b");
178     /// ```
as_str(&self) -> &'a str179     pub fn as_str(&self) -> &'a str {
180         &self.str[self.index..]
181     }
182 }
183 impl<'a> Iterator for Utf16CharIndices<'a> {
184     type Item = (usize,Utf16Char);
next(&mut self) -> Option<(usize,Utf16Char)>185     fn next(&mut self) -> Option<(usize,Utf16Char)> {
186         match Utf16Char::from_str_start(&self.str[self.index..]) {
187             Ok((u16c, bytes)) => {
188                 let item = (self.index, u16c);
189                 self.index += bytes;
190                 Some(item)
191             },
192             Err(EmptyStrError) => None
193         }
194     }
size_hint(&self) -> (usize,Option<usize>)195     fn size_hint(&self) -> (usize,Option<usize>) {
196         let len = self.str.len() - self.index;
197         // For len+3 to overflow, the slice must fill all but two bytes of
198         // addressable memory, and size_hint() doesn't need to be correct.
199         (len.wrapping_add(3)/4, Some(len))
200     }
201 }
202 impl<'a> DoubleEndedIterator for Utf16CharIndices<'a> {
next_back(&mut self) -> Option<(usize,Utf16Char)>203     fn next_back(&mut self) -> Option<(usize,Utf16Char)> {
204         if self.index < self.str.len() {
205             let rev = self.str.bytes().rev();
206             let len = 1 + rev.take_while(|b| b & 0b1100_0000 == 0b1000_0000 ).count();
207             let starts = self.str.len() - len;
208             let (u16c,_) = Utf16Char::from_str_start(&self.str[starts..]).unwrap();
209             self.str = &self.str[..starts];
210             Some((starts, u16c))
211         } else {
212             None
213         }
214     }
215 }
216 impl<'a> fmt::Debug for Utf16CharIndices<'a> {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result217     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
218         fmtr.debug_tuple("Utf16CharIndices")
219             .field(&self.index)
220             .field(&self.as_str())
221             .finish()
222     }
223 }
224 
225 
/// An iterator over the codepoints in a `str` represented as `Utf16Char`.
///
/// Wraps a `Utf16CharIndices` and discards the byte offsets it produces.
#[derive(Clone)]
pub struct Utf16Chars<'a>(Utf16CharIndices<'a>);
229 impl<'a> From<&'a str> for Utf16Chars<'a> {
from(s: &str) -> Utf16Chars230     fn from(s: &str) -> Utf16Chars {
231         Utf16Chars(Utf16CharIndices::from(s))
232     }
233 }
234 impl<'a> Utf16Chars<'a> {
235     /// Extract the remainder of the source `str`.
236     ///
237     /// # Examples
238     ///
239     /// ```
240     /// use encode_unicode::{StrExt, Utf16Char};
241     /// let mut iter = "abc".utf16chars();
242     /// assert_eq!(iter.next(), Some(Utf16Char::from('a')));
243     /// assert_eq!(iter.next_back(), Some(Utf16Char::from('c')));
244     /// assert_eq!(iter.as_str(), "b");
245     /// ```
as_str(&self) -> &'a str246     pub fn as_str(&self) -> &'a str {
247         self.0.as_str()
248     }
249 }
250 impl<'a> Iterator for Utf16Chars<'a> {
251     type Item = Utf16Char;
next(&mut self) -> Option<Utf16Char>252     fn next(&mut self) -> Option<Utf16Char> {
253         self.0.next().map(|(_,u16c)| u16c )
254     }
size_hint(&self) -> (usize,Option<usize>)255     fn size_hint(&self) -> (usize,Option<usize>) {
256         self.0.size_hint()
257     }
258 }
259 impl<'a> DoubleEndedIterator for Utf16Chars<'a> {
next_back(&mut self) -> Option<Utf16Char>260     fn next_back(&mut self) -> Option<Utf16Char> {
261         self.0.next_back().map(|(_,u16c)| u16c )
262     }
263 }
264 impl<'a> fmt::Debug for Utf16Chars<'a> {
fmt(&self, fmtr: &mut fmt::Formatter) -> fmt::Result265     fn fmt(&self,  fmtr: &mut fmt::Formatter) -> fmt::Result {
266         fmtr.debug_tuple("Utf16Chars")
267             .field(&self.as_str())
268             .finish()
269     }
270 }
271