1 use std::char;
2 use std::cmp::Ordering;
3 use std::fmt;
4 use std::ops;
5 use std::u32;
6 
7 use syntax;
8 
9 use literal::LiteralSearcher;
10 use prog::InstEmptyLook;
11 use utf8::{decode_last_utf8, decode_utf8};
12 
13 /// Represents a location in the input.
14 #[derive(Clone, Copy, Debug)]
15 pub struct InputAt {
16     pos: usize,
17     c: Char,
18     byte: Option<u8>,
19     len: usize,
20 }
21 
22 impl InputAt {
23     /// Returns true iff this position is at the beginning of the input.
is_start(&self) -> bool24     pub fn is_start(&self) -> bool {
25         self.pos == 0
26     }
27 
28     /// Returns true iff this position is past the end of the input.
is_end(&self) -> bool29     pub fn is_end(&self) -> bool {
30         self.c.is_none() && self.byte.is_none()
31     }
32 
33     /// Returns the character at this position.
34     ///
35     /// If this position is just before or after the input, then an absent
36     /// character is returned.
char(&self) -> Char37     pub fn char(&self) -> Char {
38         self.c
39     }
40 
41     /// Returns the byte at this position.
byte(&self) -> Option<u8>42     pub fn byte(&self) -> Option<u8> {
43         self.byte
44     }
45 
46     /// Returns the UTF-8 width of the character at this position.
len(&self) -> usize47     pub fn len(&self) -> usize {
48         self.len
49     }
50 
51     /// Returns whether the UTF-8 width of the character at this position
52     /// is zero.
is_empty(&self) -> bool53     pub fn is_empty(&self) -> bool {
54         self.len == 0
55     }
56 
57     /// Returns the byte offset of this position.
pos(&self) -> usize58     pub fn pos(&self) -> usize {
59         self.pos
60     }
61 
62     /// Returns the byte offset of the next position in the input.
next_pos(&self) -> usize63     pub fn next_pos(&self) -> usize {
64         self.pos + self.len
65     }
66 }
67 
68 /// An abstraction over input used in the matching engines.
69 pub trait Input: fmt::Debug {
70     /// Return an encoding of the position at byte offset `i`.
at(&self, i: usize) -> InputAt71     fn at(&self, i: usize) -> InputAt;
72 
73     /// Return the Unicode character occurring next to `at`.
74     ///
75     /// If no such character could be decoded, then `Char` is absent.
next_char(&self, at: InputAt) -> Char76     fn next_char(&self, at: InputAt) -> Char;
77 
78     /// Return the Unicode character occurring previous to `at`.
79     ///
80     /// If no such character could be decoded, then `Char` is absent.
previous_char(&self, at: InputAt) -> Char81     fn previous_char(&self, at: InputAt) -> Char;
82 
83     /// Return true if the given empty width instruction matches at the
84     /// input position given.
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool85     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
86 
87     /// Scan the input for a matching prefix.
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>88     fn prefix_at(
89         &self,
90         prefixes: &LiteralSearcher,
91         at: InputAt,
92     ) -> Option<InputAt>;
93 
94     /// The number of bytes in the input.
len(&self) -> usize95     fn len(&self) -> usize;
96 
97     /// Whether the input is empty.
is_empty(&self) -> bool98     fn is_empty(&self) -> bool {
99         self.len() == 0
100     }
101 
102     /// Return the given input as a sequence of bytes.
as_bytes(&self) -> &[u8]103     fn as_bytes(&self) -> &[u8];
104 }
105 
106 impl<'a, T: Input> Input for &'a T {
at(&self, i: usize) -> InputAt107     fn at(&self, i: usize) -> InputAt {
108         (**self).at(i)
109     }
110 
next_char(&self, at: InputAt) -> Char111     fn next_char(&self, at: InputAt) -> Char {
112         (**self).next_char(at)
113     }
114 
previous_char(&self, at: InputAt) -> Char115     fn previous_char(&self, at: InputAt) -> Char {
116         (**self).previous_char(at)
117     }
118 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool119     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
120         (**self).is_empty_match(at, empty)
121     }
122 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>123     fn prefix_at(
124         &self,
125         prefixes: &LiteralSearcher,
126         at: InputAt,
127     ) -> Option<InputAt> {
128         (**self).prefix_at(prefixes, at)
129     }
130 
len(&self) -> usize131     fn len(&self) -> usize {
132         (**self).len()
133     }
134 
as_bytes(&self) -> &[u8]135     fn as_bytes(&self) -> &[u8] {
136         (**self).as_bytes()
137     }
138 }
139 
140 /// An input reader over characters.
141 #[derive(Clone, Copy, Debug)]
142 pub struct CharInput<'t>(&'t [u8]);
143 
144 impl<'t> CharInput<'t> {
145     /// Return a new character input reader for the given string.
new(s: &'t [u8]) -> CharInput<'t>146     pub fn new(s: &'t [u8]) -> CharInput<'t> {
147         CharInput(s)
148     }
149 }
150 
151 impl<'t> ops::Deref for CharInput<'t> {
152     type Target = [u8];
153 
deref(&self) -> &[u8]154     fn deref(&self) -> &[u8] {
155         self.0
156     }
157 }
158 
159 impl<'t> Input for CharInput<'t> {
at(&self, i: usize) -> InputAt160     fn at(&self, i: usize) -> InputAt {
161         if i >= self.len() {
162             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
163         } else {
164             let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
165             InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
166         }
167     }
168 
next_char(&self, at: InputAt) -> Char169     fn next_char(&self, at: InputAt) -> Char {
170         at.char()
171     }
172 
previous_char(&self, at: InputAt) -> Char173     fn previous_char(&self, at: InputAt) -> Char {
174         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
175     }
176 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool177     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
178         use prog::EmptyLook::*;
179         match empty.look {
180             StartLine => {
181                 let c = self.previous_char(at);
182                 at.pos() == 0 || c == '\n'
183             }
184             EndLine => {
185                 let c = self.next_char(at);
186                 at.pos() == self.len() || c == '\n'
187             }
188             StartText => at.pos() == 0,
189             EndText => at.pos() == self.len(),
190             WordBoundary => {
191                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
192                 c1.is_word_char() != c2.is_word_char()
193             }
194             NotWordBoundary => {
195                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
196                 c1.is_word_char() == c2.is_word_char()
197             }
198             WordBoundaryAscii => {
199                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
200                 c1.is_word_byte() != c2.is_word_byte()
201             }
202             NotWordBoundaryAscii => {
203                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
204                 c1.is_word_byte() == c2.is_word_byte()
205             }
206         }
207     }
208 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>209     fn prefix_at(
210         &self,
211         prefixes: &LiteralSearcher,
212         at: InputAt,
213     ) -> Option<InputAt> {
214         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
215     }
216 
len(&self) -> usize217     fn len(&self) -> usize {
218         self.0.len()
219     }
220 
as_bytes(&self) -> &[u8]221     fn as_bytes(&self) -> &[u8] {
222         self.0
223     }
224 }
225 
226 /// An input reader over bytes.
227 #[derive(Clone, Copy, Debug)]
228 pub struct ByteInput<'t> {
229     text: &'t [u8],
230     only_utf8: bool,
231 }
232 
233 impl<'t> ByteInput<'t> {
234     /// Return a new byte-based input reader for the given string.
new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t>235     pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
236         ByteInput { text: text, only_utf8: only_utf8 }
237     }
238 }
239 
240 impl<'t> ops::Deref for ByteInput<'t> {
241     type Target = [u8];
242 
deref(&self) -> &[u8]243     fn deref(&self) -> &[u8] {
244         self.text
245     }
246 }
247 
248 impl<'t> Input for ByteInput<'t> {
at(&self, i: usize) -> InputAt249     fn at(&self, i: usize) -> InputAt {
250         if i >= self.len() {
251             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
252         } else {
253             InputAt {
254                 pos: i,
255                 c: None.into(),
256                 byte: self.get(i).cloned(),
257                 len: 1,
258             }
259         }
260     }
261 
next_char(&self, at: InputAt) -> Char262     fn next_char(&self, at: InputAt) -> Char {
263         decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
264     }
265 
previous_char(&self, at: InputAt) -> Char266     fn previous_char(&self, at: InputAt) -> Char {
267         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
268     }
269 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool270     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
271         use prog::EmptyLook::*;
272         match empty.look {
273             StartLine => {
274                 let c = self.previous_char(at);
275                 at.pos() == 0 || c == '\n'
276             }
277             EndLine => {
278                 let c = self.next_char(at);
279                 at.pos() == self.len() || c == '\n'
280             }
281             StartText => at.pos() == 0,
282             EndText => at.pos() == self.len(),
283             WordBoundary => {
284                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
285                 c1.is_word_char() != c2.is_word_char()
286             }
287             NotWordBoundary => {
288                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
289                 c1.is_word_char() == c2.is_word_char()
290             }
291             WordBoundaryAscii => {
292                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
293                 if self.only_utf8 {
294                     // If we must match UTF-8, then we can't match word
295                     // boundaries at invalid UTF-8.
296                     if c1.is_none() && !at.is_start() {
297                         return false;
298                     }
299                     if c2.is_none() && !at.is_end() {
300                         return false;
301                     }
302                 }
303                 c1.is_word_byte() != c2.is_word_byte()
304             }
305             NotWordBoundaryAscii => {
306                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
307                 if self.only_utf8 {
308                     // If we must match UTF-8, then we can't match word
309                     // boundaries at invalid UTF-8.
310                     if c1.is_none() && !at.is_start() {
311                         return false;
312                     }
313                     if c2.is_none() && !at.is_end() {
314                         return false;
315                     }
316                 }
317                 c1.is_word_byte() == c2.is_word_byte()
318             }
319         }
320     }
321 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>322     fn prefix_at(
323         &self,
324         prefixes: &LiteralSearcher,
325         at: InputAt,
326     ) -> Option<InputAt> {
327         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
328     }
329 
len(&self) -> usize330     fn len(&self) -> usize {
331         self.text.len()
332     }
333 
as_bytes(&self) -> &[u8]334     fn as_bytes(&self) -> &[u8] {
335         self.text
336     }
337 }
338 
339 /// An inline representation of `Option<char>`.
340 ///
341 /// This eliminates the need to do case analysis on `Option<char>` to determine
342 /// ordinality with other characters.
343 ///
344 /// (The `Option<char>` is not related to encoding. Instead, it is used in the
345 /// matching engines to represent the beginning and ending boundaries of the
346 /// search text.)
347 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
348 pub struct Char(u32);
349 
350 impl fmt::Debug for Char {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result351     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
352         match char::from_u32(self.0) {
353             None => write!(f, "Empty"),
354             Some(c) => write!(f, "{:?}", c),
355         }
356     }
357 }
358 
359 impl Char {
360     /// Returns true iff the character is absent.
361     #[inline]
is_none(self) -> bool362     pub fn is_none(self) -> bool {
363         self.0 == u32::MAX
364     }
365 
366     /// Returns the length of the character's UTF-8 encoding.
367     ///
368     /// If the character is absent, then `1` is returned.
369     #[inline]
len_utf8(self) -> usize370     pub fn len_utf8(self) -> usize {
371         char::from_u32(self.0).map_or(1, |c| c.len_utf8())
372     }
373 
374     /// Returns true iff the character is a word character.
375     ///
376     /// If the character is absent, then false is returned.
is_word_char(self) -> bool377     pub fn is_word_char(self) -> bool {
378         // is_word_character can panic if the Unicode data for \w isn't
379         // available. However, our compiler ensures that if a Unicode word
380         // boundary is used, then the data must also be available. If it isn't,
381         // then the compiler returns an error.
382         char::from_u32(self.0).map_or(false, syntax::is_word_character)
383     }
384 
385     /// Returns true iff the byte is a word byte.
386     ///
387     /// If the byte is absent, then false is returned.
is_word_byte(self) -> bool388     pub fn is_word_byte(self) -> bool {
389         match char::from_u32(self.0) {
390             Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8),
391             None | Some(_) => false,
392         }
393     }
394 }
395 
396 impl From<char> for Char {
from(c: char) -> Char397     fn from(c: char) -> Char {
398         Char(c as u32)
399     }
400 }
401 
402 impl From<Option<char>> for Char {
from(c: Option<char>) -> Char403     fn from(c: Option<char>) -> Char {
404         c.map_or(Char(u32::MAX), |c| c.into())
405     }
406 }
407 
408 impl PartialEq<char> for Char {
409     #[inline]
eq(&self, other: &char) -> bool410     fn eq(&self, other: &char) -> bool {
411         self.0 == *other as u32
412     }
413 }
414 
415 impl PartialEq<Char> for char {
416     #[inline]
eq(&self, other: &Char) -> bool417     fn eq(&self, other: &Char) -> bool {
418         *self as u32 == other.0
419     }
420 }
421 
422 impl PartialOrd<char> for Char {
423     #[inline]
partial_cmp(&self, other: &char) -> Option<Ordering>424     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
425         self.0.partial_cmp(&(*other as u32))
426     }
427 }
428 
429 impl PartialOrd<Char> for char {
430     #[inline]
partial_cmp(&self, other: &Char) -> Option<Ordering>431     fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
432         (*self as u32).partial_cmp(&other.0)
433     }
434 }
435