1 use std::char;
2 use std::cmp::Ordering;
3 use std::fmt;
4 use std::ops;
5 use std::u32;
6 
7 use crate::literal::LiteralSearcher;
8 use crate::prog::InstEmptyLook;
9 use crate::utf8::{decode_last_utf8, decode_utf8};
10 
11 /// Represents a location in the input.
12 #[derive(Clone, Copy, Debug)]
13 pub struct InputAt {
14     pos: usize,
15     c: Char,
16     byte: Option<u8>,
17     len: usize,
18 }
19 
20 impl InputAt {
21     /// Returns true iff this position is at the beginning of the input.
is_start(&self) -> bool22     pub fn is_start(&self) -> bool {
23         self.pos == 0
24     }
25 
26     /// Returns true iff this position is past the end of the input.
is_end(&self) -> bool27     pub fn is_end(&self) -> bool {
28         self.c.is_none() && self.byte.is_none()
29     }
30 
31     /// Returns the character at this position.
32     ///
33     /// If this position is just before or after the input, then an absent
34     /// character is returned.
char(&self) -> Char35     pub fn char(&self) -> Char {
36         self.c
37     }
38 
39     /// Returns the byte at this position.
byte(&self) -> Option<u8>40     pub fn byte(&self) -> Option<u8> {
41         self.byte
42     }
43 
44     /// Returns the UTF-8 width of the character at this position.
len(&self) -> usize45     pub fn len(&self) -> usize {
46         self.len
47     }
48 
49     /// Returns whether the UTF-8 width of the character at this position
50     /// is zero.
is_empty(&self) -> bool51     pub fn is_empty(&self) -> bool {
52         self.len == 0
53     }
54 
55     /// Returns the byte offset of this position.
pos(&self) -> usize56     pub fn pos(&self) -> usize {
57         self.pos
58     }
59 
60     /// Returns the byte offset of the next position in the input.
next_pos(&self) -> usize61     pub fn next_pos(&self) -> usize {
62         self.pos + self.len
63     }
64 }
65 
66 /// An abstraction over input used in the matching engines.
67 pub trait Input: fmt::Debug {
68     /// Return an encoding of the position at byte offset `i`.
at(&self, i: usize) -> InputAt69     fn at(&self, i: usize) -> InputAt;
70 
71     /// Return the Unicode character occurring next to `at`.
72     ///
73     /// If no such character could be decoded, then `Char` is absent.
next_char(&self, at: InputAt) -> Char74     fn next_char(&self, at: InputAt) -> Char;
75 
76     /// Return the Unicode character occurring previous to `at`.
77     ///
78     /// If no such character could be decoded, then `Char` is absent.
previous_char(&self, at: InputAt) -> Char79     fn previous_char(&self, at: InputAt) -> Char;
80 
81     /// Return true if the given empty width instruction matches at the
82     /// input position given.
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool83     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
84 
85     /// Scan the input for a matching prefix.
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>86     fn prefix_at(
87         &self,
88         prefixes: &LiteralSearcher,
89         at: InputAt,
90     ) -> Option<InputAt>;
91 
92     /// The number of bytes in the input.
len(&self) -> usize93     fn len(&self) -> usize;
94 
95     /// Whether the input is empty.
is_empty(&self) -> bool96     fn is_empty(&self) -> bool {
97         self.len() == 0
98     }
99 
100     /// Return the given input as a sequence of bytes.
as_bytes(&self) -> &[u8]101     fn as_bytes(&self) -> &[u8];
102 }
103 
104 impl<'a, T: Input> Input for &'a T {
at(&self, i: usize) -> InputAt105     fn at(&self, i: usize) -> InputAt {
106         (**self).at(i)
107     }
108 
next_char(&self, at: InputAt) -> Char109     fn next_char(&self, at: InputAt) -> Char {
110         (**self).next_char(at)
111     }
112 
previous_char(&self, at: InputAt) -> Char113     fn previous_char(&self, at: InputAt) -> Char {
114         (**self).previous_char(at)
115     }
116 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool117     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
118         (**self).is_empty_match(at, empty)
119     }
120 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>121     fn prefix_at(
122         &self,
123         prefixes: &LiteralSearcher,
124         at: InputAt,
125     ) -> Option<InputAt> {
126         (**self).prefix_at(prefixes, at)
127     }
128 
len(&self) -> usize129     fn len(&self) -> usize {
130         (**self).len()
131     }
132 
as_bytes(&self) -> &[u8]133     fn as_bytes(&self) -> &[u8] {
134         (**self).as_bytes()
135     }
136 }
137 
138 /// An input reader over characters.
139 #[derive(Clone, Copy, Debug)]
140 pub struct CharInput<'t>(&'t [u8]);
141 
142 impl<'t> CharInput<'t> {
143     /// Return a new character input reader for the given string.
new(s: &'t [u8]) -> CharInput<'t>144     pub fn new(s: &'t [u8]) -> CharInput<'t> {
145         CharInput(s)
146     }
147 }
148 
149 impl<'t> ops::Deref for CharInput<'t> {
150     type Target = [u8];
151 
deref(&self) -> &[u8]152     fn deref(&self) -> &[u8] {
153         self.0
154     }
155 }
156 
157 impl<'t> Input for CharInput<'t> {
at(&self, i: usize) -> InputAt158     fn at(&self, i: usize) -> InputAt {
159         if i >= self.len() {
160             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
161         } else {
162             let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
163             InputAt { pos: i, c: c, byte: None, len: c.len_utf8() }
164         }
165     }
166 
next_char(&self, at: InputAt) -> Char167     fn next_char(&self, at: InputAt) -> Char {
168         at.char()
169     }
170 
previous_char(&self, at: InputAt) -> Char171     fn previous_char(&self, at: InputAt) -> Char {
172         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
173     }
174 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool175     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
176         use crate::prog::EmptyLook::*;
177         match empty.look {
178             StartLine => {
179                 let c = self.previous_char(at);
180                 at.pos() == 0 || c == '\n'
181             }
182             EndLine => {
183                 let c = self.next_char(at);
184                 at.pos() == self.len() || c == '\n'
185             }
186             StartText => at.pos() == 0,
187             EndText => at.pos() == self.len(),
188             WordBoundary => {
189                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
190                 c1.is_word_char() != c2.is_word_char()
191             }
192             NotWordBoundary => {
193                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
194                 c1.is_word_char() == c2.is_word_char()
195             }
196             WordBoundaryAscii => {
197                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
198                 c1.is_word_byte() != c2.is_word_byte()
199             }
200             NotWordBoundaryAscii => {
201                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
202                 c1.is_word_byte() == c2.is_word_byte()
203             }
204         }
205     }
206 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>207     fn prefix_at(
208         &self,
209         prefixes: &LiteralSearcher,
210         at: InputAt,
211     ) -> Option<InputAt> {
212         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
213     }
214 
len(&self) -> usize215     fn len(&self) -> usize {
216         self.0.len()
217     }
218 
as_bytes(&self) -> &[u8]219     fn as_bytes(&self) -> &[u8] {
220         self.0
221     }
222 }
223 
224 /// An input reader over bytes.
225 #[derive(Clone, Copy, Debug)]
226 pub struct ByteInput<'t> {
227     text: &'t [u8],
228     only_utf8: bool,
229 }
230 
231 impl<'t> ByteInput<'t> {
232     /// Return a new byte-based input reader for the given string.
new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t>233     pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
234         ByteInput { text: text, only_utf8: only_utf8 }
235     }
236 }
237 
238 impl<'t> ops::Deref for ByteInput<'t> {
239     type Target = [u8];
240 
deref(&self) -> &[u8]241     fn deref(&self) -> &[u8] {
242         self.text
243     }
244 }
245 
246 impl<'t> Input for ByteInput<'t> {
at(&self, i: usize) -> InputAt247     fn at(&self, i: usize) -> InputAt {
248         if i >= self.len() {
249             InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
250         } else {
251             InputAt {
252                 pos: i,
253                 c: None.into(),
254                 byte: self.get(i).cloned(),
255                 len: 1,
256             }
257         }
258     }
259 
next_char(&self, at: InputAt) -> Char260     fn next_char(&self, at: InputAt) -> Char {
261         decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
262     }
263 
previous_char(&self, at: InputAt) -> Char264     fn previous_char(&self, at: InputAt) -> Char {
265         decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
266     }
267 
is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool268     fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
269         use crate::prog::EmptyLook::*;
270         match empty.look {
271             StartLine => {
272                 let c = self.previous_char(at);
273                 at.pos() == 0 || c == '\n'
274             }
275             EndLine => {
276                 let c = self.next_char(at);
277                 at.pos() == self.len() || c == '\n'
278             }
279             StartText => at.pos() == 0,
280             EndText => at.pos() == self.len(),
281             WordBoundary => {
282                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
283                 c1.is_word_char() != c2.is_word_char()
284             }
285             NotWordBoundary => {
286                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
287                 c1.is_word_char() == c2.is_word_char()
288             }
289             WordBoundaryAscii => {
290                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
291                 if self.only_utf8 {
292                     // If we must match UTF-8, then we can't match word
293                     // boundaries at invalid UTF-8.
294                     if c1.is_none() && !at.is_start() {
295                         return false;
296                     }
297                     if c2.is_none() && !at.is_end() {
298                         return false;
299                     }
300                 }
301                 c1.is_word_byte() != c2.is_word_byte()
302             }
303             NotWordBoundaryAscii => {
304                 let (c1, c2) = (self.previous_char(at), self.next_char(at));
305                 if self.only_utf8 {
306                     // If we must match UTF-8, then we can't match word
307                     // boundaries at invalid UTF-8.
308                     if c1.is_none() && !at.is_start() {
309                         return false;
310                     }
311                     if c2.is_none() && !at.is_end() {
312                         return false;
313                     }
314                 }
315                 c1.is_word_byte() == c2.is_word_byte()
316             }
317         }
318     }
319 
prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>320     fn prefix_at(
321         &self,
322         prefixes: &LiteralSearcher,
323         at: InputAt,
324     ) -> Option<InputAt> {
325         prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
326     }
327 
len(&self) -> usize328     fn len(&self) -> usize {
329         self.text.len()
330     }
331 
as_bytes(&self) -> &[u8]332     fn as_bytes(&self) -> &[u8] {
333         self.text
334     }
335 }
336 
337 /// An inline representation of `Option<char>`.
338 ///
339 /// This eliminates the need to do case analysis on `Option<char>` to determine
340 /// ordinality with other characters.
341 ///
342 /// (The `Option<char>` is not related to encoding. Instead, it is used in the
343 /// matching engines to represent the beginning and ending boundaries of the
344 /// search text.)
345 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
346 pub struct Char(u32);
347 
348 impl fmt::Debug for Char {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result349     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350         match char::from_u32(self.0) {
351             None => write!(f, "Empty"),
352             Some(c) => write!(f, "{:?}", c),
353         }
354     }
355 }
356 
357 impl Char {
358     /// Returns true iff the character is absent.
359     #[inline]
is_none(self) -> bool360     pub fn is_none(self) -> bool {
361         self.0 == u32::MAX
362     }
363 
364     /// Returns the length of the character's UTF-8 encoding.
365     ///
366     /// If the character is absent, then `1` is returned.
367     #[inline]
len_utf8(self) -> usize368     pub fn len_utf8(self) -> usize {
369         char::from_u32(self.0).map_or(1, |c| c.len_utf8())
370     }
371 
372     /// Returns true iff the character is a word character.
373     ///
374     /// If the character is absent, then false is returned.
is_word_char(self) -> bool375     pub fn is_word_char(self) -> bool {
376         // is_word_character can panic if the Unicode data for \w isn't
377         // available. However, our compiler ensures that if a Unicode word
378         // boundary is used, then the data must also be available. If it isn't,
379         // then the compiler returns an error.
380         char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
381     }
382 
383     /// Returns true iff the byte is a word byte.
384     ///
385     /// If the byte is absent, then false is returned.
is_word_byte(self) -> bool386     pub fn is_word_byte(self) -> bool {
387         match char::from_u32(self.0) {
388             Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
389             None | Some(_) => false,
390         }
391     }
392 }
393 
394 impl From<char> for Char {
from(c: char) -> Char395     fn from(c: char) -> Char {
396         Char(c as u32)
397     }
398 }
399 
400 impl From<Option<char>> for Char {
from(c: Option<char>) -> Char401     fn from(c: Option<char>) -> Char {
402         c.map_or(Char(u32::MAX), |c| c.into())
403     }
404 }
405 
406 impl PartialEq<char> for Char {
407     #[inline]
eq(&self, other: &char) -> bool408     fn eq(&self, other: &char) -> bool {
409         self.0 == *other as u32
410     }
411 }
412 
413 impl PartialEq<Char> for char {
414     #[inline]
eq(&self, other: &Char) -> bool415     fn eq(&self, other: &Char) -> bool {
416         *self as u32 == other.0
417     }
418 }
419 
420 impl PartialOrd<char> for Char {
421     #[inline]
partial_cmp(&self, other: &char) -> Option<Ordering>422     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
423         self.0.partial_cmp(&(*other as u32))
424     }
425 }
426 
427 impl PartialOrd<Char> for char {
428     #[inline]
partial_cmp(&self, other: &Char) -> Option<Ordering>429     fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
430         (*self as u32).partial_cmp(&other.0)
431     }
432 }
433