1 use std::char; 2 use std::cmp::Ordering; 3 use std::fmt; 4 use std::ops; 5 use std::u32; 6 7 use syntax; 8 9 use literal::LiteralSearcher; 10 use prog::InstEmptyLook; 11 use utf8::{decode_last_utf8, decode_utf8}; 12 13 /// Represents a location in the input. 14 #[derive(Clone, Copy, Debug)] 15 pub struct InputAt { 16 pos: usize, 17 c: Char, 18 byte: Option<u8>, 19 len: usize, 20 } 21 22 impl InputAt { 23 /// Returns true iff this position is at the beginning of the input. is_start(&self) -> bool24 pub fn is_start(&self) -> bool { 25 self.pos == 0 26 } 27 28 /// Returns true iff this position is past the end of the input. is_end(&self) -> bool29 pub fn is_end(&self) -> bool { 30 self.c.is_none() && self.byte.is_none() 31 } 32 33 /// Returns the character at this position. 34 /// 35 /// If this position is just before or after the input, then an absent 36 /// character is returned. char(&self) -> Char37 pub fn char(&self) -> Char { 38 self.c 39 } 40 41 /// Returns the byte at this position. byte(&self) -> Option<u8>42 pub fn byte(&self) -> Option<u8> { 43 self.byte 44 } 45 46 /// Returns the UTF-8 width of the character at this position. len(&self) -> usize47 pub fn len(&self) -> usize { 48 self.len 49 } 50 51 /// Returns whether the UTF-8 width of the character at this position 52 /// is zero. is_empty(&self) -> bool53 pub fn is_empty(&self) -> bool { 54 self.len == 0 55 } 56 57 /// Returns the byte offset of this position. pos(&self) -> usize58 pub fn pos(&self) -> usize { 59 self.pos 60 } 61 62 /// Returns the byte offset of the next position in the input. next_pos(&self) -> usize63 pub fn next_pos(&self) -> usize { 64 self.pos + self.len 65 } 66 } 67 68 /// An abstraction over input used in the matching engines. 69 pub trait Input: fmt::Debug { 70 /// Return an encoding of the position at byte offset `i`. at(&self, i: usize) -> InputAt71 fn at(&self, i: usize) -> InputAt; 72 73 /// Return the Unicode character occurring next to `at`. 74 /// 75 /// If no such character could be decoded, then `Char` is absent. next_char(&self, at: InputAt) -> Char76 fn next_char(&self, at: InputAt) -> Char; 77 78 /// Return the Unicode character occurring previous to `at`. 79 /// 80 /// If no such character could be decoded, then `Char` is absent. previous_char(&self, at: InputAt) -> Char81 fn previous_char(&self, at: InputAt) -> Char; 82 83 /// Return true if the given empty width instruction matches at the 84 /// input position given. is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool85 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; 86 87 /// Scan the input for a matching prefix. prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>88 fn prefix_at( 89 &self, 90 prefixes: &LiteralSearcher, 91 at: InputAt, 92 ) -> Option<InputAt>; 93 94 /// The number of bytes in the input. len(&self) -> usize95 fn len(&self) -> usize; 96 97 /// Whether the input is empty. is_empty(&self) -> bool98 fn is_empty(&self) -> bool { 99 self.len() == 0 100 } 101 102 /// Return the given input as a sequence of bytes. as_bytes(&self) -> &[u8]103 fn as_bytes(&self) -> &[u8]; 104 } 105 106 impl<'a, T: Input> Input for &'a T { at(&self, i: usize) -> InputAt107 fn at(&self, i: usize) -> InputAt { 108 (**self).at(i) 109 } 110 next_char(&self, at: InputAt) -> Char111 fn next_char(&self, at: InputAt) -> Char { 112 (**self).next_char(at) 113 } 114 previous_char(&self, at: InputAt) -> Char115 fn previous_char(&self, at: InputAt) -> Char { 116 (**self).previous_char(at) 117 } 118 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool119 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 120 (**self).is_empty_match(at, empty) 121 } 122 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>123 fn prefix_at( 124 &self, 125 prefixes: &LiteralSearcher, 126 at: InputAt, 127 ) -> Option<InputAt> { 128 (**self).prefix_at(prefixes, at) 129 } 130 len(&self) -> usize131 fn len(&self) -> usize { 132 (**self).len() 133 } 134 as_bytes(&self) -> &[u8]135 fn as_bytes(&self) -> &[u8] { 136 (**self).as_bytes() 137 } 138 } 139 140 /// An input reader over characters. 141 #[derive(Clone, Copy, Debug)] 142 pub struct CharInput<'t>(&'t [u8]); 143 144 impl<'t> CharInput<'t> { 145 /// Return a new character input reader for the given string. new(s: &'t [u8]) -> CharInput<'t>146 pub fn new(s: &'t [u8]) -> CharInput<'t> { 147 CharInput(s) 148 } 149 } 150 151 impl<'t> ops::Deref for CharInput<'t> { 152 type Target = [u8]; 153 deref(&self) -> &[u8]154 fn deref(&self) -> &[u8] { 155 self.0 156 } 157 } 158 159 impl<'t> Input for CharInput<'t> { at(&self, i: usize) -> InputAt160 fn at(&self, i: usize) -> InputAt { 161 if i >= self.len() { 162 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 163 } else { 164 let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); 165 InputAt { pos: i, c: c, byte: None, len: c.len_utf8() } 166 } 167 } 168 next_char(&self, at: InputAt) -> Char169 fn next_char(&self, at: InputAt) -> Char { 170 at.char() 171 } 172 previous_char(&self, at: InputAt) -> Char173 fn previous_char(&self, at: InputAt) -> Char { 174 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 175 } 176 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool177 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 178 use prog::EmptyLook::*; 179 match empty.look { 180 StartLine => { 181 let c = self.previous_char(at); 182 at.pos() == 0 || c == '\n' 183 } 184 EndLine => { 185 let c = self.next_char(at); 186 at.pos() == self.len() || c == '\n' 187 } 188 StartText => at.pos() == 0, 189 EndText => at.pos() == self.len(), 190 WordBoundary => { 191 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 192 c1.is_word_char() != c2.is_word_char() 193 } 194 NotWordBoundary => { 195 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 196 c1.is_word_char() == c2.is_word_char() 197 } 198 WordBoundaryAscii => { 199 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 200 c1.is_word_byte() != c2.is_word_byte() 201 } 202 NotWordBoundaryAscii => { 203 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 204 c1.is_word_byte() == c2.is_word_byte() 205 } 206 } 207 } 208 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>209 fn prefix_at( 210 &self, 211 prefixes: &LiteralSearcher, 212 at: InputAt, 213 ) -> Option<InputAt> { 214 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 215 } 216 len(&self) -> usize217 fn len(&self) -> usize { 218 self.0.len() 219 } 220 as_bytes(&self) -> &[u8]221 fn as_bytes(&self) -> &[u8] { 222 self.0 223 } 224 } 225 226 /// An input reader over bytes. 227 #[derive(Clone, Copy, Debug)] 228 pub struct ByteInput<'t> { 229 text: &'t [u8], 230 only_utf8: bool, 231 } 232 233 impl<'t> ByteInput<'t> { 234 /// Return a new byte-based input reader for the given string. new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t>235 pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { 236 ByteInput { text: text, only_utf8: only_utf8 } 237 } 238 } 239 240 impl<'t> ops::Deref for ByteInput<'t> { 241 type Target = [u8]; 242 deref(&self) -> &[u8]243 fn deref(&self) -> &[u8] { 244 self.text 245 } 246 } 247 248 impl<'t> Input for ByteInput<'t> { at(&self, i: usize) -> InputAt249 fn at(&self, i: usize) -> InputAt { 250 if i >= self.len() { 251 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 252 } else { 253 InputAt { 254 pos: i, 255 c: None.into(), 256 byte: self.get(i).cloned(), 257 len: 1, 258 } 259 } 260 } 261 next_char(&self, at: InputAt) -> Char262 fn next_char(&self, at: InputAt) -> Char { 263 decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() 264 } 265 previous_char(&self, at: InputAt) -> Char266 fn previous_char(&self, at: InputAt) -> Char { 267 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 268 } 269 is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool270 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 271 use prog::EmptyLook::*; 272 match empty.look { 273 StartLine => { 274 let c = self.previous_char(at); 275 at.pos() == 0 || c == '\n' 276 } 277 EndLine => { 278 let c = self.next_char(at); 279 at.pos() == self.len() || c == '\n' 280 } 281 StartText => at.pos() == 0, 282 EndText => at.pos() == self.len(), 283 WordBoundary => { 284 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 285 c1.is_word_char() != c2.is_word_char() 286 } 287 NotWordBoundary => { 288 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 289 c1.is_word_char() == c2.is_word_char() 290 } 291 WordBoundaryAscii => { 292 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 293 if self.only_utf8 { 294 // If we must match UTF-8, then we can't match word 295 // boundaries at invalid UTF-8. 296 if c1.is_none() && !at.is_start() { 297 return false; 298 } 299 if c2.is_none() && !at.is_end() { 300 return false; 301 } 302 } 303 c1.is_word_byte() != c2.is_word_byte() 304 } 305 NotWordBoundaryAscii => { 306 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 307 if self.only_utf8 { 308 // If we must match UTF-8, then we can't match word 309 // boundaries at invalid UTF-8. 310 if c1.is_none() && !at.is_start() { 311 return false; 312 } 313 if c2.is_none() && !at.is_end() { 314 return false; 315 } 316 } 317 c1.is_word_byte() == c2.is_word_byte() 318 } 319 } 320 } 321 prefix_at( &self, prefixes: &LiteralSearcher, at: InputAt, ) -> Option<InputAt>322 fn prefix_at( 323 &self, 324 prefixes: &LiteralSearcher, 325 at: InputAt, 326 ) -> Option<InputAt> { 327 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 328 } 329 len(&self) -> usize330 fn len(&self) -> usize { 331 self.text.len() 332 } 333 as_bytes(&self) -> &[u8]334 fn as_bytes(&self) -> &[u8] { 335 self.text 336 } 337 } 338 339 /// An inline representation of `Option<char>`. 340 /// 341 /// This eliminates the need to do case analysis on `Option<char>` to determine 342 /// ordinality with other characters. 343 /// 344 /// (The `Option<char>` is not related to encoding. Instead, it is used in the 345 /// matching engines to represent the beginning and ending boundaries of the 346 /// search text.) 347 #[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] 348 pub struct Char(u32); 349 350 impl fmt::Debug for Char { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result351 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 352 match char::from_u32(self.0) { 353 None => write!(f, "Empty"), 354 Some(c) => write!(f, "{:?}", c), 355 } 356 } 357 } 358 359 impl Char { 360 /// Returns true iff the character is absent. 361 #[inline] is_none(self) -> bool362 pub fn is_none(self) -> bool { 363 self.0 == u32::MAX 364 } 365 366 /// Returns the length of the character's UTF-8 encoding. 367 /// 368 /// If the character is absent, then `1` is returned. 369 #[inline] len_utf8(self) -> usize370 pub fn len_utf8(self) -> usize { 371 char::from_u32(self.0).map_or(1, |c| c.len_utf8()) 372 } 373 374 /// Returns true iff the character is a word character. 375 /// 376 /// If the character is absent, then false is returned. is_word_char(self) -> bool377 pub fn is_word_char(self) -> bool { 378 // is_word_character can panic if the Unicode data for \w isn't 379 // available. However, our compiler ensures that if a Unicode word 380 // boundary is used, then the data must also be available. If it isn't, 381 // then the compiler returns an error. 382 char::from_u32(self.0).map_or(false, syntax::is_word_character) 383 } 384 385 /// Returns true iff the byte is a word byte. 386 /// 387 /// If the byte is absent, then false is returned. is_word_byte(self) -> bool388 pub fn is_word_byte(self) -> bool { 389 match char::from_u32(self.0) { 390 Some(c) if c <= '\u{7F}' => syntax::is_word_byte(c as u8), 391 None | Some(_) => false, 392 } 393 } 394 } 395 396 impl From<char> for Char { from(c: char) -> Char397 fn from(c: char) -> Char { 398 Char(c as u32) 399 } 400 } 401 402 impl From<Option<char>> for Char { from(c: Option<char>) -> Char403 fn from(c: Option<char>) -> Char { 404 c.map_or(Char(u32::MAX), |c| c.into()) 405 } 406 } 407 408 impl PartialEq<char> for Char { 409 #[inline] eq(&self, other: &char) -> bool410 fn eq(&self, other: &char) -> bool { 411 self.0 == *other as u32 412 } 413 } 414 415 impl PartialEq<Char> for char { 416 #[inline] eq(&self, other: &Char) -> bool417 fn eq(&self, other: &Char) -> bool { 418 *self as u32 == other.0 419 } 420 } 421 422 impl PartialOrd<char> for Char { 423 #[inline] partial_cmp(&self, other: &char) -> Option<Ordering>424 fn partial_cmp(&self, other: &char) -> Option<Ordering> { 425 self.0.partial_cmp(&(*other as u32)) 426 } 427 } 428 429 impl PartialOrd<Char> for char { 430 #[inline] partial_cmp(&self, other: &Char) -> Option<Ordering>431 fn partial_cmp(&self, other: &Char) -> Option<Ordering> { 432 (*self as u32).partial_cmp(&other.0) 433 } 434 } 435