1 #[cfg(feature = "std")]
2 use core::fmt;
3 #[cfg(feature = "std")]
4 use core::iter;
5 use core::marker::PhantomData;
6 use core::mem::size_of;
7 #[cfg(feature = "std")]
8 use std::collections::HashMap;
9 
10 #[cfg(feature = "std")]
11 use byteorder::{BigEndian, LittleEndian};
12 use byteorder::{ByteOrder, NativeEndian};
13 
14 use classes::ByteClasses;
15 use dense;
16 use dfa::DFA;
17 #[cfg(feature = "std")]
18 use error::{Error, Result};
19 #[cfg(feature = "std")]
20 use state_id::{dead_id, usize_to_state_id, write_state_id_bytes, StateID};
21 #[cfg(not(feature = "std"))]
22 use state_id::{dead_id, StateID};
23 
24 /// A sparse table-based deterministic finite automaton (DFA).
25 ///
26 /// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a
27 /// more space efficient representation for its transition table. Consequently,
28 /// sparse DFAs can use much less memory than dense DFAs, but this comes at a
29 /// price. In particular, reading the more space efficient transitions takes
30 /// more work, and consequently, searching using a sparse DFA is typically
31 /// slower than a dense DFA.
32 ///
33 /// A sparse DFA can be built using the default configuration via the
34 /// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise,
35 /// one can configure various aspects of a dense DFA via
36 /// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense
37 /// DFA to a sparse DFA using
38 /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse).
39 ///
40 /// In general, a sparse DFA supports all the same operations as a dense DFA.
41 ///
42 /// Making the choice between a dense and sparse DFA depends on your specific
43 /// work load. If you can sacrifice a bit of search time performance, then a
44 /// sparse DFA might be the best choice. In particular, while sparse DFAs are
45 /// probably always slower than dense DFAs, you may find that they are easily
46 /// fast enough for your purposes!
47 ///
48 /// # State size
49 ///
50 /// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to
51 /// the type of the DFA's transition table while `S` corresponds to the
52 /// representation used for the DFA's state identifiers as described by the
53 /// [`StateID`](trait.StateID.html) trait. This type parameter is typically
54 /// `usize`, but other valid choices provided by this crate include `u8`,
55 /// `u16`, `u32` and `u64`. The primary reason for choosing a different state
56 /// identifier representation than the default is to reduce the amount of
57 /// memory used by a DFA. Note though, that if the chosen representation cannot
58 /// accommodate the size of your DFA, then building the DFA will fail and
59 /// return an error.
60 ///
61 /// While the reduction in heap memory used by a DFA is one reason for choosing
62 /// a smaller state identifier representation, another possible reason is for
63 /// decreasing the serialization size of a DFA, as returned by
64 /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian),
65 /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian)
66 /// or
67 /// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian).
68 ///
69 /// The type of the transition table is typically either `Vec<u8>` or `&[u8]`,
70 /// depending on where the transition table is stored. Note that this is
71 /// different than a dense DFA, whose transition table is typically
72 /// `Vec<S>` or `&[S]`. The reason for this is that a sparse DFA always reads
73 /// its transition table from raw bytes because the table is compactly packed.
74 ///
75 /// # Variants
76 ///
77 /// This DFA is defined as a non-exhaustive enumeration of different types of
78 /// dense DFAs. All of the variants use the same internal representation
79 /// for the transition table, but they vary in how the transition table is
80 /// read. A DFA's specific variant depends on the configuration options set via
81 /// [`dense::Builder`](dense/struct.Builder.html). The default variant is
82 /// `ByteClass`.
83 ///
84 /// # The `DFA` trait
85 ///
86 /// This type implements the [`DFA`](trait.DFA.html) trait, which means it
87 /// can be used for searching. For example:
88 ///
89 /// ```
90 /// use regex_automata::{DFA, SparseDFA};
91 ///
92 /// # fn example() -> Result<(), regex_automata::Error> {
93 /// let dfa = SparseDFA::new("foo[0-9]+")?;
94 /// assert_eq!(Some(8), dfa.find(b"foo12345"));
95 /// # Ok(()) }; example().unwrap()
96 /// ```
97 ///
98 /// The `DFA` trait also provides an assortment of other lower level methods
99 /// for DFAs, such as `start_state` and `next_state`. While these are correctly
100 /// implemented, it is an anti-pattern to use them in performance sensitive
101 /// code on the `SparseDFA` type directly. Namely, each implementation requires
102 /// a branch to determine which type of sparse DFA is being used. Instead,
103 /// this branch should be pushed up a layer in the code since walking the
104 /// transitions of a DFA is usually a hot path. If you do need to use these
105 /// lower level methods in performance critical code, then you should match on
106 /// the variants of this DFA and use each variant's implementation of the `DFA`
107 /// trait directly.
108 #[derive(Clone, Debug)]
109 pub enum SparseDFA<T: AsRef<[u8]>, S: StateID = usize> {
110     /// A standard DFA that does not use byte classes.
111     Standard(Standard<T, S>),
112     /// A DFA that shrinks its alphabet to a set of equivalence classes instead
113     /// of using all possible byte values. Any two bytes belong to the same
114     /// equivalence class if and only if they can be used interchangeably
115     /// anywhere in the DFA while never discriminating between a match and a
116     /// non-match.
117     ///
118     /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much
119     /// from using byte classes. In some cases, using byte classes can even
120     /// marginally increase the size of a sparse DFA's transition table. The
121     /// reason for this is that a sparse DFA already compacts each state's
122     /// transitions separate from whether byte classes are used.
123     ByteClass(ByteClass<T, S>),
124     /// Hints that destructuring should not be exhaustive.
125     ///
126     /// This enum may grow additional variants, so this makes sure clients
127     /// don't count on exhaustive matching. (Otherwise, adding a new variant
128     /// could break existing code.)
129     #[doc(hidden)]
130     __Nonexhaustive,
131 }
132 
133 #[cfg(feature = "std")]
134 impl SparseDFA<Vec<u8>, usize> {
135     /// Parse the given regular expression using a default configuration and
136     /// return the corresponding sparse DFA.
137     ///
138     /// The default configuration uses `usize` for state IDs and reduces the
139     /// alphabet size by splitting bytes into equivalence classes. The
140     /// resulting DFA is *not* minimized.
141     ///
142     /// If you want a non-default configuration, then use the
143     /// [`dense::Builder`](dense/struct.Builder.html)
144     /// to set your own configuration, and then call
145     /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse)
146     /// to create a sparse DFA.
147     ///
148     /// # Example
149     ///
150     /// ```
151     /// use regex_automata::{DFA, SparseDFA};
152     ///
153     /// # fn example() -> Result<(), regex_automata::Error> {
154     /// let dfa = SparseDFA::new("foo[0-9]+bar")?;
155     /// assert_eq!(Some(11), dfa.find(b"foo12345bar"));
156     /// # Ok(()) }; example().unwrap()
157     /// ```
new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>>158     pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> {
159         dense::Builder::new()
160             .build(pattern)
161             .and_then(|dense| dense.to_sparse())
162     }
163 }
164 
165 #[cfg(feature = "std")]
166 impl<S: StateID> SparseDFA<Vec<u8>, S> {
167     /// Create a new empty sparse DFA that never matches any input.
168     ///
169     /// # Example
170     ///
171     /// In order to build an empty DFA, callers must provide a type hint
172     /// indicating their choice of state identifier representation.
173     ///
174     /// ```
175     /// use regex_automata::{DFA, SparseDFA};
176     ///
177     /// # fn example() -> Result<(), regex_automata::Error> {
178     /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty();
179     /// assert_eq!(None, dfa.find(b""));
180     /// assert_eq!(None, dfa.find(b"foo"));
181     /// # Ok(()) }; example().unwrap()
182     /// ```
empty() -> SparseDFA<Vec<u8>, S>183     pub fn empty() -> SparseDFA<Vec<u8>, S> {
184         dense::DenseDFA::empty().to_sparse().unwrap()
185     }
186 
from_dense_sized<T: AsRef<[S]>, A: StateID>( dfa: &dense::Repr<T, S>, ) -> Result<SparseDFA<Vec<u8>, A>>187     pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
188         dfa: &dense::Repr<T, S>,
189     ) -> Result<SparseDFA<Vec<u8>, A>> {
190         Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa())
191     }
192 }
193 
194 impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
195     /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
196     /// DFA returned always uses `&[u8]` for its transition table while keeping
197     /// the same state identifier representation.
as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S>198     pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> {
199         match *self {
200             SparseDFA::Standard(Standard(ref r)) => {
201                 SparseDFA::Standard(Standard(r.as_ref()))
202             }
203             SparseDFA::ByteClass(ByteClass(ref r)) => {
204                 SparseDFA::ByteClass(ByteClass(r.as_ref()))
205             }
206             SparseDFA::__Nonexhaustive => unreachable!(),
207         }
208     }
209 
210     /// Return an owned version of this sparse DFA. Specifically, the DFA
211     /// returned always uses `Vec<u8>` for its transition table while keeping
212     /// the same state identifier representation.
213     ///
214     /// Effectively, this returns a sparse DFA whose transition table lives
215     /// on the heap.
216     #[cfg(feature = "std")]
to_owned(&self) -> SparseDFA<Vec<u8>, S>217     pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> {
218         match *self {
219             SparseDFA::Standard(Standard(ref r)) => {
220                 SparseDFA::Standard(Standard(r.to_owned()))
221             }
222             SparseDFA::ByteClass(ByteClass(ref r)) => {
223                 SparseDFA::ByteClass(ByteClass(r.to_owned()))
224             }
225             SparseDFA::__Nonexhaustive => unreachable!(),
226         }
227     }
228 
229     /// Returns the memory usage, in bytes, of this DFA.
230     ///
231     /// The memory usage is computed based on the number of bytes used to
232     /// represent this DFA's transition table. This typically corresponds to
233     /// heap memory usage.
234     ///
235     /// This does **not** include the stack size used up by this DFA. To
236     /// compute that, used `std::mem::size_of::<SparseDFA>()`.
memory_usage(&self) -> usize237     pub fn memory_usage(&self) -> usize {
238         self.repr().memory_usage()
239     }
240 
repr(&self) -> &Repr<T, S>241     fn repr(&self) -> &Repr<T, S> {
242         match *self {
243             SparseDFA::Standard(ref r) => &r.0,
244             SparseDFA::ByteClass(ref r) => &r.0,
245             SparseDFA::__Nonexhaustive => unreachable!(),
246         }
247     }
248 }
249 
250 /// Routines for converting a sparse DFA to other representations, such as
251 /// smaller state identifiers or raw bytes suitable for persistent storage.
252 #[cfg(feature = "std")]
253 impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> {
254     /// Create a new sparse DFA whose match semantics are equivalent to
255     /// this DFA, but attempt to use `u8` for the representation of state
256     /// identifiers. If `u8` is insufficient to represent all state identifiers
257     /// in this DFA, then this returns an error.
258     ///
259     /// This is a convenience routine for `to_sized::<u8>()`.
to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>>260     pub fn to_u8(&self) -> Result<SparseDFA<Vec<u8>, u8>> {
261         self.to_sized()
262     }
263 
264     /// Create a new sparse DFA whose match semantics are equivalent to
265     /// this DFA, but attempt to use `u16` for the representation of state
266     /// identifiers. If `u16` is insufficient to represent all state
267     /// identifiers in this DFA, then this returns an error.
268     ///
269     /// This is a convenience routine for `to_sized::<u16>()`.
to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>>270     pub fn to_u16(&self) -> Result<SparseDFA<Vec<u8>, u16>> {
271         self.to_sized()
272     }
273 
274     /// Create a new sparse DFA whose match semantics are equivalent to
275     /// this DFA, but attempt to use `u32` for the representation of state
276     /// identifiers. If `u32` is insufficient to represent all state
277     /// identifiers in this DFA, then this returns an error.
278     ///
279     /// This is a convenience routine for `to_sized::<u32>()`.
280     #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>>281     pub fn to_u32(&self) -> Result<SparseDFA<Vec<u8>, u32>> {
282         self.to_sized()
283     }
284 
285     /// Create a new sparse DFA whose match semantics are equivalent to
286     /// this DFA, but attempt to use `u64` for the representation of state
287     /// identifiers. If `u64` is insufficient to represent all state
288     /// identifiers in this DFA, then this returns an error.
289     ///
290     /// This is a convenience routine for `to_sized::<u64>()`.
291     #[cfg(target_pointer_width = "64")]
to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>>292     pub fn to_u64(&self) -> Result<SparseDFA<Vec<u8>, u64>> {
293         self.to_sized()
294     }
295 
296     /// Create a new sparse DFA whose match semantics are equivalent to
297     /// this DFA, but attempt to use `A` for the representation of state
298     /// identifiers. If `A` is insufficient to represent all state identifiers
299     /// in this DFA, then this returns an error.
300     ///
301     /// An alternative way to construct such a DFA is to use
302     /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized).
303     /// In general, picking the appropriate size upon initial construction of
304     /// a sparse DFA is preferred, since it will do the conversion in one
305     /// step instead of two.
to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>>306     pub fn to_sized<A: StateID>(&self) -> Result<SparseDFA<Vec<u8>, A>> {
307         self.repr().to_sized().map(|r| r.into_sparse_dfa())
308     }
309 
310     /// Serialize a sparse DFA to raw bytes in little endian format.
311     ///
312     /// If the state identifier representation of this DFA has a size different
313     /// than 1, 2, 4 or 8 bytes, then this returns an error. All
314     /// implementations of `StateID` provided by this crate satisfy this
315     /// requirement.
to_bytes_little_endian(&self) -> Result<Vec<u8>>316     pub fn to_bytes_little_endian(&self) -> Result<Vec<u8>> {
317         self.repr().to_bytes::<LittleEndian>()
318     }
319 
320     /// Serialize a sparse DFA to raw bytes in big endian format.
321     ///
322     /// If the state identifier representation of this DFA has a size different
323     /// than 1, 2, 4 or 8 bytes, then this returns an error. All
324     /// implementations of `StateID` provided by this crate satisfy this
325     /// requirement.
to_bytes_big_endian(&self) -> Result<Vec<u8>>326     pub fn to_bytes_big_endian(&self) -> Result<Vec<u8>> {
327         self.repr().to_bytes::<BigEndian>()
328     }
329 
330     /// Serialize a sparse DFA to raw bytes in native endian format.
331     /// Generally, it is better to pick an explicit endianness using either
332     /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is
333     /// useful in tests where the DFA is serialized and deserialized on the
334     /// same platform.
335     ///
336     /// If the state identifier representation of this DFA has a size different
337     /// than 1, 2, 4 or 8 bytes, then this returns an error. All
338     /// implementations of `StateID` provided by this crate satisfy this
339     /// requirement.
to_bytes_native_endian(&self) -> Result<Vec<u8>>340     pub fn to_bytes_native_endian(&self) -> Result<Vec<u8>> {
341         self.repr().to_bytes::<NativeEndian>()
342     }
343 }
344 
345 impl<'a, S: StateID> SparseDFA<&'a [u8], S> {
346     /// Deserialize a sparse DFA with a specific state identifier
347     /// representation.
348     ///
349     /// Deserializing a DFA using this routine will never allocate heap memory.
350     /// This is also guaranteed to be a constant time operation that does not
351     /// vary with the size of the DFA.
352     ///
353     /// The bytes given should be generated by the serialization of a DFA with
354     /// either the
355     /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian)
356     /// method or the
357     /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian)
358     /// endian, depending on the endianness of the machine you are
359     /// deserializing this DFA from.
360     ///
361     /// If the state identifier representation is `usize`, then deserialization
362     /// is dependent on the pointer size. For this reason, it is best to
363     /// serialize DFAs using a fixed size representation for your state
364     /// identifiers, such as `u8`, `u16`, `u32` or `u64`.
365     ///
366     /// # Panics
367     ///
368     /// The bytes given should be *trusted*. In particular, if the bytes
369     /// are not a valid serialization of a DFA, or if the endianness of the
370     /// serialized bytes is different than the endianness of the machine that
371     /// is deserializing the DFA, then this routine will panic. Moreover, it
372     /// is possible for this deserialization routine to succeed even if the
373     /// given bytes do not represent a valid serialized sparse DFA.
374     ///
375     /// # Safety
376     ///
377     /// This routine is unsafe because it permits callers to provide an
378     /// arbitrary transition table with possibly incorrect transitions. While
379     /// the various serialization routines will never return an incorrect
380     /// transition table, there is no guarantee that the bytes provided here
381     /// are correct. While deserialization does many checks (as documented
382     /// above in the panic conditions), this routine does not check that the
383     /// transition table is correct. Given an incorrect transition table, it is
384     /// possible for the search routines to access out-of-bounds memory because
385     /// of explicit bounds check elision.
386     ///
387     /// # Example
388     ///
389     /// This example shows how to serialize a DFA to raw bytes, deserialize it
390     /// and then use it for searching. Note that we first convert the DFA to
391     /// using `u16` for its state identifier representation before serializing
392     /// it. While this isn't strictly necessary, it's good practice in order to
393     /// decrease the size of the DFA and to avoid platform specific pitfalls
394     /// such as differing pointer sizes.
395     ///
396     /// ```
397     /// use regex_automata::{DFA, DenseDFA, SparseDFA};
398     ///
399     /// # fn example() -> Result<(), regex_automata::Error> {
400     /// let sparse = SparseDFA::new("foo[0-9]+")?;
401     /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?;
402     ///
403     /// let dfa: SparseDFA<&[u8], u16> = unsafe {
404     ///     SparseDFA::from_bytes(&bytes)
405     /// };
406     ///
407     /// assert_eq!(Some(8), dfa.find(b"foo12345"));
408     /// # Ok(()) }; example().unwrap()
409     /// ```
from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S>410     pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> {
411         Repr::from_bytes(buf).into_sparse_dfa()
412     }
413 }
414 
415 impl<T: AsRef<[u8]>, S: StateID> DFA for SparseDFA<T, S> {
416     type ID = S;
417 
418     #[inline]
start_state(&self) -> S419     fn start_state(&self) -> S {
420         self.repr().start_state()
421     }
422 
423     #[inline]
is_match_state(&self, id: S) -> bool424     fn is_match_state(&self, id: S) -> bool {
425         self.repr().is_match_state(id)
426     }
427 
428     #[inline]
is_dead_state(&self, id: S) -> bool429     fn is_dead_state(&self, id: S) -> bool {
430         self.repr().is_dead_state(id)
431     }
432 
433     #[inline]
is_match_or_dead_state(&self, id: S) -> bool434     fn is_match_or_dead_state(&self, id: S) -> bool {
435         self.repr().is_match_or_dead_state(id)
436     }
437 
438     #[inline]
is_anchored(&self) -> bool439     fn is_anchored(&self) -> bool {
440         self.repr().is_anchored()
441     }
442 
443     #[inline]
next_state(&self, current: S, input: u8) -> S444     fn next_state(&self, current: S, input: u8) -> S {
445         match *self {
446             SparseDFA::Standard(ref r) => r.next_state(current, input),
447             SparseDFA::ByteClass(ref r) => r.next_state(current, input),
448             SparseDFA::__Nonexhaustive => unreachable!(),
449         }
450     }
451 
452     #[inline]
next_state_unchecked(&self, current: S, input: u8) -> S453     unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
454         self.next_state(current, input)
455     }
456 
457     // We specialize the following methods because it lets us lift the
458     // case analysis between the different types of sparse DFAs. Instead of
459     // doing the case analysis for every transition, we do it once before
460     // searching. For sparse DFAs, this doesn't seem to benefit performance as
461     // much as it does for the dense DFAs, but it's easy to do so we might as
462     // well do it.
463 
464     #[inline]
is_match_at(&self, bytes: &[u8], start: usize) -> bool465     fn is_match_at(&self, bytes: &[u8], start: usize) -> bool {
466         match *self {
467             SparseDFA::Standard(ref r) => r.is_match_at(bytes, start),
468             SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start),
469             SparseDFA::__Nonexhaustive => unreachable!(),
470         }
471     }
472 
473     #[inline]
shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize>474     fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
475         match *self {
476             SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start),
477             SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start),
478             SparseDFA::__Nonexhaustive => unreachable!(),
479         }
480     }
481 
482     #[inline]
find_at(&self, bytes: &[u8], start: usize) -> Option<usize>483     fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
484         match *self {
485             SparseDFA::Standard(ref r) => r.find_at(bytes, start),
486             SparseDFA::ByteClass(ref r) => r.find_at(bytes, start),
487             SparseDFA::__Nonexhaustive => unreachable!(),
488         }
489     }
490 
491     #[inline]
rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize>492     fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> {
493         match *self {
494             SparseDFA::Standard(ref r) => r.rfind_at(bytes, start),
495             SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start),
496             SparseDFA::__Nonexhaustive => unreachable!(),
497         }
498     }
499 }
500 
501 /// A standard sparse DFA that does not use premultiplication or byte classes.
502 ///
503 /// Generally, it isn't necessary to use this type directly, since a
504 /// `SparseDFA` can be used for searching directly. One possible reason why
505 /// one might want to use this type directly is if you are implementing your
506 /// own search routines by walking a DFA's transitions directly. In that case,
507 /// you'll want to use this type (or any of the other DFA variant types)
508 /// directly, since they implement `next_state` more efficiently.
509 #[derive(Clone, Debug)]
510 pub struct Standard<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
511 
512 impl<T: AsRef<[u8]>, S: StateID> DFA for Standard<T, S> {
513     type ID = S;
514 
515     #[inline]
start_state(&self) -> S516     fn start_state(&self) -> S {
517         self.0.start_state()
518     }
519 
520     #[inline]
is_match_state(&self, id: S) -> bool521     fn is_match_state(&self, id: S) -> bool {
522         self.0.is_match_state(id)
523     }
524 
525     #[inline]
is_dead_state(&self, id: S) -> bool526     fn is_dead_state(&self, id: S) -> bool {
527         self.0.is_dead_state(id)
528     }
529 
530     #[inline]
is_match_or_dead_state(&self, id: S) -> bool531     fn is_match_or_dead_state(&self, id: S) -> bool {
532         self.0.is_match_or_dead_state(id)
533     }
534 
535     #[inline]
is_anchored(&self) -> bool536     fn is_anchored(&self) -> bool {
537         self.0.is_anchored()
538     }
539 
540     #[inline]
next_state(&self, current: S, input: u8) -> S541     fn next_state(&self, current: S, input: u8) -> S {
542         self.0.state(current).next(input)
543     }
544 
545     #[inline]
next_state_unchecked(&self, current: S, input: u8) -> S546     unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
547         self.next_state(current, input)
548     }
549 }
550 
551 /// A sparse DFA that shrinks its alphabet.
552 ///
553 /// Alphabet shrinking is achieved by using a set of equivalence classes
554 /// instead of using all possible byte values. Any two bytes belong to the same
555 /// equivalence class if and only if they can be used interchangeably anywhere
556 /// in the DFA while never discriminating between a match and a non-match.
557 ///
558 /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from
559 /// using byte classes. In some cases, using byte classes can even marginally
560 /// increase the size of a sparse DFA's transition table. The reason for this
561 /// is that a sparse DFA already compacts each state's transitions separate
562 /// from whether byte classes are used.
563 ///
564 /// Generally, it isn't necessary to use this type directly, since a
565 /// `SparseDFA` can be used for searching directly. One possible reason why
566 /// one might want to use this type directly is if you are implementing your
567 /// own search routines by walking a DFA's transitions directly. In that case,
568 /// you'll want to use this type (or any of the other DFA variant types)
569 /// directly, since they implement `next_state` more efficiently.
570 #[derive(Clone, Debug)]
571 pub struct ByteClass<T: AsRef<[u8]>, S: StateID = usize>(Repr<T, S>);
572 
573 impl<T: AsRef<[u8]>, S: StateID> DFA for ByteClass<T, S> {
574     type ID = S;
575 
576     #[inline]
start_state(&self) -> S577     fn start_state(&self) -> S {
578         self.0.start_state()
579     }
580 
581     #[inline]
is_match_state(&self, id: S) -> bool582     fn is_match_state(&self, id: S) -> bool {
583         self.0.is_match_state(id)
584     }
585 
586     #[inline]
is_dead_state(&self, id: S) -> bool587     fn is_dead_state(&self, id: S) -> bool {
588         self.0.is_dead_state(id)
589     }
590 
591     #[inline]
is_match_or_dead_state(&self, id: S) -> bool592     fn is_match_or_dead_state(&self, id: S) -> bool {
593         self.0.is_match_or_dead_state(id)
594     }
595 
596     #[inline]
is_anchored(&self) -> bool597     fn is_anchored(&self) -> bool {
598         self.0.is_anchored()
599     }
600 
601     #[inline]
next_state(&self, current: S, input: u8) -> S602     fn next_state(&self, current: S, input: u8) -> S {
603         let input = self.0.byte_classes.get(input);
604         self.0.state(current).next(input)
605     }
606 
607     #[inline]
next_state_unchecked(&self, current: S, input: u8) -> S608     unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S {
609         self.next_state(current, input)
610     }
611 }
612 
613 /// The underlying representation of a sparse DFA. This is shared by all of
614 /// the different variants of a sparse DFA.
615 #[derive(Clone)]
616 #[cfg_attr(not(feature = "std"), derive(Debug))]
617 struct Repr<T: AsRef<[u8]>, S: StateID = usize> {
618     anchored: bool,
619     start: S,
620     state_count: usize,
621     max_match: S,
622     byte_classes: ByteClasses,
623     trans: T,
624 }
625 
626 impl<T: AsRef<[u8]>, S: StateID> Repr<T, S> {
into_sparse_dfa(self) -> SparseDFA<T, S>627     fn into_sparse_dfa(self) -> SparseDFA<T, S> {
628         if self.byte_classes.is_singleton() {
629             SparseDFA::Standard(Standard(self))
630         } else {
631             SparseDFA::ByteClass(ByteClass(self))
632         }
633     }
634 
as_ref<'a>(&'a self) -> Repr<&'a [u8], S>635     fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> {
636         Repr {
637             anchored: self.anchored,
638             start: self.start,
639             state_count: self.state_count,
640             max_match: self.max_match,
641             byte_classes: self.byte_classes.clone(),
642             trans: self.trans(),
643         }
644     }
645 
646     #[cfg(feature = "std")]
to_owned(&self) -> Repr<Vec<u8>, S>647     fn to_owned(&self) -> Repr<Vec<u8>, S> {
648         Repr {
649             anchored: self.anchored,
650             start: self.start,
651             state_count: self.state_count,
652             max_match: self.max_match,
653             byte_classes: self.byte_classes.clone(),
654             trans: self.trans().to_vec(),
655         }
656     }
657 
658     /// Return a convenient representation of the given state.
659     ///
660     /// This is marked as inline because it doesn't seem to get inlined
661     /// otherwise, which leads to a fairly significant performance loss (~25%).
662     #[inline]
state<'a>(&'a self, id: S) -> State<'a, S>663     fn state<'a>(&'a self, id: S) -> State<'a, S> {
664         let mut pos = id.to_usize();
665         let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize;
666         pos += 2;
667         let input_ranges = &self.trans()[pos..pos + (ntrans * 2)];
668         pos += 2 * ntrans;
669         let next = &self.trans()[pos..pos + (ntrans * size_of::<S>())];
670         State { _state_id_repr: PhantomData, ntrans, input_ranges, next }
671     }
672 
673     /// Return an iterator over all of the states in this DFA.
674     ///
675     /// The iterator returned yields tuples, where the first element is the
676     /// state ID and the second element is the state itself.
677     #[cfg(feature = "std")]
states<'a>(&'a self) -> StateIter<'a, T, S>678     fn states<'a>(&'a self) -> StateIter<'a, T, S> {
679         StateIter { dfa: self, id: dead_id() }
680     }
681 
memory_usage(&self) -> usize682     fn memory_usage(&self) -> usize {
683         self.trans().len()
684     }
685 
start_state(&self) -> S686     fn start_state(&self) -> S {
687         self.start
688     }
689 
is_match_state(&self, id: S) -> bool690     fn is_match_state(&self, id: S) -> bool {
691         self.is_match_or_dead_state(id) && !self.is_dead_state(id)
692     }
693 
is_dead_state(&self, id: S) -> bool694     fn is_dead_state(&self, id: S) -> bool {
695         id == dead_id()
696     }
697 
is_match_or_dead_state(&self, id: S) -> bool698     fn is_match_or_dead_state(&self, id: S) -> bool {
699         id <= self.max_match
700     }
701 
is_anchored(&self) -> bool702     fn is_anchored(&self) -> bool {
703         self.anchored
704     }
705 
trans(&self) -> &[u8]706     fn trans(&self) -> &[u8] {
707         self.trans.as_ref()
708     }
709 
710     /// Create a new sparse DFA whose match semantics are equivalent to this
711     /// DFA, but attempt to use `A` for the representation of state
712     /// identifiers. If `A` is insufficient to represent all state identifiers
713     /// in this DFA, then this returns an error.
714     #[cfg(feature = "std")]
to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>>715     fn to_sized<A: StateID>(&self) -> Result<Repr<Vec<u8>, A>> {
716         // To build the new DFA, we proceed much like the initial construction
717         // of the sparse DFA. Namely, since the state ID size is changing,
718         // we don't actually know all of our state IDs until we've allocated
719         // all necessary space. So we do one pass that allocates all of the
720         // storage we need, and then another pass to fill in the transitions.
721 
722         let mut trans = Vec::with_capacity(size_of::<A>() * self.state_count);
723         let mut map: HashMap<S, A> = HashMap::with_capacity(self.state_count);
724         for (old_id, state) in self.states() {
725             let pos = trans.len();
726             map.insert(old_id, usize_to_state_id(pos)?);
727 
728             let n = state.ntrans;
729             let zeros = 2 + (n * 2) + (n * size_of::<A>());
730             trans.extend(iter::repeat(0).take(zeros));
731 
732             NativeEndian::write_u16(&mut trans[pos..], n as u16);
733             let (s, e) = (pos + 2, pos + 2 + (n * 2));
734             trans[s..e].copy_from_slice(state.input_ranges);
735         }
736 
737         let mut new = Repr {
738             anchored: self.anchored,
739             start: map[&self.start],
740             state_count: self.state_count,
741             max_match: map[&self.max_match],
742             byte_classes: self.byte_classes.clone(),
743             trans,
744         };
745         for (&old_id, &new_id) in map.iter() {
746             let old_state = self.state(old_id);
747             let mut new_state = new.state_mut(new_id);
748             for i in 0..new_state.ntrans {
749                 let next = map[&old_state.next_at(i)];
750                 new_state.set_next_at(i, usize_to_state_id(next.to_usize())?);
751             }
752         }
753         new.start = map[&self.start];
754         new.max_match = map[&self.max_match];
755         Ok(new)
756     }
757 
758     /// Serialize a sparse DFA to raw bytes using the provided endianness.
759     ///
760     /// If the state identifier representation of this DFA has a size different
761     /// than 1, 2, 4 or 8 bytes, then this returns an error. All
762     /// implementations of `StateID` provided by this crate satisfy this
763     /// requirement.
764     ///
765     /// Unlike dense DFAs, the result is not necessarily aligned since a
766     /// sparse DFA's transition table is always read as a sequence of bytes.
767     #[cfg(feature = "std")]
to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>>768     fn to_bytes<A: ByteOrder>(&self) -> Result<Vec<u8>> {
769         let label = b"rust-regex-automata-sparse-dfa\x00";
770         let size =
771             // For human readable label.
772             label.len()
773             // endiannes check, must be equal to 0xFEFF for native endian
774             + 2
775             // For version number.
776             + 2
777             // Size of state ID representation, in bytes.
778             // Must be 1, 2, 4 or 8.
779             + 2
780             // For DFA misc options. (Currently unused.)
781             + 2
782             // For start state.
783             + 8
784             // For state count.
785             + 8
786             // For max match state.
787             + 8
788             // For byte class map.
789             + 256
790             // For transition table.
791             + self.trans().len();
792 
793         let mut i = 0;
794         let mut buf = vec![0; size];
795 
796         // write label
797         for &b in label {
798             buf[i] = b;
799             i += 1;
800         }
801         // endianness check
802         A::write_u16(&mut buf[i..], 0xFEFF);
803         i += 2;
804         // version number
805         A::write_u16(&mut buf[i..], 1);
806         i += 2;
807         // size of state ID
808         let state_size = size_of::<S>();
809         if ![1, 2, 4, 8].contains(&state_size) {
810             return Err(Error::serialize(&format!(
811                 "state size of {} not supported, must be 1, 2, 4 or 8",
812                 state_size
813             )));
814         }
815         A::write_u16(&mut buf[i..], state_size as u16);
816         i += 2;
817         // DFA misc options
818         let mut options = 0u16;
819         if self.anchored {
820             options |= dense::MASK_ANCHORED;
821         }
822         A::write_u16(&mut buf[i..], options);
823         i += 2;
824         // start state
825         A::write_u64(&mut buf[i..], self.start.to_usize() as u64);
826         i += 8;
827         // state count
828         A::write_u64(&mut buf[i..], self.state_count as u64);
829         i += 8;
830         // max match state
831         A::write_u64(&mut buf[i..], self.max_match.to_usize() as u64);
832         i += 8;
833         // byte class map
834         for b in (0..256).map(|b| b as u8) {
835             buf[i] = self.byte_classes.get(b);
836             i += 1;
837         }
838         // transition table
839         for (_, state) in self.states() {
840             A::write_u16(&mut buf[i..], state.ntrans as u16);
841             i += 2;
842             buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges);
843             i += state.ntrans * 2;
844             for j in 0..state.ntrans {
845                 write_state_id_bytes::<A, _>(&mut buf[i..], state.next_at(j));
846                 i += size_of::<S>();
847             }
848         }
849 
850         assert_eq!(size, i, "expected to consume entire buffer");
851 
852         Ok(buf)
853     }
854 }
855 
856 impl<'a, S: StateID> Repr<&'a [u8], S> {
857     /// The implementation for deserializing a sparse DFA from raw bytes.
from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S>858     unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> {
859         // skip over label
860         match buf.iter().position(|&b| b == b'\x00') {
861             None => panic!("could not find label"),
862             Some(i) => buf = &buf[i + 1..],
863         }
864 
865         // check that current endianness is same as endianness of DFA
866         let endian_check = NativeEndian::read_u16(buf);
867         buf = &buf[2..];
868         if endian_check != 0xFEFF {
869             panic!(
870                 "endianness mismatch, expected 0xFEFF but got 0x{:X}. \
871                  are you trying to load a SparseDFA serialized with a \
872                  different endianness?",
873                 endian_check,
874             );
875         }
876 
877         // check that the version number is supported
878         let version = NativeEndian::read_u16(buf);
879         buf = &buf[2..];
880         if version != 1 {
881             panic!(
882                 "expected version 1, but found unsupported version {}",
883                 version,
884             );
885         }
886 
887         // read size of state
888         let state_size = NativeEndian::read_u16(buf) as usize;
889         if state_size != size_of::<S>() {
890             panic!(
891                 "state size of SparseDFA ({}) does not match \
892                  requested state size ({})",
893                 state_size,
894                 size_of::<S>(),
895             );
896         }
897         buf = &buf[2..];
898 
899         // read miscellaneous options
900         let opts = NativeEndian::read_u16(buf);
901         buf = &buf[2..];
902 
903         // read start state
904         let start = S::from_usize(NativeEndian::read_u64(buf) as usize);
905         buf = &buf[8..];
906 
907         // read state count
908         let state_count = NativeEndian::read_u64(buf) as usize;
909         buf = &buf[8..];
910 
911         // read max match state
912         let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize);
913         buf = &buf[8..];
914 
915         // read byte classes
916         let byte_classes = ByteClasses::from_slice(&buf[..256]);
917         buf = &buf[256..];
918 
919         Repr {
920             anchored: opts & dense::MASK_ANCHORED > 0,
921             start,
922             state_count,
923             max_match,
924             byte_classes,
925             trans: buf,
926         }
927     }
928 }
929 
930 #[cfg(feature = "std")]
931 impl<S: StateID> Repr<Vec<u8>, S> {
932     /// The implementation for constructing a sparse DFA from a dense DFA.
from_dense_sized<T: AsRef<[S]>, A: StateID>( dfa: &dense::Repr<T, S>, ) -> Result<Repr<Vec<u8>, A>>933     fn from_dense_sized<T: AsRef<[S]>, A: StateID>(
934         dfa: &dense::Repr<T, S>,
935     ) -> Result<Repr<Vec<u8>, A>> {
936         // In order to build the transition table, we need to be able to write
937         // state identifiers for each of the "next" transitions in each state.
938         // Our state identifiers correspond to the byte offset in the
939         // transition table at which the state is encoded. Therefore, we do not
940         // actually know what the state identifiers are until we've allocated
941         // exactly as much space as we need for each state. Thus, construction
942         // of the transition table happens in two passes.
943         //
944         // In the first pass, we fill out the shell of each state, which
945         // includes the transition count, the input byte ranges and zero-filled
946         // space for the transitions. In this first pass, we also build up a
947         // map from the state identifier index of the dense DFA to the state
948         // identifier in this sparse DFA.
949         //
950         // In the second pass, we fill in the transitions based on the map
951         // built in the first pass.
952 
953         let mut trans = Vec::with_capacity(size_of::<A>() * dfa.state_count());
954         let mut remap: Vec<A> = vec![dead_id(); dfa.state_count()];
955         for (old_id, state) in dfa.states() {
956             let pos = trans.len();
957 
958             remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?;
959             // zero-filled space for the transition count
960             trans.push(0);
961             trans.push(0);
962 
963             let mut trans_count = 0;
964             for (b1, b2, _) in state.sparse_transitions() {
965                 trans_count += 1;
966                 trans.push(b1);
967                 trans.push(b2);
968             }
969             // fill in the transition count
970             NativeEndian::write_u16(&mut trans[pos..], trans_count);
971 
972             // zero-fill the actual transitions
973             let zeros = trans_count as usize * size_of::<A>();
974             trans.extend(iter::repeat(0).take(zeros));
975         }
976 
977         let mut new = Repr {
978             anchored: dfa.is_anchored(),
979             start: remap[dfa.state_id_to_index(dfa.start_state())],
980             state_count: dfa.state_count(),
981             max_match: remap[dfa.state_id_to_index(dfa.max_match_state())],
982             byte_classes: dfa.byte_classes().clone(),
983             trans,
984         };
985         for (old_id, old_state) in dfa.states() {
986             let new_id = remap[dfa.state_id_to_index(old_id)];
987             let mut new_state = new.state_mut(new_id);
988             let sparse = old_state.sparse_transitions();
989             for (i, (_, _, next)) in sparse.enumerate() {
990                 let next = remap[dfa.state_id_to_index(next)];
991                 new_state.set_next_at(i, next);
992             }
993         }
994         Ok(new)
995     }
996 
997     /// Return a convenient mutable representation of the given state.
state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S>998     fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> {
999         let mut pos = id.to_usize();
1000         let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize;
1001         pos += 2;
1002 
1003         let size = (ntrans * 2) + (ntrans * size_of::<S>());
1004         let ranges_and_next = &mut self.trans[pos..pos + size];
1005         let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2);
1006         StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next }
1007     }
1008 }
1009 
1010 #[cfg(feature = "std")]
1011 impl<T: AsRef<[u8]>, S: StateID> fmt::Debug for Repr<T, S> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1012     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1013         fn state_status<T: AsRef<[u8]>, S: StateID>(
1014             dfa: &Repr<T, S>,
1015             id: S,
1016         ) -> &'static str {
1017             if id == dead_id() {
1018                 if dfa.is_match_state(id) {
1019                     "D*"
1020                 } else {
1021                     "D "
1022                 }
1023             } else if id == dfa.start_state() {
1024                 if dfa.is_match_state(id) {
1025                     ">*"
1026                 } else {
1027                     "> "
1028                 }
1029             } else {
1030                 if dfa.is_match_state(id) {
1031                     " *"
1032                 } else {
1033                     "  "
1034                 }
1035             }
1036         }
1037 
1038         writeln!(f, "SparseDFA(")?;
1039         for (id, state) in self.states() {
1040             let status = state_status(self, id);
1041             writeln!(f, "{}{:06}: {:?}", status, id.to_usize(), state)?;
1042         }
1043         writeln!(f, ")")?;
1044         Ok(())
1045     }
1046 }
1047 
1048 /// An iterator over all states in a sparse DFA.
1049 ///
1050 /// This iterator yields tuples, where the first element is the state ID and
1051 /// the second element is the state itself.
1052 #[cfg(feature = "std")]
1053 #[derive(Debug)]
1054 struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> {
1055     dfa: &'a Repr<T, S>,
1056     id: S,
1057 }
1058 
1059 #[cfg(feature = "std")]
1060 impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> {
1061     type Item = (S, State<'a, S>);
1062 
next(&mut self) -> Option<(S, State<'a, S>)>1063     fn next(&mut self) -> Option<(S, State<'a, S>)> {
1064         if self.id.to_usize() >= self.dfa.trans().len() {
1065             return None;
1066         }
1067         let id = self.id;
1068         let state = self.dfa.state(id);
1069         self.id = S::from_usize(self.id.to_usize() + state.bytes());
1070         Some((id, state))
1071     }
1072 }
1073 
1074 /// A representation of a sparse DFA state that can be cheaply materialized
1075 /// from a state identifier.
1076 #[derive(Clone)]
1077 struct State<'a, S: StateID = usize> {
1078     /// The state identifier representation used by the DFA from which this
1079     /// state was extracted. Since our transition table is compacted in a
1080     /// &[u8], we don't actually use the state ID type parameter explicitly
1081     /// anywhere, so we fake it. This prevents callers from using an incorrect
1082     /// state ID representation to read from this state.
1083     _state_id_repr: PhantomData<S>,
1084     /// The number of transitions in this state.
1085     ntrans: usize,
1086     /// Pairs of input ranges, where there is one pair for each transition.
1087     /// Each pair specifies an inclusive start and end byte range for the
1088     /// corresponding transition.
1089     input_ranges: &'a [u8],
1090     /// Transitions to the next state. This slice contains native endian
1091     /// encoded state identifiers, with `S` as the representation. Thus, there
1092     /// are `ntrans * size_of::<S>()` bytes in this slice.
1093     next: &'a [u8],
1094 }
1095 
1096 impl<'a, S: StateID> State<'a, S> {
1097     /// Searches for the next transition given an input byte. If no such
1098     /// transition could be found, then a dead state is returned.
next(&self, input: u8) -> S1099     fn next(&self, input: u8) -> S {
1100         // This straight linear search was observed to be much better than
1101         // binary search on ASCII haystacks, likely because a binary search
1102         // visits the ASCII case last but a linear search sees it first. A
1103         // binary search does do a little better on non-ASCII haystacks, but
1104         // not by much. There might be a better trade off lurking here.
1105         for i in 0..self.ntrans {
1106             let (start, end) = self.range(i);
1107             if start <= input && input <= end {
1108                 return self.next_at(i);
1109             }
1110             // We could bail early with an extra branch: if input < b1, then
1111             // we know we'll never find a matching transition. Interestingly,
1112             // this extra branch seems to not help performance, or will even
1113             // hurt it. It's likely very dependent on the DFA itself and what
1114             // is being searched.
1115         }
1116         dead_id()
1117     }
1118 
1119     /// Returns the inclusive input byte range for the ith transition in this
1120     /// state.
range(&self, i: usize) -> (u8, u8)1121     fn range(&self, i: usize) -> (u8, u8) {
1122         (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
1123     }
1124 
1125     /// Returns the next state for the ith transition in this state.
next_at(&self, i: usize) -> S1126     fn next_at(&self, i: usize) -> S {
1127         S::read_bytes(&self.next[i * size_of::<S>()..])
1128     }
1129 
1130     /// Return the total number of bytes that this state consumes in its
1131     /// encoded form.
1132     #[cfg(feature = "std")]
bytes(&self) -> usize1133     fn bytes(&self) -> usize {
1134         2 + (self.ntrans * 2) + (self.ntrans * size_of::<S>())
1135     }
1136 }
1137 
1138 #[cfg(feature = "std")]
1139 impl<'a, S: StateID> fmt::Debug for State<'a, S> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1140     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1141         let mut transitions = vec![];
1142         for i in 0..self.ntrans {
1143             let next = self.next_at(i);
1144             if next == dead_id() {
1145                 continue;
1146             }
1147 
1148             let (start, end) = self.range(i);
1149             if start == end {
1150                 transitions.push(format!(
1151                     "{} => {}",
1152                     escape(start),
1153                     next.to_usize()
1154                 ));
1155             } else {
1156                 transitions.push(format!(
1157                     "{}-{} => {}",
1158                     escape(start),
1159                     escape(end),
1160                     next.to_usize(),
1161                 ));
1162             }
1163         }
1164         write!(f, "{}", transitions.join(", "))
1165     }
1166 }
1167 
1168 /// A representation of a mutable sparse DFA state that can be cheaply
1169 /// materialized from a state identifier.
1170 #[cfg(feature = "std")]
1171 struct StateMut<'a, S: StateID = usize> {
1172     /// The state identifier representation used by the DFA from which this
1173     /// state was extracted. Since our transition table is compacted in a
1174     /// &[u8], we don't actually use the state ID type parameter explicitly
1175     /// anywhere, so we fake it. This prevents callers from using an incorrect
1176     /// state ID representation to read from this state.
1177     _state_id_repr: PhantomData<S>,
1178     /// The number of transitions in this state.
1179     ntrans: usize,
1180     /// Pairs of input ranges, where there is one pair for each transition.
1181     /// Each pair specifies an inclusive start and end byte range for the
1182     /// corresponding transition.
1183     input_ranges: &'a mut [u8],
1184     /// Transitions to the next state. This slice contains native endian
1185     /// encoded state identifiers, with `S` as the representation. Thus, there
1186     /// are `ntrans * size_of::<S>()` bytes in this slice.
1187     next: &'a mut [u8],
1188 }
1189 
1190 #[cfg(feature = "std")]
1191 impl<'a, S: StateID> StateMut<'a, S> {
1192     /// Sets the ith transition to the given state.
set_next_at(&mut self, i: usize, next: S)1193     fn set_next_at(&mut self, i: usize, next: S) {
1194         next.write_bytes(&mut self.next[i * size_of::<S>()..]);
1195     }
1196 }
1197 
1198 #[cfg(feature = "std")]
1199 impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1200     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1201         let state = State {
1202             _state_id_repr: self._state_id_repr,
1203             ntrans: self.ntrans,
1204             input_ranges: self.input_ranges,
1205             next: self.next,
1206         };
1207         fmt::Debug::fmt(&state, f)
1208     }
1209 }
1210 
1211 /// Return the given byte as its escaped string form.
1212 #[cfg(feature = "std")]
escape(b: u8) -> String1213 fn escape(b: u8) -> String {
1214     use std::ascii;
1215 
1216     String::from_utf8(ascii::escape_default(b).collect::<Vec<_>>()).unwrap()
1217 }
1218 
1219 /// A binary search routine specialized specifically to a sparse DFA state's
1220 /// transitions. Specifically, the transitions are defined as a set of pairs
1221 /// of input bytes that delineate an inclusive range of bytes. If the input
1222 /// byte is in the range, then the corresponding transition is a match.
1223 ///
1224 /// This binary search accepts a slice of these pairs and returns the position
1225 /// of the matching pair (the ith transition), or None if no matching pair
1226 /// could be found.
1227 ///
1228 /// Note that this routine is not currently used since it was observed to
1229 /// either decrease performance when searching ASCII, or did not provide enough
1230 /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
1231 /// for posterity in case we can find a way to use it.
1232 ///
1233 /// In theory, we could use the standard library's search routine if we could
1234 /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
1235 /// guaranteed to be safe and is thus UB (since I don't think the in-memory
1236 /// representation of `(u8, u8)` has been nailed down).
1237 #[inline(always)]
1238 #[allow(dead_code)]
binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize>1239 fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
1240     debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
1241     debug_assert!(ranges.len() <= 512, "ranges should be short");
1242 
1243     let (mut left, mut right) = (0, ranges.len() / 2);
1244     while left < right {
1245         let mid = (left + right) / 2;
1246         let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
1247         if needle < b1 {
1248             right = mid;
1249         } else if needle > b2 {
1250             left = mid + 1;
1251         } else {
1252             return Some(mid);
1253         }
1254     }
1255     None
1256 }
1257