1 use std::char;
2 use std::cmp;
3 use std::fmt::Debug;
4 use std::slice;
5 use std::u8;
6 
7 use crate::unicode;
8 
9 // This module contains an *internal* implementation of interval sets.
10 //
// The primary invariant that interval sets guard is canonical ordering. That
12 // is, every interval set contains an ordered sequence of intervals where
13 // no two intervals are overlapping or adjacent. While this invariant is
14 // occasionally broken within the implementation, it should be impossible for
15 // callers to observe it.
16 //
17 // Since case folding (as implemented below) breaks that invariant, we roll
18 // that into this API even though it is a little out of place in an otherwise
19 // generic interval set. (Hence the reason why the `unicode` module is imported
20 // here.)
21 //
22 // Some of the implementation complexity here is a result of me wanting to
23 // preserve the sequential representation without using additional memory.
24 // In many cases, we do use linear extra memory, but it is at most 2x and it
25 // is amortized. If we relaxed the memory requirements, this implementation
26 // could become much simpler. The extra memory is honestly probably OK, but
27 // character classes (especially of the Unicode variety) can become quite
28 // large, and it would be nice to keep regex compilation snappy even in debug
29 // builds. (In the past, I have been careless with this area of code and it has
30 // caused slow regex compilations in debug mode, so this isn't entirely
31 // unwarranted.)
32 //
33 // Tests on this are relegated to the public API of HIR in src/hir.rs.
34 
/// A set of intervals, stored as a flat sequence of `I` values.
///
/// The methods on this type keep `ranges` in canonical order: sorted
/// ascending, with no two intervals overlapping or adjacent. Some methods
/// temporarily break that ordering while they work, but restore it before
/// returning.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct IntervalSet<I> {
    // The intervals making up this set, in canonical order between calls.
    ranges: Vec<I>,
}
39 
40 impl<I: Interval> IntervalSet<I> {
41     /// Create a new set from a sequence of intervals. Each interval is
42     /// specified as a pair of bounds, where both bounds are inclusive.
43     ///
44     /// The given ranges do not need to be in any specific order, and ranges
45     /// may overlap.
new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I>46     pub fn new<T: IntoIterator<Item = I>>(intervals: T) -> IntervalSet<I> {
47         let mut set = IntervalSet { ranges: intervals.into_iter().collect() };
48         set.canonicalize();
49         set
50     }
51 
52     /// Add a new interval to this set.
push(&mut self, interval: I)53     pub fn push(&mut self, interval: I) {
54         // TODO: This could be faster. e.g., Push the interval such that
55         // it preserves canonicalization.
56         self.ranges.push(interval);
57         self.canonicalize();
58     }
59 
60     /// Return an iterator over all intervals in this set.
61     ///
62     /// The iterator yields intervals in ascending order.
iter(&self) -> IntervalSetIter<'_, I>63     pub fn iter(&self) -> IntervalSetIter<'_, I> {
64         IntervalSetIter(self.ranges.iter())
65     }
66 
67     /// Return an immutable slice of intervals in this set.
68     ///
69     /// The sequence returned is in canonical ordering.
intervals(&self) -> &[I]70     pub fn intervals(&self) -> &[I] {
71         &self.ranges
72     }
73 
74     /// Expand this interval set such that it contains all case folded
75     /// characters. For example, if this class consists of the range `a-z`,
76     /// then applying case folding will result in the class containing both the
77     /// ranges `a-z` and `A-Z`.
78     ///
79     /// This returns an error if the necessary case mapping data is not
80     /// available.
case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError>81     pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> {
82         let len = self.ranges.len();
83         for i in 0..len {
84             let range = self.ranges[i];
85             if let Err(err) = range.case_fold_simple(&mut self.ranges) {
86                 self.canonicalize();
87                 return Err(err);
88             }
89         }
90         self.canonicalize();
91         Ok(())
92     }
93 
94     /// Union this set with the given set, in place.
union(&mut self, other: &IntervalSet<I>)95     pub fn union(&mut self, other: &IntervalSet<I>) {
96         // This could almost certainly be done more efficiently.
97         self.ranges.extend(&other.ranges);
98         self.canonicalize();
99     }
100 
101     /// Intersect this set with the given set, in place.
intersect(&mut self, other: &IntervalSet<I>)102     pub fn intersect(&mut self, other: &IntervalSet<I>) {
103         if self.ranges.is_empty() {
104             return;
105         }
106         if other.ranges.is_empty() {
107             self.ranges.clear();
108             return;
109         }
110 
111         // There should be a way to do this in-place with constant memory,
112         // but I couldn't figure out a simple way to do it. So just append
113         // the intersection to the end of this range, and then drain it before
114         // we're done.
115         let drain_end = self.ranges.len();
116 
117         let mut ita = (0..drain_end).into_iter();
118         let mut itb = (0..other.ranges.len()).into_iter();
119         let mut a = ita.next().unwrap();
120         let mut b = itb.next().unwrap();
121         loop {
122             if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) {
123                 self.ranges.push(ab);
124             }
125             let (it, aorb) =
126                 if self.ranges[a].upper() < other.ranges[b].upper() {
127                     (&mut ita, &mut a)
128                 } else {
129                     (&mut itb, &mut b)
130                 };
131             match it.next() {
132                 Some(v) => *aorb = v,
133                 None => break,
134             }
135         }
136         self.ranges.drain(..drain_end);
137     }
138 
139     /// Subtract the given set from this set, in place.
difference(&mut self, other: &IntervalSet<I>)140     pub fn difference(&mut self, other: &IntervalSet<I>) {
141         if self.ranges.is_empty() || other.ranges.is_empty() {
142             return;
143         }
144 
145         // This algorithm is (to me) surprisingly complex. A search of the
146         // interwebs indicate that this is a potentially interesting problem.
147         // Folks seem to suggest interval or segment trees, but I'd like to
148         // avoid the overhead (both runtime and conceptual) of that.
149         //
150         // The following is basically my Shitty First Draft. Therefore, in
151         // order to grok it, you probably need to read each line carefully.
152         // Simplifications are most welcome!
153         //
154         // Remember, we can assume the canonical format invariant here, which
155         // says that all ranges are sorted, not overlapping and not adjacent in
156         // each class.
157         let drain_end = self.ranges.len();
158         let (mut a, mut b) = (0, 0);
159         'LOOP: while a < drain_end && b < other.ranges.len() {
160             // Basically, the easy cases are when neither range overlaps with
161             // each other. If the `b` range is less than our current `a`
162             // range, then we can skip it and move on.
163             if other.ranges[b].upper() < self.ranges[a].lower() {
164                 b += 1;
165                 continue;
166             }
167             // ... similarly for the `a` range. If it's less than the smallest
168             // `b` range, then we can add it as-is.
169             if self.ranges[a].upper() < other.ranges[b].lower() {
170                 let range = self.ranges[a];
171                 self.ranges.push(range);
172                 a += 1;
173                 continue;
174             }
175             // Otherwise, we have overlapping ranges.
176             assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b]));
177 
178             // This part is tricky and was non-obvious to me without looking
179             // at explicit examples (see the tests). The trickiness stems from
180             // two things: 1) subtracting a range from another range could
181             // yield two ranges and 2) after subtracting a range, it's possible
182             // that future ranges can have an impact. The loop below advances
183             // the `b` ranges until they can't possible impact the current
184             // range.
185             //
186             // For example, if our `a` range is `a-t` and our next three `b`
187             // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply
188             // subtraction three times before moving on to the next `a` range.
189             let mut range = self.ranges[a];
190             while b < other.ranges.len()
191                 && !range.is_intersection_empty(&other.ranges[b])
192             {
193                 let old_range = range;
194                 range = match range.difference(&other.ranges[b]) {
195                     (None, None) => {
196                         // We lost the entire range, so move on to the next
197                         // without adding this one.
198                         a += 1;
199                         continue 'LOOP;
200                     }
201                     (Some(range1), None) | (None, Some(range1)) => range1,
202                     (Some(range1), Some(range2)) => {
203                         self.ranges.push(range1);
204                         range2
205                     }
206                 };
207                 // It's possible that the `b` range has more to contribute
208                 // here. In particular, if it is greater than the original
209                 // range, then it might impact the next `a` range *and* it
210                 // has impacted the current `a` range as much as possible,
211                 // so we can quit. We don't bump `b` so that the next `a`
212                 // range can apply it.
213                 if other.ranges[b].upper() > old_range.upper() {
214                     break;
215                 }
216                 // Otherwise, the next `b` range might apply to the current
217                 // `a` range.
218                 b += 1;
219             }
220             self.ranges.push(range);
221             a += 1;
222         }
223         while a < drain_end {
224             let range = self.ranges[a];
225             self.ranges.push(range);
226             a += 1;
227         }
228         self.ranges.drain(..drain_end);
229     }
230 
231     /// Compute the symmetric difference of the two sets, in place.
232     ///
233     /// This computes the symmetric difference of two interval sets. This
234     /// removes all elements in this set that are also in the given set,
235     /// but also adds all elements from the given set that aren't in this
236     /// set. That is, the set will contain all elements in either set,
237     /// but will not contain any elements that are in both sets.
symmetric_difference(&mut self, other: &IntervalSet<I>)238     pub fn symmetric_difference(&mut self, other: &IntervalSet<I>) {
239         // TODO(burntsushi): Fix this so that it amortizes allocation.
240         let mut intersection = self.clone();
241         intersection.intersect(other);
242         self.union(other);
243         self.difference(&intersection);
244     }
245 
246     /// Negate this interval set.
247     ///
248     /// For all `x` where `x` is any element, if `x` was in this set, then it
249     /// will not be in this set after negation.
negate(&mut self)250     pub fn negate(&mut self) {
251         if self.ranges.is_empty() {
252             let (min, max) = (I::Bound::min_value(), I::Bound::max_value());
253             self.ranges.push(I::create(min, max));
254             return;
255         }
256 
257         // There should be a way to do this in-place with constant memory,
258         // but I couldn't figure out a simple way to do it. So just append
259         // the negation to the end of this range, and then drain it before
260         // we're done.
261         let drain_end = self.ranges.len();
262 
263         // We do checked arithmetic below because of the canonical ordering
264         // invariant.
265         if self.ranges[0].lower() > I::Bound::min_value() {
266             let upper = self.ranges[0].lower().decrement();
267             self.ranges.push(I::create(I::Bound::min_value(), upper));
268         }
269         for i in 1..drain_end {
270             let lower = self.ranges[i - 1].upper().increment();
271             let upper = self.ranges[i].lower().decrement();
272             self.ranges.push(I::create(lower, upper));
273         }
274         if self.ranges[drain_end - 1].upper() < I::Bound::max_value() {
275             let lower = self.ranges[drain_end - 1].upper().increment();
276             self.ranges.push(I::create(lower, I::Bound::max_value()));
277         }
278         self.ranges.drain(..drain_end);
279     }
280 
281     /// Converts this set into a canonical ordering.
canonicalize(&mut self)282     fn canonicalize(&mut self) {
283         if self.is_canonical() {
284             return;
285         }
286         self.ranges.sort();
287         assert!(!self.ranges.is_empty());
288 
289         // Is there a way to do this in-place with constant memory? I couldn't
290         // figure out a way to do it. So just append the canonicalization to
291         // the end of this range, and then drain it before we're done.
292         let drain_end = self.ranges.len();
293         for oldi in 0..drain_end {
294             // If we've added at least one new range, then check if we can
295             // merge this range in the previously added range.
296             if self.ranges.len() > drain_end {
297                 let (last, rest) = self.ranges.split_last_mut().unwrap();
298                 if let Some(union) = last.union(&rest[oldi]) {
299                     *last = union;
300                     continue;
301                 }
302             }
303             let range = self.ranges[oldi];
304             self.ranges.push(range);
305         }
306         self.ranges.drain(..drain_end);
307     }
308 
309     /// Returns true if and only if this class is in a canonical ordering.
is_canonical(&self) -> bool310     fn is_canonical(&self) -> bool {
311         for pair in self.ranges.windows(2) {
312             if pair[0] >= pair[1] {
313                 return false;
314             }
315             if pair[0].is_contiguous(&pair[1]) {
316                 return false;
317             }
318         }
319         true
320     }
321 }
322 
/// An iterator over intervals.
///
/// This wraps a slice iterator and yields shared references to the
/// intervals in the order they appear in the underlying slice.
#[derive(Debug)]
pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>);
326 
327 impl<'a, I> Iterator for IntervalSetIter<'a, I> {
328     type Item = &'a I;
329 
next(&mut self) -> Option<&'a I>330     fn next(&mut self) -> Option<&'a I> {
331         self.0.next()
332     }
333 }
334 
335 pub trait Interval:
336     Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord
337 {
338     type Bound: Bound;
339 
lower(&self) -> Self::Bound340     fn lower(&self) -> Self::Bound;
upper(&self) -> Self::Bound341     fn upper(&self) -> Self::Bound;
set_lower(&mut self, bound: Self::Bound)342     fn set_lower(&mut self, bound: Self::Bound);
set_upper(&mut self, bound: Self::Bound)343     fn set_upper(&mut self, bound: Self::Bound);
case_fold_simple( &self, intervals: &mut Vec<Self>, ) -> Result<(), unicode::CaseFoldError>344     fn case_fold_simple(
345         &self,
346         intervals: &mut Vec<Self>,
347     ) -> Result<(), unicode::CaseFoldError>;
348 
349     /// Create a new interval.
create(lower: Self::Bound, upper: Self::Bound) -> Self350     fn create(lower: Self::Bound, upper: Self::Bound) -> Self {
351         let mut int = Self::default();
352         if lower <= upper {
353             int.set_lower(lower);
354             int.set_upper(upper);
355         } else {
356             int.set_lower(upper);
357             int.set_upper(lower);
358         }
359         int
360     }
361 
362     /// Union the given overlapping range into this range.
363     ///
364     /// If the two ranges aren't contiguous, then this returns `None`.
union(&self, other: &Self) -> Option<Self>365     fn union(&self, other: &Self) -> Option<Self> {
366         if !self.is_contiguous(other) {
367             return None;
368         }
369         let lower = cmp::min(self.lower(), other.lower());
370         let upper = cmp::max(self.upper(), other.upper());
371         Some(Self::create(lower, upper))
372     }
373 
374     /// Intersect this range with the given range and return the result.
375     ///
376     /// If the intersection is empty, then this returns `None`.
intersect(&self, other: &Self) -> Option<Self>377     fn intersect(&self, other: &Self) -> Option<Self> {
378         let lower = cmp::max(self.lower(), other.lower());
379         let upper = cmp::min(self.upper(), other.upper());
380         if lower <= upper {
381             Some(Self::create(lower, upper))
382         } else {
383             None
384         }
385     }
386 
387     /// Subtract the given range from this range and return the resulting
388     /// ranges.
389     ///
390     /// If subtraction would result in an empty range, then no ranges are
391     /// returned.
difference(&self, other: &Self) -> (Option<Self>, Option<Self>)392     fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
393         if self.is_subset(other) {
394             return (None, None);
395         }
396         if self.is_intersection_empty(other) {
397             return (Some(self.clone()), None);
398         }
399         let add_lower = other.lower() > self.lower();
400         let add_upper = other.upper() < self.upper();
401         // We know this because !self.is_subset(other) and the ranges have
402         // a non-empty intersection.
403         assert!(add_lower || add_upper);
404         let mut ret = (None, None);
405         if add_lower {
406             let upper = other.lower().decrement();
407             ret.0 = Some(Self::create(self.lower(), upper));
408         }
409         if add_upper {
410             let lower = other.upper().increment();
411             let range = Self::create(lower, self.upper());
412             if ret.0.is_none() {
413                 ret.0 = Some(range);
414             } else {
415                 ret.1 = Some(range);
416             }
417         }
418         ret
419     }
420 
421     /// Compute the symmetric difference the given range from this range. This
422     /// returns the union of the two ranges minus its intersection.
symmetric_difference( &self, other: &Self, ) -> (Option<Self>, Option<Self>)423     fn symmetric_difference(
424         &self,
425         other: &Self,
426     ) -> (Option<Self>, Option<Self>) {
427         let union = match self.union(other) {
428             None => return (Some(self.clone()), Some(other.clone())),
429             Some(union) => union,
430         };
431         let intersection = match self.intersect(other) {
432             None => return (Some(self.clone()), Some(other.clone())),
433             Some(intersection) => intersection,
434         };
435         union.difference(&intersection)
436     }
437 
438     /// Returns true if and only if the two ranges are contiguous. Two ranges
439     /// are contiguous if and only if the ranges are either overlapping or
440     /// adjacent.
is_contiguous(&self, other: &Self) -> bool441     fn is_contiguous(&self, other: &Self) -> bool {
442         let lower1 = self.lower().as_u32();
443         let upper1 = self.upper().as_u32();
444         let lower2 = other.lower().as_u32();
445         let upper2 = other.upper().as_u32();
446         cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
447     }
448 
449     /// Returns true if and only if the intersection of this range and the
450     /// other range is empty.
is_intersection_empty(&self, other: &Self) -> bool451     fn is_intersection_empty(&self, other: &Self) -> bool {
452         let (lower1, upper1) = (self.lower(), self.upper());
453         let (lower2, upper2) = (other.lower(), other.upper());
454         cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
455     }
456 
457     /// Returns true if and only if this range is a subset of the other range.
is_subset(&self, other: &Self) -> bool458     fn is_subset(&self, other: &Self) -> bool {
459         let (lower1, upper1) = (self.lower(), self.upper());
460         let (lower2, upper2) = (other.lower(), other.upper());
461         (lower2 <= lower1 && lower1 <= upper2)
462             && (lower2 <= upper1 && upper1 <= upper2)
463     }
464 }
465 
/// An end point of an interval.
///
/// A bound is a totally ordered value with a smallest and largest element
/// and a way to step to the neighbouring value in either direction.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    /// Return the smallest value of this bound type.
    fn min_value() -> Self;
    /// Return the largest value of this bound type.
    fn max_value() -> Self;
    /// Return this bound as a `u32`.
    fn as_u32(self) -> u32;
    /// Return the value immediately after this one.
    ///
    /// The implementations in this file panic when there is no such value.
    fn increment(self) -> Self;
    /// Return the value immediately before this one.
    ///
    /// The implementations in this file panic when there is no such value.
    fn decrement(self) -> Self;
}
475 
476 impl Bound for u8 {
min_value() -> Self477     fn min_value() -> Self {
478         u8::MIN
479     }
max_value() -> Self480     fn max_value() -> Self {
481         u8::MAX
482     }
as_u32(self) -> u32483     fn as_u32(self) -> u32 {
484         self as u32
485     }
increment(self) -> Self486     fn increment(self) -> Self {
487         self.checked_add(1).unwrap()
488     }
decrement(self) -> Self489     fn decrement(self) -> Self {
490         self.checked_sub(1).unwrap()
491     }
492 }
493 
494 impl Bound for char {
min_value() -> Self495     fn min_value() -> Self {
496         '\x00'
497     }
max_value() -> Self498     fn max_value() -> Self {
499         '\u{10FFFF}'
500     }
as_u32(self) -> u32501     fn as_u32(self) -> u32 {
502         self as u32
503     }
504 
increment(self) -> Self505     fn increment(self) -> Self {
506         match self {
507             '\u{D7FF}' => '\u{E000}',
508             c => char::from_u32((c as u32).checked_add(1).unwrap()).unwrap(),
509         }
510     }
511 
decrement(self) -> Self512     fn decrement(self) -> Self {
513         match self {
514             '\u{E000}' => '\u{D7FF}',
515             c => char::from_u32((c as u32).checked_sub(1).unwrap()).unwrap(),
516         }
517     }
518 }
519 
520 // Tests for interval sets are written in src/hir.rs against the public API.
521