1 /* Copyright (c) 2012-2017 The ANTLR Project. All rights reserved.
2  * Use of this file is governed by the BSD 3-clause license that
3  * can be found in the LICENSE.txt file in the project root.
4  */
5 
6 #include "misc/MurmurHash.h"
7 #include "Lexer.h"
8 #include "Exceptions.h"
9 #include "Vocabulary.h"
10 
11 #include "misc/IntervalSet.h"
12 
13 using namespace antlr4;
14 using namespace antlr4::misc;
15 
16 IntervalSet const IntervalSet::COMPLETE_CHAR_SET =
17     IntervalSet::of(Lexer::MIN_CHAR_VALUE, Lexer::MAX_CHAR_VALUE);
18 
19 IntervalSet const IntervalSet::EMPTY_SET;
20 
IntervalSet()21 IntervalSet::IntervalSet() : _intervals() {
22 }
23 
IntervalSet(const IntervalSet & set)24 IntervalSet::IntervalSet(const IntervalSet &set) : IntervalSet() {
25   _intervals = set._intervals;
26 }
27 
IntervalSet(IntervalSet && set)28 IntervalSet::IntervalSet(IntervalSet&& set) : IntervalSet(std::move(set._intervals)) {
29 }
30 
IntervalSet(std::vector<Interval> && intervals)31 IntervalSet::IntervalSet(std::vector<Interval>&& intervals) : _intervals(std::move(intervals)) {
32 }
33 
operator =(const IntervalSet & other)34 IntervalSet& IntervalSet::operator=(const IntervalSet& other) {
35   _intervals = other._intervals;
36   return *this;
37 }
38 
operator =(IntervalSet && other)39 IntervalSet& IntervalSet::operator=(IntervalSet&& other) {
40   _intervals = move(other._intervals);
41   return *this;
42 }
43 
of(ssize_t a)44 IntervalSet IntervalSet::of(ssize_t a) {
45   return IntervalSet({ Interval(a, a) });
46 }
47 
of(ssize_t a,ssize_t b)48 IntervalSet IntervalSet::of(ssize_t a, ssize_t b) {
49   return IntervalSet({ Interval(a, b) });
50 }
51 
clear()52 void IntervalSet::clear() {
53   _intervals.clear();
54 }
55 
add(ssize_t el)56 void IntervalSet::add(ssize_t el) {
57   add(el, el);
58 }
59 
add(ssize_t a,ssize_t b)60 void IntervalSet::add(ssize_t a, ssize_t b) {
61   add(Interval(a, b));
62 }
63 
add(const Interval & addition)64 void IntervalSet::add(const Interval &addition) {
65   if (addition.b < addition.a) {
66     return;
67   }
68 
69   // find position in list
70   for (auto iterator = _intervals.begin(); iterator != _intervals.end(); ++iterator) {
71     Interval r = *iterator;
72     if (addition == r) {
73       return;
74     }
75 
76     if (addition.adjacent(r) || !addition.disjoint(r)) {
77       // next to each other, make a single larger interval
78       Interval bigger = addition.Union(r);
79       *iterator = bigger;
80 
81       // make sure we didn't just create an interval that
82       // should be merged with next interval in list
83       while (iterator + 1 != _intervals.end()) {
84         Interval next = *++iterator;
85         if (!bigger.adjacent(next) && bigger.disjoint(next)) {
86           break;
87         }
88 
89         // if we bump up against or overlap next, merge
90         iterator = _intervals.erase(iterator);// remove this one
91         --iterator; // move backwards to what we just set
92         *iterator = bigger.Union(next); // set to 3 merged ones
93         // ml: no need to advance iterator, we do that in the next round anyway. ++iterator; // first call to next after previous duplicates the result
94       }
95       return;
96     }
97 
98     if (addition.startsBeforeDisjoint(r)) {
99       // insert before r
100       //--iterator;
101       _intervals.insert(iterator, addition);
102       return;
103     }
104 
105     // if disjoint and after r, a future iteration will handle it
106   }
107 
108   // ok, must be after last interval (and disjoint from last interval)
109   // just add it
110   _intervals.push_back(addition);
111 }
112 
Or(const std::vector<IntervalSet> & sets)113 IntervalSet IntervalSet::Or(const std::vector<IntervalSet> &sets) {
114   IntervalSet result;
115   for (auto &s : sets) {
116     result.addAll(s);
117   }
118   return result;
119 }
120 
addAll(const IntervalSet & set)121 IntervalSet& IntervalSet::addAll(const IntervalSet &set) {
122   // walk set and add each interval
123   for (auto const& interval : set._intervals) {
124     add(interval);
125   }
126   return *this;
127 }
128 
complement(ssize_t minElement,ssize_t maxElement) const129 IntervalSet IntervalSet::complement(ssize_t minElement, ssize_t maxElement) const {
130   return complement(IntervalSet::of(minElement, maxElement));
131 }
132 
complement(const IntervalSet & vocabulary) const133 IntervalSet IntervalSet::complement(const IntervalSet &vocabulary) const {
134   return vocabulary.subtract(*this);
135 }
136 
subtract(const IntervalSet & other) const137 IntervalSet IntervalSet::subtract(const IntervalSet &other) const {
138   return subtract(*this, other);
139 }
140 
subtract(const IntervalSet & left,const IntervalSet & right)141 IntervalSet IntervalSet::subtract(const IntervalSet &left, const IntervalSet &right) {
142   if (left.isEmpty()) {
143     return IntervalSet();
144   }
145 
146   if (right.isEmpty()) {
147     // right set has no elements; just return the copy of the current set
148     return left;
149   }
150 
151   IntervalSet result(left);
152   size_t resultI = 0;
153   size_t rightI = 0;
154   while (resultI < result._intervals.size() && rightI < right._intervals.size()) {
155     Interval &resultInterval = result._intervals[resultI];
156     const Interval &rightInterval = right._intervals[rightI];
157 
158     // operation: (resultInterval - rightInterval) and update indexes
159 
160     if (rightInterval.b < resultInterval.a) {
161       rightI++;
162       continue;
163     }
164 
165     if (rightInterval.a > resultInterval.b) {
166       resultI++;
167       continue;
168     }
169 
170     Interval beforeCurrent;
171     Interval afterCurrent;
172     if (rightInterval.a > resultInterval.a) {
173       beforeCurrent = Interval(resultInterval.a, rightInterval.a - 1);
174     }
175 
176     if (rightInterval.b < resultInterval.b) {
177       afterCurrent = Interval(rightInterval.b + 1, resultInterval.b);
178     }
179 
180     if (beforeCurrent.a > -1) { // -1 is the default value
181       if (afterCurrent.a > -1) {
182         // split the current interval into two
183         result._intervals[resultI] = beforeCurrent;
184         result._intervals.insert(result._intervals.begin() + resultI + 1, afterCurrent);
185         resultI++;
186         rightI++;
187       } else {
188         // replace the current interval
189         result._intervals[resultI] = beforeCurrent;
190         resultI++;
191       }
192     } else {
193       if (afterCurrent.a > -1) {
194         // replace the current interval
195         result._intervals[resultI] = afterCurrent;
196         rightI++;
197       } else {
198         // remove the current interval (thus no need to increment resultI)
199         result._intervals.erase(result._intervals.begin() + resultI);
200       }
201     }
202   }
203 
204   // If rightI reached right.intervals.size(), no more intervals to subtract from result.
205   // If resultI reached result.intervals.size(), we would be subtracting from an empty set.
206   // Either way, we are done.
207   return result;
208 }
209 
Or(const IntervalSet & a) const210 IntervalSet IntervalSet::Or(const IntervalSet &a) const {
211   IntervalSet result;
212   result.addAll(*this);
213   result.addAll(a);
214   return result;
215 }
216 
And(const IntervalSet & other) const217 IntervalSet IntervalSet::And(const IntervalSet &other) const {
218   IntervalSet intersection;
219   size_t i = 0;
220   size_t j = 0;
221 
222   // iterate down both interval lists looking for nondisjoint intervals
223   while (i < _intervals.size() && j < other._intervals.size()) {
224     Interval mine = _intervals[i];
225     Interval theirs = other._intervals[j];
226 
227     if (mine.startsBeforeDisjoint(theirs)) {
228       // move this iterator looking for interval that might overlap
229       i++;
230     } else if (theirs.startsBeforeDisjoint(mine)) {
231       // move other iterator looking for interval that might overlap
232       j++;
233     } else if (mine.properlyContains(theirs)) {
234       // overlap, add intersection, get next theirs
235       intersection.add(mine.intersection(theirs));
236       j++;
237     } else if (theirs.properlyContains(mine)) {
238       // overlap, add intersection, get next mine
239       intersection.add(mine.intersection(theirs));
240       i++;
241     } else if (!mine.disjoint(theirs)) {
242       // overlap, add intersection
243       intersection.add(mine.intersection(theirs));
244 
245       // Move the iterator of lower range [a..b], but not
246       // the upper range as it may contain elements that will collide
247       // with the next iterator. So, if mine=[0..115] and
248       // theirs=[115..200], then intersection is 115 and move mine
249       // but not theirs as theirs may collide with the next range
250       // in thisIter.
251       // move both iterators to next ranges
252       if (mine.startsAfterNonDisjoint(theirs)) {
253         j++;
254       } else if (theirs.startsAfterNonDisjoint(mine)) {
255         i++;
256       }
257     }
258   }
259 
260   return intersection;
261 }
262 
contains(size_t el) const263 bool IntervalSet::contains(size_t el) const {
264   return contains(symbolToNumeric(el));
265 }
266 
contains(ssize_t el) const267 bool IntervalSet::contains(ssize_t el) const {
268   if (_intervals.empty())
269     return false;
270 
271   if (el < _intervals[0].a) // list is sorted and el is before first interval; not here
272     return false;
273 
274   for (auto &interval : _intervals) {
275     if (el >= interval.a && el <= interval.b) {
276       return true; // found in this interval
277     }
278   }
279   return false;
280 }
281 
isEmpty() const282 bool IntervalSet::isEmpty() const {
283   return _intervals.empty();
284 }
285 
getSingleElement() const286 ssize_t IntervalSet::getSingleElement() const {
287   if (_intervals.size() == 1) {
288     if (_intervals[0].a == _intervals[0].b) {
289       return _intervals[0].a;
290     }
291   }
292 
293   return Token::INVALID_TYPE; // XXX: this value is 0, but 0 is a valid interval range, how can that work?
294 }
295 
getMaxElement() const296 ssize_t IntervalSet::getMaxElement() const {
297   if (_intervals.empty()) {
298     return Token::INVALID_TYPE;
299   }
300 
301   return _intervals.back().b;
302 }
303 
getMinElement() const304 ssize_t IntervalSet::getMinElement() const {
305   if (_intervals.empty()) {
306     return Token::INVALID_TYPE;
307   }
308 
309   return _intervals[0].a;
310 }
311 
getIntervals() const312 std::vector<Interval> const& IntervalSet::getIntervals() const {
313   return _intervals;
314 }
315 
hashCode() const316 size_t IntervalSet::hashCode() const {
317   size_t hash = MurmurHash::initialize();
318   for (auto &interval : _intervals) {
319     hash = MurmurHash::update(hash, interval.a);
320     hash = MurmurHash::update(hash, interval.b);
321   }
322 
323   return MurmurHash::finish(hash, _intervals.size() * 2);
324 }
325 
operator ==(const IntervalSet & other) const326 bool IntervalSet::operator == (const IntervalSet &other) const {
327   if (_intervals.empty() && other._intervals.empty())
328     return true;
329 
330   if (_intervals.size() != other._intervals.size())
331     return false;
332 
333   return std::equal(_intervals.begin(), _intervals.end(), other._intervals.begin());
334 }
335 
toString() const336 std::string IntervalSet::toString() const {
337   return toString(false);
338 }
339 
toString(bool elemAreChar) const340 std::string IntervalSet::toString(bool elemAreChar) const {
341   if (_intervals.empty()) {
342     return "{}";
343   }
344 
345   std::stringstream ss;
346   size_t effectiveSize = size();
347   if (effectiveSize > 1) {
348     ss << "{";
349   }
350 
351   bool firstEntry = true;
352   for (auto &interval : _intervals) {
353     if (!firstEntry)
354       ss << ", ";
355     firstEntry = false;
356 
357     ssize_t a = interval.a;
358     ssize_t b = interval.b;
359     if (a == b) {
360       if (a == -1) {
361         ss << "<EOF>";
362       } else if (elemAreChar) {
363         ss << "'" << static_cast<char>(a) << "'";
364       } else {
365         ss << a;
366       }
367     } else {
368       if (elemAreChar) {
369         ss << "'" << static_cast<char>(a) << "'..'" << static_cast<char>(b) << "'";
370       } else {
371         ss << a << ".." << b;
372       }
373     }
374   }
375   if (effectiveSize > 1) {
376     ss << "}";
377   }
378 
379   return ss.str();
380 }
381 
toString(const std::vector<std::string> & tokenNames) const382 std::string IntervalSet::toString(const std::vector<std::string> &tokenNames) const {
383   return toString(dfa::Vocabulary::fromTokenNames(tokenNames));
384 }
385 
toString(const dfa::Vocabulary & vocabulary) const386 std::string IntervalSet::toString(const dfa::Vocabulary &vocabulary) const {
387   if (_intervals.empty()) {
388     return "{}";
389   }
390 
391   std::stringstream ss;
392   size_t effectiveSize = size();
393   if (effectiveSize > 1) {
394     ss << "{";
395   }
396 
397   bool firstEntry = true;
398   for (auto &interval : _intervals) {
399     if (!firstEntry)
400       ss << ", ";
401     firstEntry = false;
402 
403     ssize_t a = interval.a;
404     ssize_t b = interval.b;
405     if (a == b) {
406       ss << elementName(vocabulary, a);
407     } else {
408       for (ssize_t i = a; i <= b; i++) {
409         if (i > a) {
410           ss << ", ";
411         }
412         ss << elementName(vocabulary, i);
413       }
414     }
415   }
416   if (effectiveSize > 1) {
417     ss << "}";
418   }
419 
420   return ss.str();
421 }
422 
elementName(const std::vector<std::string> & tokenNames,ssize_t a) const423 std::string IntervalSet::elementName(const std::vector<std::string> &tokenNames, ssize_t a) const {
424   return elementName(dfa::Vocabulary::fromTokenNames(tokenNames), a);
425 }
426 
elementName(const dfa::Vocabulary & vocabulary,ssize_t a) const427 std::string IntervalSet::elementName(const dfa::Vocabulary &vocabulary, ssize_t a) const {
428   if (a == -1) {
429     return "<EOF>";
430   } else if (a == -2) {
431     return "<EPSILON>";
432   } else {
433     return vocabulary.getDisplayName(a);
434   }
435 }
436 
size() const437 size_t IntervalSet::size() const {
438   size_t result = 0;
439   for (auto &interval : _intervals) {
440     result += size_t(interval.b - interval.a + 1);
441   }
442   return result;
443 }
444 
toList() const445 std::vector<ssize_t> IntervalSet::toList() const {
446   std::vector<ssize_t> result;
447   for (auto &interval : _intervals) {
448     ssize_t a = interval.a;
449     ssize_t b = interval.b;
450     for (ssize_t v = a; v <= b; v++) {
451       result.push_back(v);
452     }
453   }
454   return result;
455 }
456 
toSet() const457 std::set<ssize_t> IntervalSet::toSet() const {
458   std::set<ssize_t> result;
459   for (auto &interval : _intervals) {
460     ssize_t a = interval.a;
461     ssize_t b = interval.b;
462     for (ssize_t v = a; v <= b; v++) {
463       result.insert(v);
464     }
465   }
466   return result;
467 }
468 
get(size_t i) const469 ssize_t IntervalSet::get(size_t i) const {
470   size_t index = 0;
471   for (auto &interval : _intervals) {
472     ssize_t a = interval.a;
473     ssize_t b = interval.b;
474     for (ssize_t v = a; v <= b; v++) {
475       if (index == i) {
476         return v;
477       }
478       index++;
479     }
480   }
481   return -1;
482 }
483 
remove(size_t el)484 void IntervalSet::remove(size_t el) {
485   remove(symbolToNumeric(el));
486 }
487 
remove(ssize_t el)488 void IntervalSet::remove(ssize_t el) {
489   for (size_t i = 0; i < _intervals.size(); ++i) {
490     Interval &interval = _intervals[i];
491     ssize_t a = interval.a;
492     ssize_t b = interval.b;
493     if (el < a) {
494       break; // list is sorted and el is before this interval; not here
495     }
496 
497     // if whole interval x..x, rm
498     if (el == a && el == b) {
499       _intervals.erase(_intervals.begin() + (long)i);
500       break;
501     }
502     // if on left edge x..b, adjust left
503     if (el == a) {
504       interval.a++;
505       break;
506     }
507     // if on right edge a..x, adjust right
508     if (el == b) {
509       interval.b--;
510       break;
511     }
512     // if in middle a..x..b, split interval
513     if (el > a && el < b) { // found in this interval
514       ssize_t oldb = interval.b;
515       interval.b = el - 1; // [a..x-1]
516       add(el + 1, oldb); // add [x+1..b]
517 
518       break; // ml: not in the Java code but I believe we also should stop searching here, as we found x.
519     }
520   }
521 }
522