1 // $Id$
2 // Copyright (c) 2001,2002 RIPE NCC
3 //
4 // All Rights Reserved
5 //
6 // Permission to use, copy, modify, and distribute this software and its
7 // documentation for any purpose and without fee is hereby granted,
8 // provided that the above copyright notice appear in all copies and that
9 // both that copyright notice and this permission notice appear in
10 // supporting documentation, and that the name of the author not be
11 // used in advertising or publicity pertaining to distribution of the
12 // software without specific, written prior permission.
13 //
14 // THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING
15 // ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS; IN NO EVENT SHALL
16 // AUTHOR BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
17 // DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
18 // AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
19 // OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
20 //
21 //
22 // Copyright (c) 1994 by the University of Southern California
23 // All rights reserved.
24 //
25 // Permission is hereby granted, free of charge, to any person obtaining a copy
26 // of this software and associated documentation files (the "Software"), to deal
27 // in the Software without restriction, including without limitation the rights
28 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
29 // copies of the Software, and to permit persons to whom the Software is
30 // furnished to do so, subject to the following conditions:
31 //
32 // The above copyright notice and this permission notice shall be included in
33 // all copies or substantial portions of the Software.
34 //
35 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
36 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
37 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
38 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
39 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
40 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
41 // THE SOFTWARE.
42 //
43 // Questions concerning this software should be directed to
44 // irrtoolset@cs.usc.edu.
45 //
46 // Author(s): Cengiz Alaettinoglu <cengiz@ISI.EDU>
47
48 #include "config.h"
49 #include <cstring>
50 #include <cassert>
51 #include <map>
52
53 #include "irrutil/debug.hh"
54 #include "regexp_nf.hh"
55 #include "dataset/SetOfUInt.hh"
56 #include "irr/irr.hh"
57
58 using namespace std;
59
60 bool regexp_nf::expand_as_macros = true;
61
62 ///// regexp_empty_set ///////////////////////////////////////////////////////
63
dup() const64 regexp* regexp_nf::dup() const {
65 return new regexp_nf(*this);
66 }
67
68 ///// regexp_nf ///////////////////////////////////////////////////////
69
operator ==(regexp_nf & b)70 bool regexp_nf::operator==(regexp_nf& b) {
71 return rd_equal_dfa(dfa(), b.dfa());
72 }
73
is_universal()74 bool regexp_nf::is_universal() {
75 return rd_is_universal_dfa(dfa());
76 }
77
isEmpty()78 bool regexp_nf::isEmpty() {
79 return rd_isEmpty_dfa(dfa());
80 }
81
isEmptyStr()82 bool regexp_nf::isEmptyStr() {
83 rd_init();
84 rd_fm* m = rd_empty_string_machine();
85 m = rd_ntod(m);
86 rd_minimize(m);
87 bool code = rd_equal_dfa(dfa(), m);
88 rd_free_dfa(m);
89 return code;
90 }
91
dfa(ASt peerAS)92 rd_fm* regexp_nf::dfa(ASt peerAS) {
93 if (!m) {
94 rd_init();
95 m = re2nfa(this, peerAS);
96 m = rd_make_bol(m);
97 m = rd_make_eol(m);
98 m = rd_ntod(m);
99 rd_minimize(m);
100 }
101 return m;
102 }
103
operator <<(ostream & s,const regexp_nf & rs)104 ostream& operator<<(ostream& s, const regexp_nf& rs) {
105 regexp_nf::RegexpConjunct *rc;
106 regexp_nf::RegexpConjunct::ReInt *ri;
107 bool complex = false;
108 bool needOr = false;
109
110 // simple re, print the regexp
111 for (rc = rs.rclist.head(); rc; rc = rs.rclist.next(rc)) {
112 if (rc->regexs.isSingleton()) {
113 ri = rc->regexs.head();
114 if (! ri->negated) {
115 if (needOr)
116 s << " | ";
117 else
118 needOr = true;
119 s << *ri->re;
120 } else
121 complex = true;
122 } else
123 complex = true;
124 }
125
126 if (complex) {
127 // complex re, print the dfa
128 regexp *reg = rs.construct();
129 s << "^" << *reg << "$";
130 delete reg;
131 }
132
133 return s;
134 }
135
136 ////////////////////////////// re2nfa ////////////////////
137
do_tildaplus_as(rd_fm * result,ASt as,bool star) const138 rd_fm *regexp_nf::do_tildaplus_as(rd_fm *result, ASt as, bool star) const {
139 rd_dq *rdr = rd_alloc_range_list_empty();
140 rd_insert_range(rdr, rd_alloc_range(as, as));
141 rd_fm* m;
142 m = rd_singleton(rdr);
143 if (star)
144 m = rd_zero_or_more(m);
145 else
146 m = rd_one_or_more(m);
147 if (! result)
148 result = m;
149 else
150 result = rd_alternate(result, m);
151
152 return result;
153 }
154
do_tildaplus(regexp_symbol * sym,ASt peerAS,bool star) const155 rd_fm *regexp_nf::do_tildaplus(regexp_symbol *sym, ASt peerAS, bool star) const {
156 RangeList::Range *r;
157 ASt as;
158 assert(!sym->complemented);
159
160 rd_fm *result = NULL;
161 for (r = sym->asnumbers.ranges.head();
162 r;
163 r = sym->asnumbers.ranges.next(r))
164 for (as = r->low; as <= r->high; ++as)
165 result = do_tildaplus_as(result, as, star);
166
167 for (Pix p = sym->asSets.first();
168 p;
169 sym->asSets.next(p)) {
170 SymID sid = (SymID) sym->asSets(p);
171 if (symbols.isPeerAS(sid)) {
172 assert(peerAS != RE_INVALID_AS);
173 result = do_tildaplus_as(result, peerAS, star);
174 if (expand_as_macros)
175 sym->asnumbers.add(peerAS, peerAS);
176 } else {
177 sid = symbols.resolvePeerAS(sid, peerAS);
178 const SetOfUInt* as_set = irr->expandASSet(sid);
179
180 if (as_set) {
181 for (Pix pi = as_set->first(); pi; as_set->next(pi)) {
182 ASt as1 = (*as_set)(pi);
183 result = do_tildaplus_as(result, as1, star);
184 if (expand_as_macros)
185 sym->asnumbers.add(as1, as1);
186 }
187 }
188 }
189 }
190
191 if (expand_as_macros)
192 sym->asSets.clear();
193
194 return result;
195 }
196
re2nfa(regexp * re,ASt peerAS) const197 rd_fm* regexp_nf::re2nfa(regexp *re, ASt peerAS) const {
198 if (typeid(*re) == typeid(regexp_empty_set))
199 return rd_empty_set_machine();
200
201 if (typeid(*re) == typeid(regexp_bol)) {
202 rd_fm* m = rd_empty_string_machine();
203 m->bolexp = 1;
204 return m;
205 }
206
207 if (typeid(*re) == typeid(regexp_eol)) {
208 rd_fm* m = rd_empty_string_machine();
209 m->eolexp = 1;
210 return m;
211 }
212
213 if (typeid(*re) == typeid(regexp_empty_str))
214 return rd_empty_string_machine();
215
216 if (typeid(*re) == typeid(regexp_cat))
217 return rd_concatenate(re2nfa(((regexp_cat *) re)->left, peerAS),
218 re2nfa(((regexp_cat *) re)->right, peerAS));
219
220 if (typeid(*re) == typeid(regexp_or))
221 return rd_alternate(re2nfa(((regexp_or *) re)->left, peerAS),
222 re2nfa(((regexp_or *) re)->right, peerAS));
223
224 if (typeid(*re) == typeid(regexp_star))
225 return rd_zero_or_more(re2nfa(((regexp_star *) re)->left, peerAS));
226
227 if (typeid(*re) == typeid(regexp_question))
228 return rd_zero_or_one(re2nfa(((regexp_question *) re)->left, peerAS));
229
230 if (typeid(*re) == typeid(regexp_plus))
231 return rd_one_or_more(re2nfa(((regexp_plus *) re)->left, peerAS));
232
233 if (typeid(*re) == typeid(regexp_tildaplus)) {
234 assert(typeid(*((regexp_tildaplus *) re)->left) == typeid(regexp_symbol));
235 return do_tildaplus((regexp_symbol *) ((regexp_tildaplus *) re)->left, peerAS, false);
236 }
237
238 if (typeid(*re) == typeid(regexp_tildastar)) {
239 assert(typeid(*((regexp_tildastar *) re)->left) == typeid(regexp_symbol));
240 return do_tildaplus((regexp_symbol *) ((regexp_tildastar *) re)->left, peerAS, true);
241 }
242
243 if (typeid(*re) == typeid(regexp_symbol)) {
244 rd_dq *rdr = rd_alloc_range_list_empty();
245
246 RangeList::Range *r;
247 for (r = ((regexp_symbol *) re)->asnumbers.ranges.head();
248 r;
249 r = ((regexp_symbol *) re)->asnumbers.ranges.next(r))
250 rd_insert_range(rdr, rd_alloc_range(r->low, r->high));
251
252 for (Pix p = ((regexp_symbol *) re)->asSets.first();
253 p;
254 ((regexp_symbol *) re)->asSets.next(p)) {
255 SymID sid = (SymID) ((regexp_symbol *) re)->asSets(p);
256 if (symbols.isPeerAS(sid)) {
257 assert(peerAS != RE_INVALID_AS);
258 rd_insert_range(rdr, rd_alloc_range(peerAS, peerAS));
259 if (expand_as_macros)
260 (((regexp_symbol *) re)->asnumbers).add(peerAS, peerAS);
261 } else {
262 sid = symbols.resolvePeerAS(sid, peerAS);
263 const SetOfUInt* as_set = irr->expandASSet(sid);
264
265 if (as_set) {
266 for (Pix pi = as_set->first(); pi; as_set->next(pi)) {
267 ASt as1 = (*as_set)(pi);
268 rd_insert_range(rdr, rd_alloc_range(as1, as1));
269 if (expand_as_macros)
270 (((regexp_symbol *) re)->asnumbers).add(as1, as1);
271 }
272 }
273 }
274 }
275
276 if (expand_as_macros)
277 ((SetOfSymID &) ((regexp_symbol *) re)->asSets).clear();
278
279 if (((regexp_symbol *) re)->complemented)
280 rd_complement_range_list(rdr);
281
282 rd_fm* m;
283 if (RDQ_ISEMPTY(rdr)) {
284 free(rdr);
285 m = rd_empty_set_machine();
286 } else
287 m = rd_singleton(rdr);
288
289 return m;
290 }
291
292 if (typeid(*re) == typeid(regexp_nf)) {
293 assert(((regexp_nf *) re)->rclist.isSingleton());
294 RegexpConjunct *rc= ((regexp_nf *) re)->rclist.head();
295 assert(rc->regexs.isSingleton());
296 regexp_nf::RegexpConjunct::ReInt *ri = rc->regexs.head();
297 return re2nfa(ri->re, peerAS);
298 }
299
300 assert(1);
301 rd_fm* m;
302 return m;
303 }
304
305 //////////////////////////////////////// NF //////////////////////////////
306
become_universal()307 void regexp_nf::become_universal() {
308 rclist.clear();
309
310 if (m)
311 rd_free_dfa(m);
312 m = rd_empty_set_dfa();
313 rd_complement_dfa(m);
314 }
315
become_empty()316 void regexp_nf::become_empty() {
317 rclist.clear();
318
319 if (m)
320 rd_free_dfa(m);
321 m = rd_empty_set_dfa();
322 }
323
regexp_nf(const regexp_nf & s)324 regexp_nf::regexp_nf(const regexp_nf& s) {
325 RegexpConjunct *rc, *rc2;
326
327 for (rc = s.rclist.head(); rc; rc = s.rclist.next(rc)) {
328 rc2 = new RegexpConjunct(*rc);
329 rclist.append(rc2);
330 }
331
332 m = NULL;
333 m = rd_duplicate_dfa(s.m);
334 }
335
RegexpConjunct(const RegexpConjunct & s)336 regexp_nf::RegexpConjunct::RegexpConjunct(const RegexpConjunct &s) {
337 mark = s.mark;
338
339 RegexpConjunct::ReInt *ri, *ri2;
340
341 for (ri = s.regexs.head(); ri; ri = s.regexs.next(ri)) {
342 ri2 = new RegexpConjunct::ReInt(*ri);
343 regexs.append(ri2);
344 }
345 }
346
do_or(regexp_nf & b)347 void regexp_nf::do_or(regexp_nf &b) {
348 if (b.isEmpty() || is_universal())
349 return;
350
351 if (b.is_universal()) {
352 become_universal();
353 return;
354 }
355
356 rd_fm *_m = rd_duplicate_dfa(m);
357 rd_fm *_b_m = rd_duplicate_dfa(b.m);
358
359 rd_init();
360 rd_dton(_m);
361 rd_dton(_b_m);
362
363 _m = rd_alternate(_m, _b_m);
364 _m = rd_ntod(_m);
365 rd_minimize(_m);
366
367 if (rd_equal_dfa(m, _m)) // union is same as us
368 ;
369 else if (rd_equal_dfa(b.m, _m)) { // union is same as b
370 rclist.clear();
371 rclist.splice(b.rclist);
372 b.become_empty();
373 } else { // union is new!
374 rclist.splice(b.rclist);
375 b.become_empty();
376 }
377
378 rd_free_dfa(m); /* works with dfa too */
379 m = _m;
380
381 if (rd_is_universal_dfa(m))
382 rclist.clear();
383 }
384
do_and(regexp_nf & b)385 void regexp_nf::do_and(regexp_nf &b) {
386 if (b.is_universal() || isEmpty())
387 return;
388
389 if (b.isEmpty()) {
390 become_empty();
391 return;
392 }
393
394 rd_fm *m3 = rd_intersect_dfa(m, b.m);
395
396 do_and_terms(b);
397 b.become_empty();
398
399 rd_free_dfa(m); /* works with dfa too */
400 m = m3;
401
402 if (rd_isEmpty_dfa(m))
403 rclist.clear();
404 }
405
do_and_terms(regexp_nf & b)406 void regexp_nf::do_and_terms(regexp_nf &b) {
407 List<RegexpConjunct> tmp;
408 RegexpConjunct *rc1, *rc2, *rc3, *tmp2;
409
410 if (rclist.isEmpty()) {
411 rclist.splice(b.rclist);
412 return;
413 }
414
415 for (rc1 = rclist.head(); rc1; rc1 = rclist.next(rc1))
416 for (rc2 = b.rclist.head(); rc2; rc2 = b.rclist.next(rc2)) {
417 rc3 = new RegexpConjunct;
418 tmp2 = new RegexpConjunct(*rc1);
419 rc3->regexs.splice(tmp2->regexs);
420 delete tmp2;
421 tmp2 = new RegexpConjunct(*rc2);
422 rc3->regexs.splice(tmp2->regexs);
423 delete tmp2;
424 tmp.append(rc3);
425 }
426
427 rclist.clear();
428 b.rclist.clear();
429 rclist.splice(tmp);
430 }
431
do_not()432 void regexp_nf::do_not() {
433 if (is_universal()) {
434 become_empty();
435 return;
436 }
437
438 if (isEmpty()) {
439 become_universal();
440 return;
441 }
442
443 // complement machine
444 rd_complement_dfa(m);
445
446 // complement terms
447 regexp_nf tmp, tmp2;
448 RegexpConjunct *rc1, *rc2;
449 RegexpConjunct::ReInt *ri1, *ri2;
450
451 tmp.become_universal();
452 for (rc1 = rclist.head(); rc1; rc1 = rclist.next(rc1)) {
453 tmp2.become_empty();
454 for (ri1 = rc1->regexs.head(); ri1; ri1 = rc1->regexs.next(ri1)) {
455 ri2 = new RegexpConjunct::ReInt(*ri1);
456 ri2->negated = ~ri2->negated;
457 rc2 = new RegexpConjunct;
458 rc2->regexs.append(ri2);
459 tmp2.rclist.append(rc2);
460 }
461 tmp.do_and_terms(tmp2);
462 }
463
464 tmp2.rclist.clear();
465 rclist.clear();
466 rclist.splice(tmp.rclist);
467 }
468
469 ////////////////////////////// fsa to regexp conversion /////////////////////
470
471 #define state rd_state
472
473 struct int2 {
int2int2474 int2() {}
int2int2475 int2(state* ii, state* jj) : i(ii), j(jj) {}
476 state *i, *j;
477
478 struct less {
operator ()int2::less479 int operator() (const int2& a, const int2& b) const {
480 return a.i < b.i || (a.i == b.i && a.j < b.j);
481 }
482 };
483 };
484
pmap(map<int2,regexp *,int2::less> & fmtore_map)485 void pmap(map<int2, regexp*, int2::less> &fmtore_map) {
486 map<int2, regexp*, int2::less>::iterator pi;
487
488 for (pi = fmtore_map.begin(); pi != fmtore_map.end(); ++pi)
489 cerr << "map " << (*pi).first.i << " " << (*pi).first.j
490 << " " << *(*pi).second << "\n";
491
492 }
493
construct() const494 regexp* regexp_nf::construct() const {
495 map<int2, regexp*, int2::less> fmtore_map;
496 map<int2, regexp*, int2::less>::iterator pi, qi, si;
497
498 regexp *prefix, *suffix, *middle;
499 rd_state *stt;
500 rd_arc *arc;
501 regexp_symbol *r_sym;
502
503 // initialize fmtore_map from m
504 RDQ_LIST_START(&(m->rf_states), m, stt, rd_state) {
505 RDQ_LIST_START(&(stt->rs_arcs), stt, arc, rd_arc) {
506 int2 i2(stt, arc->ra_to);
507
508 pi = fmtore_map.find(i2);
509 if (pi == fmtore_map.end())
510 fmtore_map[i2] = r_sym = new regexp_symbol;
511 else
512 r_sym = (regexp_symbol *) (*pi).second;
513 r_sym->add(arc->ra_low, arc->ra_high);
514
515 } RDQ_LIST_END(&(stt->rs_arcs), stt, arc, rd_arc);
516 } RDQ_LIST_END(&(m->states), m, stt, rd_state);
517
518 // make two states;
519 state start;
520 state final;
521
522 fmtore_map[int2(&start, m->rf_start)] = new regexp_empty_str;
523
524 RDQ_LIST_START(&(m->rf_final), m, stt, rd_state) {
525 fmtore_map[int2(stt, &final)] = new regexp_empty_str;
526 } RDQ_LIST_END(&(m->rf_final), m, stt, rd_state);
527
528 RDQ_LIST_START(&(m->rf_states), m, stt, rd_state) {// eliminate each state
529 // pmap(fmtore_map);
530 // make self looping middle
531 if ((pi = fmtore_map.find(int2(stt, stt))) != fmtore_map.end()) {
532 middle = (*pi).second;
533 middle = buildStar(middle);
534 fmtore_map.erase(pi);
535 } else
536 middle = new regexp_empty_str;
537
538 // cerr << "Eliminating " << next[i].value() << " middle " << *middle << "\n";
539
540 for (pi = fmtore_map.begin(); pi != fmtore_map.end(); ) {
541 if ((*pi).first.j == stt) {
542 prefix = buildCat((*pi).second, middle->dup());
543 // cerr << "arc from " << (*pi).first.i << " q " << *q << "\n";
544 for (qi = fmtore_map.lower_bound(int2(stt, 0));
545 qi != fmtore_map.end() && (*qi).first.i == stt;
546 ++qi) {
547 suffix = buildCat(prefix->dup(), (*qi).second->dup());
548 // cerr << " into " << (*qi).first.j << " p " << *p << "\n";
549 if ((si = fmtore_map.find(int2((*pi).first.i, (*qi).first.j))) != fmtore_map.end())
550 (*si).second = buildOr((*si).second, suffix);
551 else
552 fmtore_map[int2((*pi).first.i, (*qi).first.j)] = suffix;
553 }
554 delete prefix;
555 si = pi;
556 ++pi;
557 fmtore_map.erase(si);
558 } else
559 ++pi;
560 }
561 for (qi = fmtore_map.lower_bound(int2(stt, 0));
562 qi != fmtore_map.end() && (*qi).first.i == stt;
563 ) {
564 delete (*qi).second;
565 si = qi;
566 ++qi;
567 fmtore_map.erase(si);
568 }
569
570 delete middle;
571 } RDQ_LIST_END(&(m->states), m, stt, rd_state);
572
573 // check for empty string
574 if (RD_ACCEPTS_EMPTY_STRING(m))
575 return buildQuestion(fmtore_map[int2(&start, &final)]);
576
577 return fmtore_map[int2(&start, &final)];
578 }
579
buildCat(regexp * l,regexp * r) const580 regexp* regexp_nf::buildCat(regexp *l, regexp *r) const {
581 assert(l);
582 assert(r);
583
584 if (typeid(*l) == typeid(regexp_empty_set)) {
585 delete r;
586 return l;
587 }
588
589 if (typeid(*r) == typeid(regexp_empty_set)) {
590 delete l;
591 return r;
592 }
593
594 if (typeid(*l) == typeid(regexp_empty_str)) {
595 delete l;
596 return r;
597 }
598
599 if (typeid(*r) == typeid(regexp_empty_str)) {
600 delete r;
601 return l;
602 }
603
604 regexp_cat* re = new regexp_cat(l, r);
605
606 return re;
607 }
608
buildOr(regexp * l,regexp * r) const609 regexp* regexp_nf::buildOr(regexp *l, regexp *r) const {
610 assert(l);
611 assert(r);
612
613 if (typeid(*l) == typeid(regexp_empty_set)) {
614 delete l;
615 return r;
616 }
617
618 if (typeid(*r) == typeid(regexp_empty_set)) {
619 delete r;
620 return l;
621 }
622
623 if (typeid(*l) == typeid(regexp_empty_str)) {
624 delete l;
625 return buildQuestion(r);
626 }
627
628 if (typeid(*r) == typeid(regexp_empty_str)) {
629 delete r;
630 return buildQuestion(l);
631 }
632
633 if (*l == *r) {
634 delete r;
635 return l;
636 }
637
638 regexp_or* re = new regexp_or(l, r);
639 return re;
640 }
641
buildStar(regexp * l) const642 regexp* regexp_nf::buildStar(regexp *l) const {
643 assert(l);
644
645 if (typeid(*l) == typeid(regexp_empty_set))
646 return l;
647
648 if (typeid(*l) == typeid(regexp_empty_str))
649 return l;
650
651 if (typeid(*l) == typeid(regexp_star))
652 return l;
653
654 regexp_star* re;
655
656 if (typeid(*l) == typeid(regexp_question)) {
657 re = new regexp_star(((regexp_question *) l)->left);
658 ((regexp_question *) l)->left = NULL;
659 delete l;
660 } else
661 re = new regexp_star(l);
662
663 return re;
664 }
665
buildQuestion(regexp * l) const666 regexp* regexp_nf::buildQuestion(regexp *l) const {
667 assert(l);
668
669 if (typeid(*l) == typeid(regexp_empty_set))
670 return l;
671
672 if (typeid(*l) == typeid(regexp_empty_str))
673 return l;
674
675 if (typeid(*l) == typeid(regexp_star))
676 return l;
677
678 if (typeid(*l) == typeid(regexp_question))
679 return l;
680
681 regexp_question* re = new regexp_question(l);
682 return re;
683 }
684