1------------------------------------------------------------------------------ 2-- -- 3-- GNAT COMPILER COMPONENTS -- 4-- -- 5-- S Y S T E M . R E G E X P -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1999-2012, AdaCore -- 10-- -- 11-- GNAT is free software; you can redistribute it and/or modify it under -- 12-- terms of the GNU General Public License as published by the Free Soft- -- 13-- ware Foundation; either version 3, or (at your option) any later ver- -- 14-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 15-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 16-- or FITNESS FOR A PARTICULAR PURPOSE. -- 17-- -- 18-- As a special exception under Section 7 of GPL version 3, you are granted -- 19-- additional permissions described in the GCC Runtime Library Exception, -- 20-- version 3.1, as published by the Free Software Foundation. -- 21-- -- 22-- You should have received a copy of the GNU General Public License and -- 23-- a copy of the GCC Runtime Library Exception along with this program; -- 24-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 25-- <http://www.gnu.org/licenses/>. -- 26-- -- 27-- GNAT was originally developed by the GNAT team at New York University. -- 28-- Extensive contributions were provided by Ada Core Technologies Inc. -- 29-- -- 30------------------------------------------------------------------------------ 31 32with Ada.Unchecked_Deallocation; 33 34with System.Case_Util; 35 36package body System.Regexp is 37 38 Open_Paren : constant Character := '('; 39 Close_Paren : constant Character := ')'; 40 Open_Bracket : constant Character := '['; 41 Close_Bracket : constant Character := ']'; 42 43 type State_Index is new Natural; 44 type Column_Index is new Natural; 45 46 type Regexp_Array is array 47 (State_Index range <>, Column_Index range <>) of State_Index; 48 -- First index is for the state number 49 -- Second index is for the character type 50 -- Contents is the new State 51 52 type Regexp_Array_Access is access Regexp_Array; 53 -- Use this type through the functions Set below, so that it 54 -- can grow dynamically depending on the needs. 55 56 type Mapping is array (Character'Range) of Column_Index; 57 -- Mapping between characters and column in the Regexp_Array 58 59 type Boolean_Array is array (State_Index range <>) of Boolean; 60 61 type Regexp_Value 62 (Alphabet_Size : Column_Index; 63 Num_States : State_Index) is 64 record 65 Map : Mapping; 66 States : Regexp_Array (1 .. Num_States, 0 .. Alphabet_Size); 67 Is_Final : Boolean_Array (1 .. Num_States); 68 Case_Sensitive : Boolean; 69 end record; 70 -- Deterministic finite-state machine 71 72 ----------------------- 73 -- Local Subprograms -- 74 ----------------------- 75 76 procedure Set 77 (Table : in out Regexp_Array_Access; 78 State : State_Index; 79 Column : Column_Index; 80 Value : State_Index); 81 -- Sets a value in the table. If the table is too small, reallocate it 82 -- dynamically so that (State, Column) is a valid index in it. 83 84 function Get 85 (Table : Regexp_Array_Access; 86 State : State_Index; 87 Column : Column_Index) 88 return State_Index; 89 -- Returns the value in the table at (State, Column). 90 -- If this index does not exist in the table, returns 0 91 92 procedure Free is new Ada.Unchecked_Deallocation 93 (Regexp_Array, Regexp_Array_Access); 94 95 ------------ 96 -- Adjust -- 97 ------------ 98 99 procedure Adjust (R : in out Regexp) is 100 Tmp : Regexp_Access; 101 102 begin 103 if R.R /= null then 104 Tmp := new Regexp_Value (Alphabet_Size => R.R.Alphabet_Size, 105 Num_States => R.R.Num_States); 106 Tmp.all := R.R.all; 107 R.R := Tmp; 108 end if; 109 end Adjust; 110 111 ------------- 112 -- Compile -- 113 ------------- 114 115 function Compile 116 (Pattern : String; 117 Glob : Boolean := False; 118 Case_Sensitive : Boolean := True) 119 return Regexp 120 is 121 S : String := Pattern; 122 -- The pattern which is really compiled (when the pattern is case 123 -- insensitive, we convert this string to lower-cases 124 125 Map : Mapping := (others => 0); 126 -- Mapping between characters and columns in the tables 127 128 Alphabet_Size : Column_Index := 0; 129 -- Number of significant characters in the regular expression. 130 -- This total does not include special operators, such as *, (, ... 131 132 procedure Check_Well_Formed_Pattern; 133 -- Check that the pattern to compile is well-formed, so that subsequent 134 -- code can rely on this without performing each time the checks to 135 -- avoid accessing the pattern outside its bounds. However, not all 136 -- well-formedness rules are checked. In particular, rules about special 137 -- characters not being treated as regular characters are not checked. 138 139 procedure Create_Mapping; 140 -- Creates a mapping between characters in the regexp and columns 141 -- in the tables representing the regexp. Test that the regexp is 142 -- well-formed Modifies Alphabet_Size and Map 143 144 procedure Create_Primary_Table 145 (Table : out Regexp_Array_Access; 146 Num_States : out State_Index; 147 Start_State : out State_Index; 148 End_State : out State_Index); 149 -- Creates the first version of the regexp (this is a non deterministic 150 -- finite state machine, which is unadapted for a fast pattern 151 -- matching algorithm). We use a recursive algorithm to process the 152 -- parenthesis sub-expressions. 153 -- 154 -- Table : at the end of the procedure : Column 0 is for any character 155 -- ('.') and the last columns are for no character (closure) 156 -- Num_States is set to the number of states in the table 157 -- Start_State is the number of the starting state in the regexp 158 -- End_State is the number of the final state when the regexp matches 159 160 procedure Create_Primary_Table_Glob 161 (Table : out Regexp_Array_Access; 162 Num_States : out State_Index; 163 Start_State : out State_Index; 164 End_State : out State_Index); 165 -- Same function as above, but it deals with the second possible 166 -- grammar for 'globbing pattern', which is a kind of subset of the 167 -- whole regular expression grammar. 168 169 function Create_Secondary_Table 170 (First_Table : Regexp_Array_Access; 171 Num_States : State_Index; 172 Start_State : State_Index; 173 End_State : State_Index) 174 return Regexp; 175 -- Creates the definitive table representing the regular expression 176 -- This is actually a transformation of the primary table First_Table, 177 -- where every state is grouped with the states in its 'no-character' 178 -- columns. The transitions between the new states are then recalculated 179 -- and if necessary some new states are created. 180 -- 181 -- Note that the resulting finite-state machine is not optimized in 182 -- terms of the number of states : it would be more time-consuming to 183 -- add a third pass to reduce the number of states in the machine, with 184 -- no speed improvement... 185 186 procedure Raise_Exception (M : String; Index : Integer); 187 pragma No_Return (Raise_Exception); 188 -- Raise an exception, indicating an error at character Index in S 189 190 ------------------------------- 191 -- Check_Well_Formed_Pattern -- 192 ------------------------------- 193 194 procedure Check_Well_Formed_Pattern is 195 J : Integer; 196 197 Past_Elmt : Boolean := False; 198 -- Set to True everywhere an elmt has been parsed, if Glob=False, 199 -- meaning there can be now an occurrence of '*', '+' and '?'. 200 201 Past_Term : Boolean := False; 202 -- Set to True everywhere a term has been parsed, if Glob=False, 203 -- meaning there can be now an occurrence of '|'. 204 205 Parenthesis_Level : Integer := 0; 206 Curly_Level : Integer := 0; 207 208 Last_Open : Integer := S'First - 1; 209 -- The last occurrence of an opening parenthesis, if Glob=False, 210 -- or the last occurrence of an opening curly brace, if Glob=True. 211 212 procedure Raise_Exception_If_No_More_Chars (K : Integer := 0); 213 -- If no more characters are raised, call Raise_Exception 214 215 -------------------------------------- 216 -- Raise_Exception_If_No_More_Chars -- 217 -------------------------------------- 218 219 procedure Raise_Exception_If_No_More_Chars (K : Integer := 0) is 220 begin 221 if J + K > S'Last then 222 Raise_Exception ("Ill-formed pattern while parsing", J); 223 end if; 224 end Raise_Exception_If_No_More_Chars; 225 226 -- Start of processing for Check_Well_Formed_Pattern 227 228 begin 229 J := S'First; 230 while J <= S'Last loop 231 case S (J) is 232 when Open_Bracket => 233 J := J + 1; 234 Raise_Exception_If_No_More_Chars; 235 236 if not Glob then 237 if S (J) = '^' then 238 J := J + 1; 239 Raise_Exception_If_No_More_Chars; 240 end if; 241 end if; 242 243 -- The first character never has a special meaning 244 245 if S (J) = ']' or else S (J) = '-' then 246 J := J + 1; 247 Raise_Exception_If_No_More_Chars; 248 end if; 249 250 -- The set of characters cannot be empty 251 252 if S (J) = ']' then 253 Raise_Exception 254 ("Set of characters cannot be empty in regular " 255 & "expression", J); 256 end if; 257 258 declare 259 Possible_Range_Start : Boolean := True; 260 -- Set True everywhere a range character '-' can occur 261 262 begin 263 loop 264 exit when S (J) = Close_Bracket; 265 266 -- The current character should be followed by a 267 -- closing bracket. 268 269 Raise_Exception_If_No_More_Chars (1); 270 271 if S (J) = '-' 272 and then S (J + 1) /= Close_Bracket 273 then 274 if not Possible_Range_Start then 275 Raise_Exception 276 ("No mix of ranges is allowed in " 277 & "regular expression", J); 278 end if; 279 280 J := J + 1; 281 Raise_Exception_If_No_More_Chars; 282 283 -- Range cannot be followed by '-' character, 284 -- except as last character in the set. 285 286 Possible_Range_Start := False; 287 288 else 289 Possible_Range_Start := True; 290 end if; 291 292 if S (J) = '\' then 293 J := J + 1; 294 Raise_Exception_If_No_More_Chars; 295 end if; 296 297 J := J + 1; 298 end loop; 299 end; 300 301 -- A closing bracket can end an elmt or term 302 303 Past_Elmt := True; 304 Past_Term := True; 305 306 when Close_Bracket => 307 308 -- A close bracket must follow a open_bracket, and cannot be 309 -- found alone on the line. 310 311 Raise_Exception 312 ("Incorrect character ']' in regular expression", J); 313 314 when '\' => 315 if J < S'Last then 316 J := J + 1; 317 318 -- Any character can be an elmt or a term 319 320 Past_Elmt := True; 321 Past_Term := True; 322 323 else 324 -- \ not allowed at the end of the regexp 325 326 Raise_Exception 327 ("Incorrect character '\' in regular expression", J); 328 end if; 329 330 when Open_Paren => 331 if not Glob then 332 Parenthesis_Level := Parenthesis_Level + 1; 333 Last_Open := J; 334 335 -- An open parenthesis does not end an elmt or term 336 337 Past_Elmt := False; 338 Past_Term := False; 339 end if; 340 341 when Close_Paren => 342 if not Glob then 343 Parenthesis_Level := Parenthesis_Level - 1; 344 345 if Parenthesis_Level < 0 then 346 Raise_Exception 347 ("')' is not associated with '(' in regular " 348 & "expression", J); 349 end if; 350 351 if J = Last_Open + 1 then 352 Raise_Exception 353 ("Empty parentheses not allowed in regular " 354 & "expression", J); 355 end if; 356 357 if not Past_Term then 358 Raise_Exception 359 ("Closing parenthesis not allowed here in regular " 360 & "expression", J); 361 end if; 362 363 -- A closing parenthesis can end an elmt or term 364 365 Past_Elmt := True; 366 Past_Term := True; 367 end if; 368 369 when '{' => 370 if Glob then 371 Curly_Level := Curly_Level + 1; 372 Last_Open := J; 373 374 else 375 -- Any character can be an elmt or a term 376 377 Past_Elmt := True; 378 Past_Term := True; 379 end if; 380 381 -- No need to check for ',' as the code always accepts them 382 383 when '}' => 384 if Glob then 385 Curly_Level := Curly_Level - 1; 386 387 if Curly_Level < 0 then 388 Raise_Exception 389 ("'}' is not associated with '{' in regular " 390 & "expression", J); 391 end if; 392 393 if J = Last_Open + 1 then 394 Raise_Exception 395 ("Empty curly braces not allowed in regular " 396 & "expression", J); 397 end if; 398 399 else 400 -- Any character can be an elmt or a term 401 402 Past_Elmt := True; 403 Past_Term := True; 404 end if; 405 406 when '*' | '?' | '+' => 407 if not Glob then 408 409 -- These operators must apply to an elmt sub-expression, 410 -- and cannot be found if one has not just been parsed. 411 412 if not Past_Elmt then 413 Raise_Exception 414 ("'*', '+' and '?' operators must be " 415 & "applied to an element in regular expression", J); 416 end if; 417 418 Past_Elmt := False; 419 Past_Term := True; 420 end if; 421 422 when '|' => 423 if not Glob then 424 425 -- This operator must apply to a term sub-expression, 426 -- and cannot be found if one has not just been parsed. 427 428 if not Past_Term then 429 Raise_Exception 430 ("'|' operator must be " 431 & "applied to a term in regular expression", J); 432 end if; 433 434 Past_Elmt := False; 435 Past_Term := False; 436 end if; 437 438 when others => 439 if not Glob then 440 441 -- Any character can be an elmt or a term 442 443 Past_Elmt := True; 444 Past_Term := True; 445 end if; 446 end case; 447 448 J := J + 1; 449 end loop; 450 451 -- A closing parenthesis must follow an open parenthesis 452 453 if Parenthesis_Level /= 0 then 454 Raise_Exception 455 ("'(' must always be associated with a ')'", J); 456 end if; 457 458 -- A closing curly brace must follow an open curly brace 459 460 if Curly_Level /= 0 then 461 Raise_Exception 462 ("'{' must always be associated with a '}'", J); 463 end if; 464 end Check_Well_Formed_Pattern; 465 466 -------------------- 467 -- Create_Mapping -- 468 -------------------- 469 470 procedure Create_Mapping is 471 472 procedure Add_In_Map (C : Character); 473 -- Add a character in the mapping, if it is not already defined 474 475 ---------------- 476 -- Add_In_Map -- 477 ---------------- 478 479 procedure Add_In_Map (C : Character) is 480 begin 481 if Map (C) = 0 then 482 Alphabet_Size := Alphabet_Size + 1; 483 Map (C) := Alphabet_Size; 484 end if; 485 end Add_In_Map; 486 487 J : Integer := S'First; 488 Parenthesis_Level : Integer := 0; 489 Curly_Level : Integer := 0; 490 Last_Open : Integer := S'First - 1; 491 492 -- Start of processing for Create_Mapping 493 494 begin 495 while J <= S'Last loop 496 case S (J) is 497 when Open_Bracket => 498 J := J + 1; 499 500 if S (J) = '^' then 501 J := J + 1; 502 end if; 503 504 if S (J) = ']' or else S (J) = '-' then 505 J := J + 1; 506 end if; 507 508 -- The first character never has a special meaning 509 510 loop 511 if J > S'Last then 512 Raise_Exception 513 ("Ran out of characters while parsing ", J); 514 end if; 515 516 exit when S (J) = Close_Bracket; 517 518 if S (J) = '-' 519 and then S (J + 1) /= Close_Bracket 520 then 521 declare 522 Start : constant Integer := J - 1; 523 524 begin 525 J := J + 1; 526 527 if S (J) = '\' then 528 J := J + 1; 529 end if; 530 531 for Char in S (Start) .. S (J) loop 532 Add_In_Map (Char); 533 end loop; 534 end; 535 else 536 if S (J) = '\' then 537 J := J + 1; 538 end if; 539 540 Add_In_Map (S (J)); 541 end if; 542 543 J := J + 1; 544 end loop; 545 546 -- A close bracket must follow a open_bracket, 547 -- and cannot be found alone on the line 548 549 when Close_Bracket => 550 Raise_Exception 551 ("Incorrect character ']' in regular expression", J); 552 553 when '\' => 554 if J < S'Last then 555 J := J + 1; 556 Add_In_Map (S (J)); 557 558 else 559 -- \ not allowed at the end of the regexp 560 561 Raise_Exception 562 ("Incorrect character '\' in regular expression", J); 563 end if; 564 565 when Open_Paren => 566 if not Glob then 567 Parenthesis_Level := Parenthesis_Level + 1; 568 Last_Open := J; 569 else 570 Add_In_Map (Open_Paren); 571 end if; 572 573 when Close_Paren => 574 if not Glob then 575 Parenthesis_Level := Parenthesis_Level - 1; 576 577 if Parenthesis_Level < 0 then 578 Raise_Exception 579 ("')' is not associated with '(' in regular " 580 & "expression", J); 581 end if; 582 583 if J = Last_Open + 1 then 584 Raise_Exception 585 ("Empty parenthesis not allowed in regular " 586 & "expression", J); 587 end if; 588 589 else 590 Add_In_Map (Close_Paren); 591 end if; 592 593 when '.' => 594 if Glob then 595 Add_In_Map ('.'); 596 end if; 597 598 when '{' => 599 if not Glob then 600 Add_In_Map (S (J)); 601 else 602 Curly_Level := Curly_Level + 1; 603 end if; 604 605 when '}' => 606 if not Glob then 607 Add_In_Map (S (J)); 608 else 609 Curly_Level := Curly_Level - 1; 610 end if; 611 612 when '*' | '?' => 613 if not Glob then 614 if J = S'First then 615 Raise_Exception 616 ("'*', '+', '?' and '|' operators cannot be in " 617 & "first position in regular expression", J); 618 end if; 619 end if; 620 621 when '|' | '+' => 622 if not Glob then 623 if J = S'First then 624 625 -- These operators must apply to a sub-expression, 626 -- and cannot be found at the beginning of the line 627 628 Raise_Exception 629 ("'*', '+', '?' and '|' operators cannot be in " 630 & "first position in regular expression", J); 631 end if; 632 633 else 634 Add_In_Map (S (J)); 635 end if; 636 637 when others => 638 Add_In_Map (S (J)); 639 end case; 640 641 J := J + 1; 642 end loop; 643 644 -- A closing parenthesis must follow an open parenthesis 645 646 if Parenthesis_Level /= 0 then 647 Raise_Exception 648 ("'(' must always be associated with a ')'", J); 649 end if; 650 651 if Curly_Level /= 0 then 652 Raise_Exception 653 ("'{' must always be associated with a '}'", J); 654 end if; 655 end Create_Mapping; 656 657 -------------------------- 658 -- Create_Primary_Table -- 659 -------------------------- 660 661 procedure Create_Primary_Table 662 (Table : out Regexp_Array_Access; 663 Num_States : out State_Index; 664 Start_State : out State_Index; 665 End_State : out State_Index) 666 is 667 Empty_Char : constant Column_Index := Alphabet_Size + 1; 668 669 Current_State : State_Index := 0; 670 -- Index of the last created state 671 672 procedure Add_Empty_Char 673 (State : State_Index; 674 To_State : State_Index); 675 -- Add a empty-character transition from State to To_State 676 677 procedure Create_Repetition 678 (Repetition : Character; 679 Start_Prev : State_Index; 680 End_Prev : State_Index; 681 New_Start : out State_Index; 682 New_End : in out State_Index); 683 -- Create the table in case we have a '*', '+' or '?'. 684 -- Start_Prev .. End_Prev should indicate respectively the start and 685 -- end index of the previous expression, to which '*', '+' or '?' is 686 -- applied. 687 688 procedure Create_Simple 689 (Start_Index : Integer; 690 End_Index : Integer; 691 Start_State : out State_Index; 692 End_State : out State_Index); 693 -- Fill the table for the regexp Simple. 694 -- This is the recursive procedure called to handle () expressions 695 -- If End_State = 0, then the call to Create_Simple creates an 696 -- independent regexp, not a concatenation 697 -- Start_Index .. End_Index is the starting index in the string S. 698 -- 699 -- Warning: it may look like we are creating too many empty-string 700 -- transitions, but they are needed to get the correct regexp. 701 -- The table is filled as follow ( s means start-state, e means 702 -- end-state) : 703 -- 704 -- regexp state_num | a b * empty_string 705 -- ------- ------------------------------ 706 -- a 1 (s) | 2 - - - 707 -- 2 (e) | - - - - 708 -- 709 -- ab 1 (s) | 2 - - - 710 -- 2 | - - - 3 711 -- 3 | - 4 - - 712 -- 4 (e) | - - - - 713 -- 714 -- a|b 1 | 2 - - - 715 -- 2 | - - - 6 716 -- 3 | - 4 - - 717 -- 4 | - - - 6 718 -- 5 (s) | - - - 1,3 719 -- 6 (e) | - - - - 720 -- 721 -- a* 1 | 2 - - - 722 -- 2 | - - - 4 723 -- 3 (s) | - - - 1,4 724 -- 4 (e) | - - - 3 725 -- 726 -- (a) 1 (s) | 2 - - - 727 -- 2 (e) | - - - - 728 -- 729 -- a+ 1 | 2 - - - 730 -- 2 | - - - 4 731 -- 3 (s) | - - - 1 732 -- 4 (e) | - - - 3 733 -- 734 -- a? 1 | 2 - - - 735 -- 2 | - - - 4 736 -- 3 (s) | - - - 1,4 737 -- 4 (e) | - - - - 738 -- 739 -- . 1 (s) | 2 2 2 - 740 -- 2 (e) | - - - - 741 742 function Next_Sub_Expression 743 (Start_Index : Integer; 744 End_Index : Integer) 745 return Integer; 746 -- Returns the index of the last character of the next sub-expression 747 -- in Simple. Index cannot be greater than End_Index. 748 749 -------------------- 750 -- Add_Empty_Char -- 751 -------------------- 752 753 procedure Add_Empty_Char 754 (State : State_Index; 755 To_State : State_Index) 756 is 757 J : Column_Index := Empty_Char; 758 759 begin 760 while Get (Table, State, J) /= 0 loop 761 J := J + 1; 762 end loop; 763 764 Set (Table, State, J, To_State); 765 end Add_Empty_Char; 766 767 ----------------------- 768 -- Create_Repetition -- 769 ----------------------- 770 771 procedure Create_Repetition 772 (Repetition : Character; 773 Start_Prev : State_Index; 774 End_Prev : State_Index; 775 New_Start : out State_Index; 776 New_End : in out State_Index) 777 is 778 begin 779 New_Start := Current_State + 1; 780 781 if New_End /= 0 then 782 Add_Empty_Char (New_End, New_Start); 783 end if; 784 785 Current_State := Current_State + 2; 786 New_End := Current_State; 787 788 Add_Empty_Char (End_Prev, New_End); 789 Add_Empty_Char (New_Start, Start_Prev); 790 791 if Repetition /= '+' then 792 Add_Empty_Char (New_Start, New_End); 793 end if; 794 795 if Repetition /= '?' then 796 Add_Empty_Char (New_End, New_Start); 797 end if; 798 end Create_Repetition; 799 800 ------------------- 801 -- Create_Simple -- 802 ------------------- 803 804 procedure Create_Simple 805 (Start_Index : Integer; 806 End_Index : Integer; 807 Start_State : out State_Index; 808 End_State : out State_Index) 809 is 810 J : Integer := Start_Index; 811 Last_Start : State_Index := 0; 812 813 begin 814 Start_State := 0; 815 End_State := 0; 816 while J <= End_Index loop 817 case S (J) is 818 when Open_Paren => 819 declare 820 J_Start : constant Integer := J + 1; 821 Next_Start : State_Index; 822 Next_End : State_Index; 823 824 begin 825 J := Next_Sub_Expression (J, End_Index); 826 Create_Simple (J_Start, J - 1, Next_Start, Next_End); 827 828 if J < End_Index 829 and then (S (J + 1) = '*' or else 830 S (J + 1) = '+' or else 831 S (J + 1) = '?') 832 then 833 J := J + 1; 834 Create_Repetition 835 (S (J), 836 Next_Start, 837 Next_End, 838 Last_Start, 839 End_State); 840 841 else 842 Last_Start := Next_Start; 843 844 if End_State /= 0 then 845 Add_Empty_Char (End_State, Last_Start); 846 end if; 847 848 End_State := Next_End; 849 end if; 850 end; 851 852 when '|' => 853 declare 854 Start_Prev : constant State_Index := Start_State; 855 End_Prev : constant State_Index := End_State; 856 Start_J : constant Integer := J + 1; 857 Start_Next : State_Index := 0; 858 End_Next : State_Index := 0; 859 860 begin 861 J := Next_Sub_Expression (J, End_Index); 862 863 -- Create a new state for the start of the alternative 864 865 Current_State := Current_State + 1; 866 Last_Start := Current_State; 867 Start_State := Last_Start; 868 869 -- Create the tree for the second part of alternative 870 871 Create_Simple (Start_J, J, Start_Next, End_Next); 872 873 -- Create the end state 874 875 Add_Empty_Char (Last_Start, Start_Next); 876 Add_Empty_Char (Last_Start, Start_Prev); 877 Current_State := Current_State + 1; 878 End_State := Current_State; 879 Add_Empty_Char (End_Prev, End_State); 880 Add_Empty_Char (End_Next, End_State); 881 end; 882 883 when Open_Bracket => 884 Current_State := Current_State + 1; 885 886 declare 887 Next_State : State_Index := Current_State + 1; 888 889 begin 890 J := J + 1; 891 892 if S (J) = '^' then 893 J := J + 1; 894 895 Next_State := 0; 896 897 for Column in 0 .. Alphabet_Size loop 898 Set (Table, Current_State, Column, 899 Value => Current_State + 1); 900 end loop; 901 end if; 902 903 -- Automatically add the first character 904 905 if S (J) = '-' or else S (J) = ']' then 906 Set (Table, Current_State, Map (S (J)), 907 Value => Next_State); 908 J := J + 1; 909 end if; 910 911 -- Loop till closing bracket found 912 913 loop 914 exit when S (J) = Close_Bracket; 915 916 if S (J) = '-' 917 and then S (J + 1) /= ']' 918 then 919 declare 920 Start : constant Integer := J - 1; 921 922 begin 923 J := J + 1; 924 925 if S (J) = '\' then 926 J := J + 1; 927 end if; 928 929 for Char in S (Start) .. S (J) loop 930 Set (Table, Current_State, Map (Char), 931 Value => Next_State); 932 end loop; 933 end; 934 935 else 936 if S (J) = '\' then 937 J := J + 1; 938 end if; 939 940 Set (Table, Current_State, Map (S (J)), 941 Value => Next_State); 942 end if; 943 J := J + 1; 944 end loop; 945 end; 946 947 Current_State := Current_State + 1; 948 949 -- If the next symbol is a special symbol 950 951 if J < End_Index 952 and then (S (J + 1) = '*' or else 953 S (J + 1) = '+' or else 954 S (J + 1) = '?') 955 then 956 J := J + 1; 957 Create_Repetition 958 (S (J), 959 Current_State - 1, 960 Current_State, 961 Last_Start, 962 End_State); 963 964 else 965 Last_Start := Current_State - 1; 966 967 if End_State /= 0 then 968 Add_Empty_Char (End_State, Last_Start); 969 end if; 970 971 End_State := Current_State; 972 end if; 973 974 when '*' | '+' | '?' | Close_Paren | Close_Bracket => 975 Raise_Exception 976 ("Incorrect character in regular expression :", J); 977 978 when others => 979 Current_State := Current_State + 1; 980 981 -- Create the state for the symbol S (J) 982 983 if S (J) = '.' then 984 for K in 0 .. Alphabet_Size loop 985 Set (Table, Current_State, K, 986 Value => Current_State + 1); 987 end loop; 988 989 else 990 if S (J) = '\' then 991 J := J + 1; 992 end if; 993 994 Set (Table, Current_State, Map (S (J)), 995 Value => Current_State + 1); 996 end if; 997 998 Current_State := Current_State + 1; 999 1000 -- If the next symbol is a special symbol 1001 1002 if J < End_Index 1003 and then (S (J + 1) = '*' or else 1004 S (J + 1) = '+' or else 1005 S (J + 1) = '?') 1006 then 1007 J := J + 1; 1008 Create_Repetition 1009 (S (J), 1010 Current_State - 1, 1011 Current_State, 1012 Last_Start, 1013 End_State); 1014 1015 else 1016 Last_Start := Current_State - 1; 1017 1018 if End_State /= 0 then 1019 Add_Empty_Char (End_State, Last_Start); 1020 end if; 1021 1022 End_State := Current_State; 1023 end if; 1024 1025 end case; 1026 1027 if Start_State = 0 then 1028 Start_State := Last_Start; 1029 end if; 1030 1031 J := J + 1; 1032 end loop; 1033 end Create_Simple; 1034 1035 ------------------------- 1036 -- Next_Sub_Expression -- 1037 ------------------------- 1038 1039 function Next_Sub_Expression 1040 (Start_Index : Integer; 1041 End_Index : Integer) 1042 return Integer 1043 is 1044 J : Integer := Start_Index; 1045 Start_On_Alter : Boolean := False; 1046 1047 begin 1048 if S (J) = '|' then 1049 Start_On_Alter := True; 1050 end if; 1051 1052 loop 1053 exit when J = End_Index; 1054 J := J + 1; 1055 1056 case S (J) is 1057 when '\' => 1058 J := J + 1; 1059 1060 when Open_Bracket => 1061 loop 1062 J := J + 1; 1063 exit when S (J) = Close_Bracket; 1064 1065 if S (J) = '\' then 1066 J := J + 1; 1067 end if; 1068 end loop; 1069 1070 when Open_Paren => 1071 J := Next_Sub_Expression (J, End_Index); 1072 1073 when Close_Paren => 1074 return J; 1075 1076 when '|' => 1077 if Start_On_Alter then 1078 return J - 1; 1079 end if; 1080 1081 when others => 1082 null; 1083 end case; 1084 end loop; 1085 1086 return J; 1087 end Next_Sub_Expression; 1088 1089 -- Start of Create_Primary_Table 1090 1091 begin 1092 Table.all := (others => (others => 0)); 1093 Create_Simple (S'First, S'Last, Start_State, End_State); 1094 Num_States := Current_State; 1095 end Create_Primary_Table; 1096 1097 ------------------------------- 1098 -- Create_Primary_Table_Glob -- 1099 ------------------------------- 1100 1101 procedure Create_Primary_Table_Glob 1102 (Table : out Regexp_Array_Access; 1103 Num_States : out State_Index; 1104 Start_State : out State_Index; 1105 End_State : out State_Index) 1106 is 1107 Empty_Char : constant Column_Index := Alphabet_Size + 1; 1108 1109 Current_State : State_Index := 0; 1110 -- Index of the last created state 1111 1112 procedure Add_Empty_Char 1113 (State : State_Index; 1114 To_State : State_Index); 1115 -- Add a empty-character transition from State to To_State 1116 1117 procedure Create_Simple 1118 (Start_Index : Integer; 1119 End_Index : Integer; 1120 Start_State : out State_Index; 1121 End_State : out State_Index); 1122 -- Fill the table for the S (Start_Index .. End_Index). 1123 -- This is the recursive procedure called to handle () expressions 1124 1125 -------------------- 1126 -- Add_Empty_Char -- 1127 -------------------- 1128 1129 procedure Add_Empty_Char 1130 (State : State_Index; 1131 To_State : State_Index) 1132 is 1133 J : Column_Index := Empty_Char; 1134 1135 begin 1136 while Get (Table, State, J) /= 0 loop 1137 J := J + 1; 1138 end loop; 1139 1140 Set (Table, State, J, 1141 Value => To_State); 1142 end Add_Empty_Char; 1143 1144 ------------------- 1145 -- Create_Simple -- 1146 ------------------- 1147 1148 procedure Create_Simple 1149 (Start_Index : Integer; 1150 End_Index : Integer; 1151 Start_State : out State_Index; 1152 End_State : out State_Index) 1153 is 1154 J : Integer := Start_Index; 1155 Last_Start : State_Index := 0; 1156 1157 begin 1158 Start_State := 0; 1159 End_State := 0; 1160 1161 while J <= End_Index loop 1162 case S (J) is 1163 1164 when Open_Bracket => 1165 Current_State := Current_State + 1; 1166 1167 declare 1168 Next_State : State_Index := Current_State + 1; 1169 1170 begin 1171 J := J + 1; 1172 1173 if S (J) = '^' then 1174 J := J + 1; 1175 Next_State := 0; 1176 1177 for Column in 0 .. Alphabet_Size loop 1178 Set (Table, Current_State, Column, 1179 Value => Current_State + 1); 1180 end loop; 1181 end if; 1182 1183 -- Automatically add the first character 1184 1185 if S (J) = '-' or else S (J) = ']' then 1186 Set (Table, Current_State, Map (S (J)), 1187 Value => Current_State); 1188 J := J + 1; 1189 end if; 1190 1191 -- Loop till closing bracket found 1192 1193 loop 1194 exit when S (J) = Close_Bracket; 1195 1196 if S (J) = '-' 1197 and then S (J + 1) /= ']' 1198 then 1199 declare 1200 Start : constant Integer := J - 1; 1201 begin 1202 J := J + 1; 1203 1204 if S (J) = '\' then 1205 J := J + 1; 1206 end if; 1207 1208 for Char in S (Start) .. S (J) loop 1209 Set (Table, Current_State, Map (Char), 1210 Value => Next_State); 1211 end loop; 1212 end; 1213 1214 else 1215 if S (J) = '\' then 1216 J := J + 1; 1217 end if; 1218 1219 Set (Table, Current_State, Map (S (J)), 1220 Value => Next_State); 1221 end if; 1222 J := J + 1; 1223 end loop; 1224 end; 1225 1226 Last_Start := Current_State; 1227 Current_State := Current_State + 1; 1228 1229 if End_State /= 0 then 1230 Add_Empty_Char (End_State, Last_Start); 1231 end if; 1232 1233 End_State := Current_State; 1234 1235 when '{' => 1236 declare 1237 End_Sub : Integer; 1238 Start_Regexp_Sub : State_Index; 1239 End_Regexp_Sub : State_Index; 1240 Create_Start : State_Index := 0; 1241 1242 Create_End : State_Index := 0; 1243 -- Initialized to avoid junk warning 1244 1245 begin 1246 while S (J) /= '}' loop 1247 1248 -- First step : find sub pattern 1249 1250 End_Sub := J + 1; 1251 while S (End_Sub) /= ',' 1252 and then S (End_Sub) /= '}' 1253 loop 1254 End_Sub := End_Sub + 1; 1255 end loop; 1256 1257 -- Second step : create a sub pattern 1258 1259 Create_Simple 1260 (J + 1, 1261 End_Sub - 1, 1262 Start_Regexp_Sub, 1263 End_Regexp_Sub); 1264 1265 J := End_Sub; 1266 1267 -- Third step : create an alternative 1268 1269 if Create_Start = 0 then 1270 Current_State := Current_State + 1; 1271 Create_Start := Current_State; 1272 Add_Empty_Char (Create_Start, Start_Regexp_Sub); 1273 Current_State := Current_State + 1; 1274 Create_End := Current_State; 1275 Add_Empty_Char (End_Regexp_Sub, Create_End); 1276 1277 else 1278 Current_State := Current_State + 1; 1279 Add_Empty_Char (Current_State, Create_Start); 1280 Create_Start := Current_State; 1281 Add_Empty_Char (Create_Start, Start_Regexp_Sub); 1282 Add_Empty_Char (End_Regexp_Sub, Create_End); 1283 end if; 1284 end loop; 1285 1286 if End_State /= 0 then 1287 Add_Empty_Char (End_State, Create_Start); 1288 end if; 1289 1290 End_State := Create_End; 1291 Last_Start := Create_Start; 1292 end; 1293 1294 when '*' => 1295 Current_State := Current_State + 1; 1296 1297 if End_State /= 0 then 1298 Add_Empty_Char (End_State, Current_State); 1299 end if; 1300 1301 Add_Empty_Char (Current_State, Current_State + 1); 1302 Add_Empty_Char (Current_State, Current_State + 3); 1303 Last_Start := Current_State; 1304 1305 Current_State := Current_State + 1; 1306 1307 for K in 0 .. Alphabet_Size loop 1308 Set (Table, Current_State, K, 1309 Value => Current_State + 1); 1310 end loop; 1311 1312 Current_State := Current_State + 1; 1313 Add_Empty_Char (Current_State, Current_State + 1); 1314 1315 Current_State := Current_State + 1; 1316 Add_Empty_Char (Current_State, Last_Start); 1317 End_State := Current_State; 1318 1319 when others => 1320 Current_State := Current_State + 1; 1321 1322 if S (J) = '?' then 1323 for K in 0 .. Alphabet_Size loop 1324 Set (Table, Current_State, K, 1325 Value => Current_State + 1); 1326 end loop; 1327 1328 else 1329 if S (J) = '\' then 1330 J := J + 1; 1331 end if; 1332 1333 -- Create the state for the symbol S (J) 1334 1335 Set (Table, Current_State, Map (S (J)), 1336 Value => Current_State + 1); 1337 end if; 1338 1339 Last_Start := Current_State; 1340 Current_State := Current_State + 1; 1341 1342 if End_State /= 0 then 1343 Add_Empty_Char (End_State, Last_Start); 1344 end if; 1345 1346 End_State := Current_State; 1347 1348 end case; 1349 1350 if Start_State = 0 then 1351 Start_State := Last_Start; 1352 end if; 1353 1354 J := J + 1; 1355 end loop; 1356 end Create_Simple; 1357 1358 -- Start of processing for Create_Primary_Table_Glob 1359 1360 begin 1361 Table.all := (others => (others => 0)); 1362 Create_Simple (S'First, S'Last, Start_State, End_State); 1363 Num_States := Current_State; 1364 end Create_Primary_Table_Glob; 1365 1366 ---------------------------- 1367 -- Create_Secondary_Table -- 1368 ---------------------------- 1369 1370 function Create_Secondary_Table 1371 (First_Table : Regexp_Array_Access; 1372 Num_States : State_Index; 1373 Start_State : State_Index; 1374 End_State : State_Index) return Regexp 1375 is 1376 pragma Warnings (Off, Num_States); 1377 1378 Last_Index : constant State_Index := First_Table'Last (1); 1379 type Meta_State is array (1 .. Last_Index) of Boolean; 1380 1381 Table : Regexp_Array (1 .. Last_Index, 0 .. Alphabet_Size) := 1382 (others => (others => 0)); 1383 1384 Meta_States : array (1 .. Last_Index + 1) of Meta_State := 1385 (others => (others => False)); 1386 1387 Temp_State_Not_Null : Boolean; 1388 1389 Is_Final : Boolean_Array (1 .. Last_Index) := (others => False); 1390 1391 Current_State : State_Index := 1; 1392 Nb_State : State_Index := 1; 1393 1394 procedure Closure 1395 (State : in out Meta_State; 1396 Item : State_Index); 1397 -- Compute the closure of the state (that is every other state which 1398 -- has a empty-character transition) and add it to the state 1399 1400 ------------- 1401 -- Closure -- 1402 ------------- 1403 1404 procedure Closure 1405 (State : in out Meta_State; 1406 Item : State_Index) 1407 is 1408 begin 1409 if State (Item) then 1410 return; 1411 end if; 1412 1413 State (Item) := True; 1414 1415 for Column in Alphabet_Size + 1 .. First_Table'Last (2) loop 1416 if First_Table (Item, Column) = 0 then 1417 return; 1418 end if; 1419 1420 Closure (State, First_Table (Item, Column)); 1421 end loop; 1422 end Closure; 1423 1424 -- Start of processing for Create_Secondary_Table 1425 1426 begin 1427 -- Create a new state 1428 1429 Closure (Meta_States (Current_State), Start_State); 1430 1431 while Current_State <= Nb_State loop 1432 1433 -- If this new meta-state includes the primary table end state, 1434 -- then this meta-state will be a final state in the regexp 1435 1436 if Meta_States (Current_State)(End_State) then 1437 Is_Final (Current_State) := True; 1438 end if; 1439 1440 -- For every character in the regexp, calculate the possible 1441 -- transitions from Current_State 1442 1443 for Column in 0 .. Alphabet_Size loop 1444 Meta_States (Nb_State + 1) := (others => False); 1445 Temp_State_Not_Null := False; 1446 1447 for K in Meta_States (Current_State)'Range loop 1448 if Meta_States (Current_State)(K) 1449 and then First_Table (K, Column) /= 0 1450 then 1451 Closure 1452 (Meta_States (Nb_State + 1), First_Table (K, Column)); 1453 Temp_State_Not_Null := True; 1454 end if; 1455 end loop; 1456 1457 -- If at least one transition existed 1458 1459 if Temp_State_Not_Null then 1460 1461 -- Check if this new state corresponds to an old one 1462 1463 for K in 1 .. Nb_State loop 1464 if Meta_States (K) = Meta_States (Nb_State + 1) then 1465 Table (Current_State, Column) := K; 1466 exit; 1467 end if; 1468 end loop; 1469 1470 -- If not, create a new state 1471 1472 if Table (Current_State, Column) = 0 then 1473 Nb_State := Nb_State + 1; 1474 Table (Current_State, Column) := Nb_State; 1475 end if; 1476 end if; 1477 end loop; 1478 1479 Current_State := Current_State + 1; 1480 end loop; 1481 1482 -- Returns the regexp 1483 1484 declare 1485 R : Regexp_Access; 1486 1487 begin 1488 R := new Regexp_Value (Alphabet_Size => Alphabet_Size, 1489 Num_States => Nb_State); 1490 R.Map := Map; 1491 R.Is_Final := Is_Final (1 .. Nb_State); 1492 R.Case_Sensitive := Case_Sensitive; 1493 1494 for State in 1 .. Nb_State loop 1495 for K in 0 .. Alphabet_Size loop 1496 R.States (State, K) := Table (State, K); 1497 end loop; 1498 end loop; 1499 1500 return (Ada.Finalization.Controlled with R => R); 1501 end; 1502 end Create_Secondary_Table; 1503 1504 --------------------- 1505 -- Raise_Exception -- 1506 --------------------- 1507 1508 procedure Raise_Exception (M : String; Index : Integer) is 1509 begin 1510 raise Error_In_Regexp with M & " at offset" & Index'Img; 1511 end Raise_Exception; 1512 1513 -- Start of processing for Compile 1514 1515 begin 1516 -- Special case for the empty string: it always matches, and the 1517 -- following processing would fail on it. 1518 if S = "" then 1519 return (Ada.Finalization.Controlled with 1520 R => new Regexp_Value' 1521 (Alphabet_Size => 0, 1522 Num_States => 1, 1523 Map => (others => 0), 1524 States => (others => (others => 1)), 1525 Is_Final => (others => True), 1526 Case_Sensitive => True)); 1527 end if; 1528 1529 if not Case_Sensitive then 1530 System.Case_Util.To_Lower (S); 1531 end if; 1532 1533 -- Check the pattern is well-formed before any treatment 1534 1535 Check_Well_Formed_Pattern; 1536 1537 Create_Mapping; 1538 1539 -- Creates the primary table 1540 1541 declare 1542 Table : Regexp_Array_Access; 1543 Num_States : State_Index; 1544 Start_State : State_Index; 1545 End_State : State_Index; 1546 R : Regexp; 1547 1548 begin 1549 Table := new Regexp_Array (1 .. 100, 1550 0 .. Alphabet_Size + 10); 1551 if not Glob then 1552 Create_Primary_Table (Table, Num_States, Start_State, End_State); 1553 else 1554 Create_Primary_Table_Glob 1555 (Table, Num_States, Start_State, End_State); 1556 end if; 1557 1558 -- Creates the secondary table 1559 1560 R := Create_Secondary_Table 1561 (Table, Num_States, Start_State, End_State); 1562 Free (Table); 1563 return R; 1564 end; 1565 end Compile; 1566 1567 -------------- 1568 -- Finalize -- 1569 -------------- 1570 1571 procedure Finalize (R : in out Regexp) is 1572 procedure Free is new 1573 Ada.Unchecked_Deallocation (Regexp_Value, Regexp_Access); 1574 1575 begin 1576 Free (R.R); 1577 end Finalize; 1578 1579 --------- 1580 -- Get -- 1581 --------- 1582 1583 function Get 1584 (Table : Regexp_Array_Access; 1585 State : State_Index; 1586 Column : Column_Index) return State_Index 1587 is 1588 begin 1589 if State <= Table'Last (1) 1590 and then Column <= Table'Last (2) 1591 then 1592 return Table (State, Column); 1593 else 1594 return 0; 1595 end if; 1596 end Get; 1597 1598 ----------- 1599 -- Match -- 1600 ----------- 1601 1602 function Match (S : String; R : Regexp) return Boolean is 1603 Current_State : State_Index := 1; 1604 1605 begin 1606 if R.R = null then 1607 raise Constraint_Error; 1608 end if; 1609 1610 for Char in S'Range loop 1611 1612 if R.R.Case_Sensitive then 1613 Current_State := R.R.States (Current_State, R.R.Map (S (Char))); 1614 else 1615 Current_State := 1616 R.R.States (Current_State, 1617 R.R.Map (System.Case_Util.To_Lower (S (Char)))); 1618 end if; 1619 1620 if Current_State = 0 then 1621 return False; 1622 end if; 1623 1624 end loop; 1625 1626 return R.R.Is_Final (Current_State); 1627 end Match; 1628 1629 --------- 1630 -- Set -- 1631 --------- 1632 1633 procedure Set 1634 (Table : in out Regexp_Array_Access; 1635 State : State_Index; 1636 Column : Column_Index; 1637 Value : State_Index) 1638 is 1639 New_Lines : State_Index; 1640 New_Columns : Column_Index; 1641 New_Table : Regexp_Array_Access; 1642 1643 begin 1644 if State <= Table'Last (1) 1645 and then Column <= Table'Last (2) 1646 then 1647 Table (State, Column) := Value; 1648 else 1649 -- Doubles the size of the table until it is big enough that 1650 -- (State, Column) is a valid index 1651 1652 New_Lines := Table'Last (1) * (State / Table'Last (1) + 1); 1653 New_Columns := Table'Last (2) * (Column / Table'Last (2) + 1); 1654 New_Table := new Regexp_Array (Table'First (1) .. New_Lines, 1655 Table'First (2) .. New_Columns); 1656 New_Table.all := (others => (others => 0)); 1657 1658 for J in Table'Range (1) loop 1659 for K in Table'Range (2) loop 1660 New_Table (J, K) := Table (J, K); 1661 end loop; 1662 end loop; 1663 1664 Free (Table); 1665 Table := New_Table; 1666 Table (State, Column) := Value; 1667 end if; 1668 end Set; 1669 1670end System.Regexp; 1671