1------------------------------------------------------------------------------ 2-- -- 3-- GNAT LIBRARY COMPONENTS -- 4-- -- 5-- G N A T . R E G P A T -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1986 by University of Toronto. -- 10-- Copyright (C) 1999-2013, AdaCore -- 11-- -- 12-- GNAT is free software; you can redistribute it and/or modify it under -- 13-- terms of the GNU General Public License as published by the Free Soft- -- 14-- ware Foundation; either version 3, or (at your option) any later ver- -- 15-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 16-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 17-- or FITNESS FOR A PARTICULAR PURPOSE. -- 18-- -- 19-- As a special exception under Section 7 of GPL version 3, you are granted -- 20-- additional permissions described in the GCC Runtime Library Exception, -- 21-- version 3.1, as published by the Free Software Foundation. -- 22-- -- 23-- You should have received a copy of the GNU General Public License and -- 24-- a copy of the GCC Runtime Library Exception along with this program; -- 25-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 26-- <http://www.gnu.org/licenses/>. -- 27-- -- 28-- GNAT was originally developed by the GNAT team at New York University. -- 29-- Extensive contributions were provided by Ada Core Technologies Inc. -- 30-- -- 31------------------------------------------------------------------------------ 32 33-- This is an altered Ada 95 version of the original V8 style regular 34-- expression library written in C by Henry Spencer. Apart from the 35-- translation to Ada, the interface has been considerably changed to 36-- use the Ada String type instead of C-style nul-terminated strings. 37 38-- Beware that some of this code is subtly aware of the way operator 39-- precedence is structured in regular expressions. Serious changes in 40-- regular-expression syntax might require a total rethink. 41 42with System.IO; use System.IO; 43with Ada.Characters.Handling; use Ada.Characters.Handling; 44with Ada.Unchecked_Conversion; 45 46package body System.Regpat is 47 48 Debug : constant Boolean := False; 49 -- Set to True to activate debug traces. This is normally set to constant 50 -- False to simply delete all the trace code. It is to be edited to True 51 -- for internal debugging of the package. 52 53 ---------------------------- 54 -- Implementation details -- 55 ---------------------------- 56 57 -- This is essentially a linear encoding of a nondeterministic 58 -- finite-state machine, also known as syntax charts or 59 -- "railroad normal form" in parsing technology. 60 61 -- Each node is an opcode plus a "next" pointer, possibly plus an 62 -- operand. "Next" pointers of all nodes except BRANCH implement 63 -- concatenation; a "next" pointer with a BRANCH on both ends of it 64 -- is connecting two alternatives. 65 66 -- The operand of some types of node is a literal string; for others, 67 -- it is a node leading into a sub-FSM. In particular, the operand of 68 -- a BRANCH node is the first node of the branch. 69 -- (NB this is *not* a tree structure: the tail of the branch connects 70 -- to the thing following the set of BRANCHes). 71 72 -- You can see the exact byte-compiled version by using the Dump 73 -- subprogram. However, here are a few examples: 74 75 -- (a|b): 1 : BRANCH (next at 9) 76 -- 4 : EXACT (next at 17) operand=a 77 -- 9 : BRANCH (next at 17) 78 -- 12 : EXACT (next at 17) operand=b 79 -- 17 : EOP (next at 0) 80 -- 81 -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} 82 -- 8 : OPEN 1 (next at 12) 83 -- 12 : EXACT (next at 18) operand=ab 84 -- 18 : CLOSE 1 (next at 22) 85 -- 22 : WHILEM (next at 0) 86 -- 25 : NOTHING (next at 28) 87 -- 28 : EOP (next at 0) 88 89 -- The opcodes are: 90 91 type Opcode is 92 93 -- Name Operand? Meaning 94 95 (EOP, -- no End of program 96 MINMOD, -- no Next operator is not greedy 97 98 -- Classes of characters 99 100 ANY, -- no Match any one character except newline 101 SANY, -- no Match any character, including new line 102 ANYOF, -- class Match any character in this class 103 EXACT, -- str Match this string exactly 104 EXACTF, -- str Match this string (case-folding is one) 105 NOTHING, -- no Match empty string 106 SPACE, -- no Match any whitespace character 107 NSPACE, -- no Match any non-whitespace character 108 DIGIT, -- no Match any numeric character 109 NDIGIT, -- no Match any non-numeric character 110 ALNUM, -- no Match any alphanumeric character 111 NALNUM, -- no Match any non-alphanumeric character 112 113 -- Branches 114 115 BRANCH, -- node Match this alternative, or the next 116 117 -- Simple loops (when the following node is one character in length) 118 119 STAR, -- node Match this simple thing 0 or more times 120 PLUS, -- node Match this simple thing 1 or more times 121 CURLY, -- 2num node Match this simple thing between n and m times. 122 123 -- Complex loops 124 125 CURLYX, -- 2num node Match this complex thing {n,m} times 126 -- The nums are coded on two characters each 127 128 WHILEM, -- no Do curly processing and see if rest matches 129 130 -- Matches after or before a word 131 132 BOL, -- no Match "" at beginning of line 133 MBOL, -- no Same, assuming multiline (match after \n) 134 SBOL, -- no Same, assuming single line (don't match at \n) 135 EOL, -- no Match "" at end of line 136 MEOL, -- no Same, assuming multiline (match before \n) 137 SEOL, -- no Same, assuming single line (don't match at \n) 138 139 BOUND, -- no Match "" at any word boundary 140 NBOUND, -- no Match "" at any word non-boundary 141 142 -- Parenthesis groups handling 143 144 REFF, -- num Match some already matched string, folded 145 OPEN, -- num Mark this point in input as start of #n 146 CLOSE); -- num Analogous to OPEN 147 148 for Opcode'Size use 8; 149 150 -- Opcode notes: 151 152 -- BRANCH 153 -- The set of branches constituting a single choice are hooked 154 -- together with their "next" pointers, since precedence prevents 155 -- anything being concatenated to any individual branch. The 156 -- "next" pointer of the last BRANCH in a choice points to the 157 -- thing following the whole choice. This is also where the 158 -- final "next" pointer of each individual branch points; each 159 -- branch starts with the operand node of a BRANCH node. 160 161 -- STAR,PLUS 162 -- '?', and complex '*' and '+', are implemented with CURLYX. 163 -- branches. Simple cases (one character per match) are implemented with 164 -- STAR and PLUS for speed and to minimize recursive plunges. 165 166 -- OPEN,CLOSE 167 -- ...are numbered at compile time. 168 169 -- EXACT, EXACTF 170 -- There are in fact two arguments, the first one is the length (minus 171 -- one of the string argument), coded on one character, the second 172 -- argument is the string itself, coded on length + 1 characters. 173 174 -- A node is one char of opcode followed by two chars of "next" pointer. 175 -- "Next" pointers are stored as two 8-bit pieces, high order first. The 176 -- value is a positive offset from the opcode of the node containing it. 177 -- An operand, if any, simply follows the node. (Note that much of the 178 -- code generation knows about this implicit relationship.) 179 180 -- Using two bytes for the "next" pointer is vast overkill for most 181 -- things, but allows patterns to get big without disasters. 182 183 Next_Pointer_Bytes : constant := 3; 184 -- Points after the "next pointer" data. An instruction is therefore: 185 -- 1 byte: instruction opcode 186 -- 2 bytes: pointer to next instruction 187 -- * bytes: optional data for the instruction 188 189 ----------------------- 190 -- Character classes -- 191 ----------------------- 192 -- This is the implementation for character classes ([...]) in the 193 -- syntax for regular expressions. Each character (0..256) has an 194 -- entry into the table. This makes for a very fast matching 195 -- algorithm. 196 197 type Class_Byte is mod 256; 198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; 199 200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; 201 Bit_Conversion : constant Bit_Conversion_Array := 202 (1, 2, 4, 8, 16, 32, 64, 128); 203 204 type Std_Class is (ANYOF_NONE, 205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] 206 ANYOF_NALNUM, 207 ANYOF_SPACE, -- Space class [ \t\n\r\f] 208 ANYOF_NSPACE, 209 ANYOF_DIGIT, -- Digit class [0-9] 210 ANYOF_NDIGIT, 211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] 212 ANYOF_NALNUMC, 213 ANYOF_ALPHA, -- Alpha class [a-zA-Z] 214 ANYOF_NALPHA, 215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127 216 ANYOF_NASCII, 217 ANYOF_CNTRL, -- Control class 218 ANYOF_NCNTRL, 219 ANYOF_GRAPH, -- Graphic class 220 ANYOF_NGRAPH, 221 ANYOF_LOWER, -- Lower case class [a-z] 222 ANYOF_NLOWER, 223 ANYOF_PRINT, -- printable class 224 ANYOF_NPRINT, 225 ANYOF_PUNCT, -- 226 ANYOF_NPUNCT, 227 ANYOF_UPPER, -- Upper case class [A-Z] 228 ANYOF_NUPPER, 229 ANYOF_XDIGIT, -- Hexadecimal digit 230 ANYOF_NXDIGIT 231 ); 232 233 procedure Set_In_Class 234 (Bitmap : in out Character_Class; 235 C : Character); 236 -- Set the entry to True for C in the class Bitmap 237 238 function Get_From_Class 239 (Bitmap : Character_Class; 240 C : Character) return Boolean; 241 -- Return True if the entry is set for C in the class Bitmap 242 243 procedure Reset_Class (Bitmap : out Character_Class); 244 -- Clear all the entries in the class Bitmap 245 246 pragma Inline (Set_In_Class); 247 pragma Inline (Get_From_Class); 248 pragma Inline (Reset_Class); 249 250 ----------------------- 251 -- Local Subprograms -- 252 ----------------------- 253 254 function "=" (Left : Character; Right : Opcode) return Boolean; 255 256 function Is_Alnum (C : Character) return Boolean; 257 -- Return True if C is an alphanum character or an underscore ('_') 258 259 function Is_White_Space (C : Character) return Boolean; 260 -- Return True if C is a whitespace character 261 262 function Is_Printable (C : Character) return Boolean; 263 -- Return True if C is a printable character 264 265 function Operand (P : Pointer) return Pointer; 266 -- Return a pointer to the first operand of the node at P 267 268 function String_Length 269 (Program : Program_Data; 270 P : Pointer) return Program_Size; 271 -- Return the length of the string argument of the node at P 272 273 function String_Operand (P : Pointer) return Pointer; 274 -- Return a pointer to the string argument of the node at P 275 276 procedure Bitmap_Operand 277 (Program : Program_Data; 278 P : Pointer; 279 Op : out Character_Class); 280 -- Return a pointer to the string argument of the node at P 281 282 function Get_Next 283 (Program : Program_Data; 284 IP : Pointer) return Pointer; 285 -- Dig the next instruction pointer out of a node 286 287 procedure Optimize (Self : in out Pattern_Matcher); 288 -- Optimize a Pattern_Matcher by noting certain special cases 289 290 function Read_Natural 291 (Program : Program_Data; 292 IP : Pointer) return Natural; 293 -- Return the 2-byte natural coded at position IP 294 295 -- All of the subprograms above are tiny and should be inlined 296 297 pragma Inline ("="); 298 pragma Inline (Is_Alnum); 299 pragma Inline (Is_White_Space); 300 pragma Inline (Get_Next); 301 pragma Inline (Operand); 302 pragma Inline (Read_Natural); 303 pragma Inline (String_Length); 304 pragma Inline (String_Operand); 305 306 type Expression_Flags is record 307 Has_Width, -- Known never to match null string 308 Simple, -- Simple enough to be STAR/PLUS operand 309 SP_Start : Boolean; -- Starts with * or + 310 end record; 311 312 Worst_Expression : constant Expression_Flags := (others => False); 313 -- Worst case 314 315 procedure Dump_Until 316 (Program : Program_Data; 317 Index : in out Pointer; 318 Till : Pointer; 319 Indent : Natural; 320 Do_Print : Boolean := True); 321 -- Dump the program until the node Till (not included) is met. Every line 322 -- is indented with Index spaces at the beginning Dumps till the end if 323 -- Till is 0. 324 325 procedure Dump_Operation 326 (Program : Program_Data; 327 Index : Pointer; 328 Indent : Natural); 329 -- Same as above, but only dumps a single operation, and compute its 330 -- indentation from the program. 331 332 --------- 333 -- "=" -- 334 --------- 335 336 function "=" (Left : Character; Right : Opcode) return Boolean is 337 begin 338 return Character'Pos (Left) = Opcode'Pos (Right); 339 end "="; 340 341 -------------------- 342 -- Bitmap_Operand -- 343 -------------------- 344 345 procedure Bitmap_Operand 346 (Program : Program_Data; 347 P : Pointer; 348 Op : out Character_Class) 349 is 350 function Convert is new Ada.Unchecked_Conversion 351 (Program_Data, Character_Class); 352 353 begin 354 Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); 355 end Bitmap_Operand; 356 357 ------------- 358 -- Compile -- 359 ------------- 360 361 procedure Compile 362 (Matcher : out Pattern_Matcher; 363 Expression : String; 364 Final_Code_Size : out Program_Size; 365 Flags : Regexp_Flags := No_Flags) 366 is 367 -- We can't allocate space until we know how big the compiled form 368 -- will be, but we can't compile it (and thus know how big it is) 369 -- until we've got a place to put the code. So we cheat: we compile 370 -- it twice, once with code generation turned off and size counting 371 -- turned on, and once "for real". 372 373 -- This also means that we don't allocate space until we are sure 374 -- that the thing really will compile successfully, and we never 375 -- have to move the code and thus invalidate pointers into it. 376 377 -- Beware that the optimization-preparation code in here knows 378 -- about some of the structure of the compiled regexp. 379 380 PM : Pattern_Matcher renames Matcher; 381 Program : Program_Data renames PM.Program; 382 383 Emit_Ptr : Pointer := Program_First; 384 385 Parse_Pos : Natural := Expression'First; -- Input-scan pointer 386 Parse_End : constant Natural := Expression'Last; 387 388 ---------------------------- 389 -- Subprograms for Create -- 390 ---------------------------- 391 392 procedure Emit (B : Character); 393 -- Output the Character B to the Program. If code-generation is 394 -- disabled, simply increments the program counter. 395 396 function Emit_Node (Op : Opcode) return Pointer; 397 -- If code-generation is enabled, Emit_Node outputs the 398 -- opcode Op and reserves space for a pointer to the next node. 399 -- Return value is the location of new opcode, i.e. old Emit_Ptr. 400 401 procedure Emit_Natural (IP : Pointer; N : Natural); 402 -- Split N on two characters at position IP 403 404 procedure Emit_Class (Bitmap : Character_Class); 405 -- Emits a character class 406 407 procedure Case_Emit (C : Character); 408 -- Emit C, after converting is to lower-case if the regular 409 -- expression is case insensitive. 410 411 procedure Parse 412 (Parenthesized : Boolean; 413 Flags : out Expression_Flags; 414 IP : out Pointer); 415 -- Parse regular expression, i.e. main body or parenthesized thing 416 -- Caller must absorb opening parenthesis. 417 418 procedure Parse_Branch 419 (Flags : out Expression_Flags; 420 First : Boolean; 421 IP : out Pointer); 422 -- Implements the concatenation operator and handles '|' 423 -- First should be true if this is the first item of the alternative. 424 425 procedure Parse_Piece 426 (Expr_Flags : out Expression_Flags; 427 IP : out Pointer); 428 -- Parse something followed by possible [*+?] 429 430 procedure Parse_Atom 431 (Expr_Flags : out Expression_Flags; 432 IP : out Pointer); 433 -- Parse_Atom is the lowest level parse procedure. 434 -- 435 -- Optimization: Gobbles an entire sequence of ordinary characters so 436 -- that it can turn them into a single node, which is smaller to store 437 -- and faster to run. Backslashed characters are exceptions, each 438 -- becoming a separate node; the code is simpler that way and it's 439 -- not worth fixing. 440 441 procedure Insert_Operator 442 (Op : Opcode; 443 Operand : Pointer; 444 Greedy : Boolean := True); 445 -- Insert_Operator inserts an operator in front of an already-emitted 446 -- operand and relocates the operand. This applies to PLUS and STAR. 447 -- If Minmod is True, then the operator is non-greedy. 448 449 function Insert_Operator_Before 450 (Op : Opcode; 451 Operand : Pointer; 452 Greedy : Boolean; 453 Opsize : Pointer) return Pointer; 454 -- Insert an operator before Operand (and move the latter forward in the 455 -- program). Opsize is the size needed to represent the operator. This 456 -- returns the position at which the operator was inserted, and moves 457 -- Emit_Ptr after the new position of the operand. 458 459 procedure Insert_Curly_Operator 460 (Op : Opcode; 461 Min : Natural; 462 Max : Natural; 463 Operand : Pointer; 464 Greedy : Boolean := True); 465 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). 466 -- If Minmod is True, then the operator is non-greedy. 467 468 procedure Link_Tail (P, Val : Pointer); 469 -- Link_Tail sets the next-pointer at the end of a node chain 470 471 procedure Link_Operand_Tail (P, Val : Pointer); 472 -- Link_Tail on operand of first argument; noop if operand-less 473 474 procedure Fail (M : String); 475 pragma No_Return (Fail); 476 -- Fail with a diagnostic message, if possible 477 478 function Is_Curly_Operator (IP : Natural) return Boolean; 479 -- Return True if IP is looking at a '{' that is the beginning 480 -- of a curly operator, i.e. it matches {\d+,?\d*} 481 482 function Is_Mult (IP : Natural) return Boolean; 483 -- Return True if C is a regexp multiplier: '+', '*' or '?' 484 485 procedure Get_Curly_Arguments 486 (IP : Natural; 487 Min : out Natural; 488 Max : out Natural; 489 Greedy : out Boolean); 490 -- Parse the argument list for a curly operator. 491 -- It is assumed that IP is indeed pointing at a valid operator. 492 -- So what is IP and how come IP is not referenced in the body ??? 493 494 procedure Parse_Character_Class (IP : out Pointer); 495 -- Parse a character class. 496 -- The calling subprogram should consume the opening '[' before. 497 498 procedure Parse_Literal 499 (Expr_Flags : out Expression_Flags; 500 IP : out Pointer); 501 -- Parse_Literal encodes a string of characters to be matched exactly 502 503 function Parse_Posix_Character_Class return Std_Class; 504 -- Parse a posix character class, like [:alpha:] or [:^alpha:]. 505 -- The caller is supposed to absorb the opening [. 506 507 pragma Inline (Is_Mult); 508 pragma Inline (Emit_Natural); 509 pragma Inline (Parse_Character_Class); -- since used only once 510 511 --------------- 512 -- Case_Emit -- 513 --------------- 514 515 procedure Case_Emit (C : Character) is 516 begin 517 if (Flags and Case_Insensitive) /= 0 then 518 Emit (To_Lower (C)); 519 520 else 521 -- Dump current character 522 523 Emit (C); 524 end if; 525 end Case_Emit; 526 527 ---------- 528 -- Emit -- 529 ---------- 530 531 procedure Emit (B : Character) is 532 begin 533 if Emit_Ptr <= PM.Size then 534 Program (Emit_Ptr) := B; 535 end if; 536 537 Emit_Ptr := Emit_Ptr + 1; 538 end Emit; 539 540 ---------------- 541 -- Emit_Class -- 542 ---------------- 543 544 procedure Emit_Class (Bitmap : Character_Class) is 545 subtype Program31 is Program_Data (0 .. 31); 546 547 function Convert is new Ada.Unchecked_Conversion 548 (Character_Class, Program31); 549 550 begin 551 -- What is the mysterious constant 31 here??? Can't it be expressed 552 -- symbolically (size of integer - 1 or some such???). In any case 553 -- it should be declared as a constant (and referenced presumably 554 -- as this constant + 1 below. 555 556 if Emit_Ptr + 31 <= PM.Size then 557 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); 558 end if; 559 560 Emit_Ptr := Emit_Ptr + 32; 561 end Emit_Class; 562 563 ------------------ 564 -- Emit_Natural -- 565 ------------------ 566 567 procedure Emit_Natural (IP : Pointer; N : Natural) is 568 begin 569 if IP + 1 <= PM.Size then 570 Program (IP + 1) := Character'Val (N / 256); 571 Program (IP) := Character'Val (N mod 256); 572 end if; 573 end Emit_Natural; 574 575 --------------- 576 -- Emit_Node -- 577 --------------- 578 579 function Emit_Node (Op : Opcode) return Pointer is 580 Result : constant Pointer := Emit_Ptr; 581 582 begin 583 if Emit_Ptr + 2 <= PM.Size then 584 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); 585 Program (Emit_Ptr + 1) := ASCII.NUL; 586 Program (Emit_Ptr + 2) := ASCII.NUL; 587 end if; 588 589 Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; 590 return Result; 591 end Emit_Node; 592 593 ---------- 594 -- Fail -- 595 ---------- 596 597 procedure Fail (M : String) is 598 begin 599 raise Expression_Error with M; 600 end Fail; 601 602 ------------------------- 603 -- Get_Curly_Arguments -- 604 ------------------------- 605 606 procedure Get_Curly_Arguments 607 (IP : Natural; 608 Min : out Natural; 609 Max : out Natural; 610 Greedy : out Boolean) 611 is 612 pragma Unreferenced (IP); 613 614 Save_Pos : Natural := Parse_Pos + 1; 615 616 begin 617 Min := 0; 618 Max := Max_Curly_Repeat; 619 620 while Expression (Parse_Pos) /= '}' 621 and then Expression (Parse_Pos) /= ',' 622 loop 623 Parse_Pos := Parse_Pos + 1; 624 end loop; 625 626 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 627 628 if Expression (Parse_Pos) = ',' then 629 Save_Pos := Parse_Pos + 1; 630 while Expression (Parse_Pos) /= '}' loop 631 Parse_Pos := Parse_Pos + 1; 632 end loop; 633 634 if Save_Pos /= Parse_Pos then 635 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 636 end if; 637 638 else 639 Max := Min; 640 end if; 641 642 if Parse_Pos < Expression'Last 643 and then Expression (Parse_Pos + 1) = '?' 644 then 645 Greedy := False; 646 Parse_Pos := Parse_Pos + 1; 647 648 else 649 Greedy := True; 650 end if; 651 end Get_Curly_Arguments; 652 653 --------------------------- 654 -- Insert_Curly_Operator -- 655 --------------------------- 656 657 procedure Insert_Curly_Operator 658 (Op : Opcode; 659 Min : Natural; 660 Max : Natural; 661 Operand : Pointer; 662 Greedy : Boolean := True) 663 is 664 Old : Pointer; 665 begin 666 Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); 667 Emit_Natural (Old + Next_Pointer_Bytes, Min); 668 Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); 669 end Insert_Curly_Operator; 670 671 ---------------------------- 672 -- Insert_Operator_Before -- 673 ---------------------------- 674 675 function Insert_Operator_Before 676 (Op : Opcode; 677 Operand : Pointer; 678 Greedy : Boolean; 679 Opsize : Pointer) return Pointer 680 is 681 Dest : constant Pointer := Emit_Ptr; 682 Old : Pointer; 683 Size : Pointer := Opsize; 684 685 begin 686 -- If not greedy, we have to emit another opcode first 687 688 if not Greedy then 689 Size := Size + Next_Pointer_Bytes; 690 end if; 691 692 -- Move the operand in the byte-compilation, so that we can insert 693 -- the operator before it. 694 695 if Emit_Ptr + Size <= PM.Size then 696 Program (Operand + Size .. Emit_Ptr + Size) := 697 Program (Operand .. Emit_Ptr); 698 end if; 699 700 -- Insert the operator at the position previously occupied by the 701 -- operand. 702 703 Emit_Ptr := Operand; 704 705 if not Greedy then 706 Old := Emit_Node (MINMOD); 707 Link_Tail (Old, Old + Next_Pointer_Bytes); 708 end if; 709 710 Old := Emit_Node (Op); 711 Emit_Ptr := Dest + Size; 712 return Old; 713 end Insert_Operator_Before; 714 715 --------------------- 716 -- Insert_Operator -- 717 --------------------- 718 719 procedure Insert_Operator 720 (Op : Opcode; 721 Operand : Pointer; 722 Greedy : Boolean := True) 723 is 724 Discard : Pointer; 725 pragma Warnings (Off, Discard); 726 begin 727 Discard := Insert_Operator_Before 728 (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); 729 end Insert_Operator; 730 731 ----------------------- 732 -- Is_Curly_Operator -- 733 ----------------------- 734 735 function Is_Curly_Operator (IP : Natural) return Boolean is 736 Scan : Natural := IP; 737 738 begin 739 if Expression (Scan) /= '{' 740 or else Scan + 2 > Expression'Last 741 or else not Is_Digit (Expression (Scan + 1)) 742 then 743 return False; 744 end if; 745 746 Scan := Scan + 1; 747 748 -- The first digit 749 750 loop 751 Scan := Scan + 1; 752 753 if Scan > Expression'Last then 754 return False; 755 end if; 756 757 exit when not Is_Digit (Expression (Scan)); 758 end loop; 759 760 if Expression (Scan) = ',' then 761 loop 762 Scan := Scan + 1; 763 764 if Scan > Expression'Last then 765 return False; 766 end if; 767 768 exit when not Is_Digit (Expression (Scan)); 769 end loop; 770 end if; 771 772 return Expression (Scan) = '}'; 773 end Is_Curly_Operator; 774 775 ------------- 776 -- Is_Mult -- 777 ------------- 778 779 function Is_Mult (IP : Natural) return Boolean is 780 C : constant Character := Expression (IP); 781 782 begin 783 return C = '*' 784 or else C = '+' 785 or else C = '?' 786 or else (C = '{' and then Is_Curly_Operator (IP)); 787 end Is_Mult; 788 789 ----------------------- 790 -- Link_Operand_Tail -- 791 ----------------------- 792 793 procedure Link_Operand_Tail (P, Val : Pointer) is 794 begin 795 if P <= PM.Size and then Program (P) = BRANCH then 796 Link_Tail (Operand (P), Val); 797 end if; 798 end Link_Operand_Tail; 799 800 --------------- 801 -- Link_Tail -- 802 --------------- 803 804 procedure Link_Tail (P, Val : Pointer) is 805 Scan : Pointer; 806 Temp : Pointer; 807 Offset : Pointer; 808 809 begin 810 -- Find last node (the size of the pattern matcher might be too 811 -- small, so don't try to read past its end). 812 813 Scan := P; 814 while Scan + Next_Pointer_Bytes <= PM.Size loop 815 Temp := Get_Next (Program, Scan); 816 exit when Temp = Scan; 817 Scan := Temp; 818 end loop; 819 820 Offset := Val - Scan; 821 822 Emit_Natural (Scan + 1, Natural (Offset)); 823 end Link_Tail; 824 825 ----------- 826 -- Parse -- 827 ----------- 828 829 -- Combining parenthesis handling with the base level of regular 830 -- expression is a trifle forced, but the need to tie the tails of the 831 -- the branches to what follows makes it hard to avoid. 832 833 procedure Parse 834 (Parenthesized : Boolean; 835 Flags : out Expression_Flags; 836 IP : out Pointer) 837 is 838 E : String renames Expression; 839 Br, Br2 : Pointer; 840 Ender : Pointer; 841 Par_No : Natural; 842 New_Flags : Expression_Flags; 843 Have_Branch : Boolean := False; 844 845 begin 846 Flags := (Has_Width => True, others => False); -- Tentatively 847 848 -- Make an OPEN node, if parenthesized 849 850 if Parenthesized then 851 if Matcher.Paren_Count > Max_Paren_Count then 852 Fail ("too many ()"); 853 end if; 854 855 Par_No := Matcher.Paren_Count + 1; 856 Matcher.Paren_Count := Matcher.Paren_Count + 1; 857 IP := Emit_Node (OPEN); 858 Emit (Character'Val (Par_No)); 859 860 else 861 IP := 0; 862 Par_No := 0; 863 end if; 864 865 -- Pick up the branches, linking them together 866 867 Parse_Branch (New_Flags, True, Br); 868 869 if Br = 0 then 870 IP := 0; 871 return; 872 end if; 873 874 if Parse_Pos <= Parse_End 875 and then E (Parse_Pos) = '|' 876 then 877 Insert_Operator (BRANCH, Br); 878 Have_Branch := True; 879 end if; 880 881 if IP /= 0 then 882 Link_Tail (IP, Br); -- OPEN -> first 883 else 884 IP := Br; 885 end if; 886 887 if not New_Flags.Has_Width then 888 Flags.Has_Width := False; 889 end if; 890 891 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 892 893 while Parse_Pos <= Parse_End 894 and then (E (Parse_Pos) = '|') 895 loop 896 Parse_Pos := Parse_Pos + 1; 897 Parse_Branch (New_Flags, False, Br); 898 899 if Br = 0 then 900 IP := 0; 901 return; 902 end if; 903 904 Link_Tail (IP, Br); -- BRANCH -> BRANCH 905 906 if not New_Flags.Has_Width then 907 Flags.Has_Width := False; 908 end if; 909 910 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 911 end loop; 912 913 -- Make a closing node, and hook it on the end 914 915 if Parenthesized then 916 Ender := Emit_Node (CLOSE); 917 Emit (Character'Val (Par_No)); 918 else 919 Ender := Emit_Node (EOP); 920 end if; 921 922 Link_Tail (IP, Ender); 923 924 if Have_Branch and then Emit_Ptr <= PM.Size + 1 then 925 926 -- Hook the tails of the branches to the closing node 927 928 Br := IP; 929 loop 930 Link_Operand_Tail (Br, Ender); 931 Br2 := Get_Next (Program, Br); 932 exit when Br2 = Br; 933 Br := Br2; 934 end loop; 935 end if; 936 937 -- Check for proper termination 938 939 if Parenthesized then 940 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then 941 Fail ("unmatched ()"); 942 end if; 943 944 Parse_Pos := Parse_Pos + 1; 945 946 elsif Parse_Pos <= Parse_End then 947 if E (Parse_Pos) = ')' then 948 Fail ("unmatched ()"); 949 else 950 Fail ("junk on end"); -- "Can't happen" 951 end if; 952 end if; 953 end Parse; 954 955 ---------------- 956 -- Parse_Atom -- 957 ---------------- 958 959 procedure Parse_Atom 960 (Expr_Flags : out Expression_Flags; 961 IP : out Pointer) 962 is 963 C : Character; 964 965 begin 966 -- Tentatively set worst expression case 967 968 Expr_Flags := Worst_Expression; 969 970 C := Expression (Parse_Pos); 971 Parse_Pos := Parse_Pos + 1; 972 973 case (C) is 974 when '^' => 975 IP := 976 Emit_Node 977 (if (Flags and Multiple_Lines) /= 0 then MBOL 978 elsif (Flags and Single_Line) /= 0 then SBOL 979 else BOL); 980 981 when '$' => 982 IP := 983 Emit_Node 984 (if (Flags and Multiple_Lines) /= 0 then MEOL 985 elsif (Flags and Single_Line) /= 0 then SEOL 986 else EOL); 987 988 when '.' => 989 IP := 990 Emit_Node 991 (if (Flags and Single_Line) /= 0 then SANY else ANY); 992 993 Expr_Flags.Has_Width := True; 994 Expr_Flags.Simple := True; 995 996 when '[' => 997 Parse_Character_Class (IP); 998 Expr_Flags.Has_Width := True; 999 Expr_Flags.Simple := True; 1000 1001 when '(' => 1002 declare 1003 New_Flags : Expression_Flags; 1004 1005 begin 1006 Parse (True, New_Flags, IP); 1007 1008 if IP = 0 then 1009 return; 1010 end if; 1011 1012 Expr_Flags.Has_Width := 1013 Expr_Flags.Has_Width or else New_Flags.Has_Width; 1014 Expr_Flags.SP_Start := 1015 Expr_Flags.SP_Start or else New_Flags.SP_Start; 1016 end; 1017 1018 when '|' | ASCII.LF | ')' => 1019 Fail ("internal urp"); -- Supposed to be caught earlier 1020 1021 when '?' | '+' | '*' => 1022 Fail (C & " follows nothing"); 1023 1024 when '{' => 1025 if Is_Curly_Operator (Parse_Pos - 1) then 1026 Fail (C & " follows nothing"); 1027 else 1028 Parse_Literal (Expr_Flags, IP); 1029 end if; 1030 1031 when '\' => 1032 if Parse_Pos > Parse_End then 1033 Fail ("trailing \"); 1034 end if; 1035 1036 Parse_Pos := Parse_Pos + 1; 1037 1038 case Expression (Parse_Pos - 1) is 1039 when 'b' => 1040 IP := Emit_Node (BOUND); 1041 1042 when 'B' => 1043 IP := Emit_Node (NBOUND); 1044 1045 when 's' => 1046 IP := Emit_Node (SPACE); 1047 Expr_Flags.Simple := True; 1048 Expr_Flags.Has_Width := True; 1049 1050 when 'S' => 1051 IP := Emit_Node (NSPACE); 1052 Expr_Flags.Simple := True; 1053 Expr_Flags.Has_Width := True; 1054 1055 when 'd' => 1056 IP := Emit_Node (DIGIT); 1057 Expr_Flags.Simple := True; 1058 Expr_Flags.Has_Width := True; 1059 1060 when 'D' => 1061 IP := Emit_Node (NDIGIT); 1062 Expr_Flags.Simple := True; 1063 Expr_Flags.Has_Width := True; 1064 1065 when 'w' => 1066 IP := Emit_Node (ALNUM); 1067 Expr_Flags.Simple := True; 1068 Expr_Flags.Has_Width := True; 1069 1070 when 'W' => 1071 IP := Emit_Node (NALNUM); 1072 Expr_Flags.Simple := True; 1073 Expr_Flags.Has_Width := True; 1074 1075 when 'A' => 1076 IP := Emit_Node (SBOL); 1077 1078 when 'G' => 1079 IP := Emit_Node (SEOL); 1080 1081 when '0' .. '9' => 1082 IP := Emit_Node (REFF); 1083 1084 declare 1085 Save : constant Natural := Parse_Pos - 1; 1086 1087 begin 1088 while Parse_Pos <= Expression'Last 1089 and then Is_Digit (Expression (Parse_Pos)) 1090 loop 1091 Parse_Pos := Parse_Pos + 1; 1092 end loop; 1093 1094 Emit (Character'Val (Natural'Value 1095 (Expression (Save .. Parse_Pos - 1)))); 1096 end; 1097 1098 when others => 1099 Parse_Pos := Parse_Pos - 1; 1100 Parse_Literal (Expr_Flags, IP); 1101 end case; 1102 1103 when others => 1104 Parse_Literal (Expr_Flags, IP); 1105 end case; 1106 end Parse_Atom; 1107 1108 ------------------ 1109 -- Parse_Branch -- 1110 ------------------ 1111 1112 procedure Parse_Branch 1113 (Flags : out Expression_Flags; 1114 First : Boolean; 1115 IP : out Pointer) 1116 is 1117 E : String renames Expression; 1118 Chain : Pointer; 1119 Last : Pointer; 1120 New_Flags : Expression_Flags; 1121 1122 Discard : Pointer; 1123 pragma Warnings (Off, Discard); 1124 1125 begin 1126 Flags := Worst_Expression; -- Tentatively 1127 IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); 1128 1129 Chain := 0; 1130 while Parse_Pos <= Parse_End 1131 and then E (Parse_Pos) /= ')' 1132 and then E (Parse_Pos) /= ASCII.LF 1133 and then E (Parse_Pos) /= '|' 1134 loop 1135 Parse_Piece (New_Flags, Last); 1136 1137 if Last = 0 then 1138 IP := 0; 1139 return; 1140 end if; 1141 1142 Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; 1143 1144 if Chain = 0 then -- First piece 1145 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 1146 else 1147 Link_Tail (Chain, Last); 1148 end if; 1149 1150 Chain := Last; 1151 end loop; 1152 1153 -- Case where loop ran zero CURLY 1154 1155 if Chain = 0 then 1156 Discard := Emit_Node (NOTHING); 1157 end if; 1158 end Parse_Branch; 1159 1160 --------------------------- 1161 -- Parse_Character_Class -- 1162 --------------------------- 1163 1164 procedure Parse_Character_Class (IP : out Pointer) is 1165 Bitmap : Character_Class; 1166 Invert : Boolean := False; 1167 In_Range : Boolean := False; 1168 Named_Class : Std_Class := ANYOF_NONE; 1169 Value : Character; 1170 Last_Value : Character := ASCII.NUL; 1171 1172 begin 1173 Reset_Class (Bitmap); 1174 1175 -- Do we have an invert character class ? 1176 1177 if Parse_Pos <= Parse_End 1178 and then Expression (Parse_Pos) = '^' 1179 then 1180 Invert := True; 1181 Parse_Pos := Parse_Pos + 1; 1182 end if; 1183 1184 -- First character can be ] or - without closing the class 1185 1186 if Parse_Pos <= Parse_End 1187 and then (Expression (Parse_Pos) = ']' 1188 or else Expression (Parse_Pos) = '-') 1189 then 1190 Set_In_Class (Bitmap, Expression (Parse_Pos)); 1191 Parse_Pos := Parse_Pos + 1; 1192 end if; 1193 1194 -- While we don't have the end of the class 1195 1196 while Parse_Pos <= Parse_End 1197 and then Expression (Parse_Pos) /= ']' 1198 loop 1199 Named_Class := ANYOF_NONE; 1200 Value := Expression (Parse_Pos); 1201 Parse_Pos := Parse_Pos + 1; 1202 1203 -- Do we have a Posix character class 1204 if Value = '[' then 1205 Named_Class := Parse_Posix_Character_Class; 1206 1207 elsif Value = '\' then 1208 if Parse_Pos = Parse_End then 1209 Fail ("Trailing \"); 1210 end if; 1211 Value := Expression (Parse_Pos); 1212 Parse_Pos := Parse_Pos + 1; 1213 1214 case Value is 1215 when 'w' => Named_Class := ANYOF_ALNUM; 1216 when 'W' => Named_Class := ANYOF_NALNUM; 1217 when 's' => Named_Class := ANYOF_SPACE; 1218 when 'S' => Named_Class := ANYOF_NSPACE; 1219 when 'd' => Named_Class := ANYOF_DIGIT; 1220 when 'D' => Named_Class := ANYOF_NDIGIT; 1221 when 'n' => Value := ASCII.LF; 1222 when 'r' => Value := ASCII.CR; 1223 when 't' => Value := ASCII.HT; 1224 when 'f' => Value := ASCII.FF; 1225 when 'e' => Value := ASCII.ESC; 1226 when 'a' => Value := ASCII.BEL; 1227 1228 -- when 'x' => ??? hexadecimal value 1229 -- when 'c' => ??? control character 1230 -- when '0'..'9' => ??? octal character 1231 1232 when others => null; 1233 end case; 1234 end if; 1235 1236 -- Do we have a character class? 1237 1238 if Named_Class /= ANYOF_NONE then 1239 1240 -- A range like 'a-\d' or 'a-[:digit:] is not a range 1241 1242 if In_Range then 1243 Set_In_Class (Bitmap, Last_Value); 1244 Set_In_Class (Bitmap, '-'); 1245 In_Range := False; 1246 end if; 1247 1248 -- Expand the range 1249 1250 case Named_Class is 1251 when ANYOF_NONE => null; 1252 1253 when ANYOF_ALNUM | ANYOF_ALNUMC => 1254 for Value in Class_Byte'Range loop 1255 if Is_Alnum (Character'Val (Value)) then 1256 Set_In_Class (Bitmap, Character'Val (Value)); 1257 end if; 1258 end loop; 1259 1260 when ANYOF_NALNUM | ANYOF_NALNUMC => 1261 for Value in Class_Byte'Range loop 1262 if not Is_Alnum (Character'Val (Value)) then 1263 Set_In_Class (Bitmap, Character'Val (Value)); 1264 end if; 1265 end loop; 1266 1267 when ANYOF_SPACE => 1268 for Value in Class_Byte'Range loop 1269 if Is_White_Space (Character'Val (Value)) then 1270 Set_In_Class (Bitmap, Character'Val (Value)); 1271 end if; 1272 end loop; 1273 1274 when ANYOF_NSPACE => 1275 for Value in Class_Byte'Range loop 1276 if not Is_White_Space (Character'Val (Value)) then 1277 Set_In_Class (Bitmap, Character'Val (Value)); 1278 end if; 1279 end loop; 1280 1281 when ANYOF_DIGIT => 1282 for Value in Class_Byte'Range loop 1283 if Is_Digit (Character'Val (Value)) then 1284 Set_In_Class (Bitmap, Character'Val (Value)); 1285 end if; 1286 end loop; 1287 1288 when ANYOF_NDIGIT => 1289 for Value in Class_Byte'Range loop 1290 if not Is_Digit (Character'Val (Value)) then 1291 Set_In_Class (Bitmap, Character'Val (Value)); 1292 end if; 1293 end loop; 1294 1295 when ANYOF_ALPHA => 1296 for Value in Class_Byte'Range loop 1297 if Is_Letter (Character'Val (Value)) then 1298 Set_In_Class (Bitmap, Character'Val (Value)); 1299 end if; 1300 end loop; 1301 1302 when ANYOF_NALPHA => 1303 for Value in Class_Byte'Range loop 1304 if not Is_Letter (Character'Val (Value)) then 1305 Set_In_Class (Bitmap, Character'Val (Value)); 1306 end if; 1307 end loop; 1308 1309 when ANYOF_ASCII => 1310 for Value in 0 .. 127 loop 1311 Set_In_Class (Bitmap, Character'Val (Value)); 1312 end loop; 1313 1314 when ANYOF_NASCII => 1315 for Value in 128 .. 255 loop 1316 Set_In_Class (Bitmap, Character'Val (Value)); 1317 end loop; 1318 1319 when ANYOF_CNTRL => 1320 for Value in Class_Byte'Range loop 1321 if Is_Control (Character'Val (Value)) then 1322 Set_In_Class (Bitmap, Character'Val (Value)); 1323 end if; 1324 end loop; 1325 1326 when ANYOF_NCNTRL => 1327 for Value in Class_Byte'Range loop 1328 if not Is_Control (Character'Val (Value)) then 1329 Set_In_Class (Bitmap, Character'Val (Value)); 1330 end if; 1331 end loop; 1332 1333 when ANYOF_GRAPH => 1334 for Value in Class_Byte'Range loop 1335 if Is_Graphic (Character'Val (Value)) then 1336 Set_In_Class (Bitmap, Character'Val (Value)); 1337 end if; 1338 end loop; 1339 1340 when ANYOF_NGRAPH => 1341 for Value in Class_Byte'Range loop 1342 if not Is_Graphic (Character'Val (Value)) then 1343 Set_In_Class (Bitmap, Character'Val (Value)); 1344 end if; 1345 end loop; 1346 1347 when ANYOF_LOWER => 1348 for Value in Class_Byte'Range loop 1349 if Is_Lower (Character'Val (Value)) then 1350 Set_In_Class (Bitmap, Character'Val (Value)); 1351 end if; 1352 end loop; 1353 1354 when ANYOF_NLOWER => 1355 for Value in Class_Byte'Range loop 1356 if not Is_Lower (Character'Val (Value)) then 1357 Set_In_Class (Bitmap, Character'Val (Value)); 1358 end if; 1359 end loop; 1360 1361 when ANYOF_PRINT => 1362 for Value in Class_Byte'Range loop 1363 if Is_Printable (Character'Val (Value)) then 1364 Set_In_Class (Bitmap, Character'Val (Value)); 1365 end if; 1366 end loop; 1367 1368 when ANYOF_NPRINT => 1369 for Value in Class_Byte'Range loop 1370 if not Is_Printable (Character'Val (Value)) then 1371 Set_In_Class (Bitmap, Character'Val (Value)); 1372 end if; 1373 end loop; 1374 1375 when ANYOF_PUNCT => 1376 for Value in Class_Byte'Range loop 1377 if Is_Printable (Character'Val (Value)) 1378 and then not Is_White_Space (Character'Val (Value)) 1379 and then not Is_Alnum (Character'Val (Value)) 1380 then 1381 Set_In_Class (Bitmap, Character'Val (Value)); 1382 end if; 1383 end loop; 1384 1385 when ANYOF_NPUNCT => 1386 for Value in Class_Byte'Range loop 1387 if not Is_Printable (Character'Val (Value)) 1388 or else Is_White_Space (Character'Val (Value)) 1389 or else Is_Alnum (Character'Val (Value)) 1390 then 1391 Set_In_Class (Bitmap, Character'Val (Value)); 1392 end if; 1393 end loop; 1394 1395 when ANYOF_UPPER => 1396 for Value in Class_Byte'Range loop 1397 if Is_Upper (Character'Val (Value)) then 1398 Set_In_Class (Bitmap, Character'Val (Value)); 1399 end if; 1400 end loop; 1401 1402 when ANYOF_NUPPER => 1403 for Value in Class_Byte'Range loop 1404 if not Is_Upper (Character'Val (Value)) then 1405 Set_In_Class (Bitmap, Character'Val (Value)); 1406 end if; 1407 end loop; 1408 1409 when ANYOF_XDIGIT => 1410 for Value in Class_Byte'Range loop 1411 if Is_Hexadecimal_Digit (Character'Val (Value)) then 1412 Set_In_Class (Bitmap, Character'Val (Value)); 1413 end if; 1414 end loop; 1415 1416 when ANYOF_NXDIGIT => 1417 for Value in Class_Byte'Range loop 1418 if not Is_Hexadecimal_Digit 1419 (Character'Val (Value)) 1420 then 1421 Set_In_Class (Bitmap, Character'Val (Value)); 1422 end if; 1423 end loop; 1424 1425 end case; 1426 1427 -- Not a character range 1428 1429 elsif not In_Range then 1430 Last_Value := Value; 1431 1432 if Parse_Pos > Expression'Last then 1433 Fail ("Empty character class []"); 1434 end if; 1435 1436 if Expression (Parse_Pos) = '-' 1437 and then Parse_Pos < Parse_End 1438 and then Expression (Parse_Pos + 1) /= ']' 1439 then 1440 Parse_Pos := Parse_Pos + 1; 1441 1442 -- Do we have a range like '\d-a' and '[:space:]-a' 1443 -- which is not a real range 1444 1445 if Named_Class /= ANYOF_NONE then 1446 Set_In_Class (Bitmap, '-'); 1447 else 1448 In_Range := True; 1449 end if; 1450 1451 else 1452 Set_In_Class (Bitmap, Value); 1453 1454 end if; 1455 1456 -- Else in a character range 1457 1458 else 1459 if Last_Value > Value then 1460 Fail ("Invalid Range [" & Last_Value'Img 1461 & "-" & Value'Img & "]"); 1462 end if; 1463 1464 while Last_Value <= Value loop 1465 Set_In_Class (Bitmap, Last_Value); 1466 Last_Value := Character'Succ (Last_Value); 1467 end loop; 1468 1469 In_Range := False; 1470 1471 end if; 1472 1473 end loop; 1474 1475 -- Optimize case-insensitive ranges (put the upper case or lower 1476 -- case character into the bitmap) 1477 1478 if (Flags and Case_Insensitive) /= 0 then 1479 for C in Character'Range loop 1480 if Get_From_Class (Bitmap, C) then 1481 Set_In_Class (Bitmap, To_Lower (C)); 1482 Set_In_Class (Bitmap, To_Upper (C)); 1483 end if; 1484 end loop; 1485 end if; 1486 1487 -- Optimize inverted classes 1488 1489 if Invert then 1490 for J in Bitmap'Range loop 1491 Bitmap (J) := not Bitmap (J); 1492 end loop; 1493 end if; 1494 1495 Parse_Pos := Parse_Pos + 1; 1496 1497 -- Emit the class 1498 1499 IP := Emit_Node (ANYOF); 1500 Emit_Class (Bitmap); 1501 end Parse_Character_Class; 1502 1503 ------------------- 1504 -- Parse_Literal -- 1505 ------------------- 1506 1507 -- This is a bit tricky due to quoted chars and due to 1508 -- the multiplier characters '*', '+', and '?' that 1509 -- take the SINGLE char previous as their operand. 1510 1511 -- On entry, the character at Parse_Pos - 1 is going to go 1512 -- into the string, no matter what it is. It could be 1513 -- following a \ if Parse_Atom was entered from the '\' case. 1514 1515 -- Basic idea is to pick up a good char in C and examine 1516 -- the next char. If Is_Mult (C) then twiddle, if it's a \ 1517 -- then frozzle and if it's another magic char then push C and 1518 -- terminate the string. If none of the above, push C on the 1519 -- string and go around again. 1520 1521 -- Start_Pos is used to remember where "the current character" 1522 -- starts in the string, if due to an Is_Mult we need to back 1523 -- up and put the current char in a separate 1-character string. 1524 -- When Start_Pos is 0, C is the only char in the string; 1525 -- this is used in Is_Mult handling, and in setting the SIMPLE 1526 -- flag at the end. 1527 1528 procedure Parse_Literal 1529 (Expr_Flags : out Expression_Flags; 1530 IP : out Pointer) 1531 is 1532 Start_Pos : Natural := 0; 1533 C : Character; 1534 Length_Ptr : Pointer; 1535 1536 Has_Special_Operator : Boolean := False; 1537 1538 begin 1539 Parse_Pos := Parse_Pos - 1; -- Look at current character 1540 1541 IP := 1542 Emit_Node 1543 (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); 1544 1545 Length_Ptr := Emit_Ptr; 1546 Emit_Ptr := String_Operand (IP); 1547 1548 Parse_Loop : 1549 loop 1550 C := Expression (Parse_Pos); -- Get current character 1551 1552 case C is 1553 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => 1554 1555 if Start_Pos = 0 then 1556 Start_Pos := Parse_Pos; 1557 Emit (C); -- First character is always emitted 1558 else 1559 exit Parse_Loop; -- Else we are done 1560 end if; 1561 1562 when '?' | '+' | '*' | '{' => 1563 1564 if Start_Pos = 0 then 1565 Start_Pos := Parse_Pos; 1566 Emit (C); -- First character is always emitted 1567 1568 -- Are we looking at an operator, or is this 1569 -- simply a normal character ? 1570 1571 elsif not Is_Mult (Parse_Pos) then 1572 Start_Pos := Parse_Pos; 1573 Case_Emit (C); 1574 1575 else 1576 -- We've got something like "abc?d". Mark this as a 1577 -- special case. What we want to emit is a first 1578 -- constant string for "ab", then one for "c" that will 1579 -- ultimately be transformed with a CURLY operator, A 1580 -- special case has to be handled for "a?", since there 1581 -- is no initial string to emit. 1582 1583 Has_Special_Operator := True; 1584 exit Parse_Loop; 1585 end if; 1586 1587 when '\' => 1588 Start_Pos := Parse_Pos; 1589 1590 if Parse_Pos = Parse_End then 1591 Fail ("Trailing \"); 1592 1593 else 1594 case Expression (Parse_Pos + 1) is 1595 when 'b' | 'B' | 's' | 'S' | 'd' | 'D' 1596 | 'w' | 'W' | '0' .. '9' | 'G' | 'A' 1597 => exit Parse_Loop; 1598 when 'n' => Emit (ASCII.LF); 1599 when 't' => Emit (ASCII.HT); 1600 when 'r' => Emit (ASCII.CR); 1601 when 'f' => Emit (ASCII.FF); 1602 when 'e' => Emit (ASCII.ESC); 1603 when 'a' => Emit (ASCII.BEL); 1604 when others => Emit (Expression (Parse_Pos + 1)); 1605 end case; 1606 1607 Parse_Pos := Parse_Pos + 1; 1608 end if; 1609 1610 when others => 1611 Start_Pos := Parse_Pos; 1612 Case_Emit (C); 1613 end case; 1614 1615 exit Parse_Loop when Emit_Ptr - Length_Ptr = 254; 1616 1617 Parse_Pos := Parse_Pos + 1; 1618 1619 exit Parse_Loop when Parse_Pos > Parse_End; 1620 end loop Parse_Loop; 1621 1622 -- Is the string followed by a '*+?{' operator ? If yes, and if there 1623 -- is an initial string to emit, do it now. 1624 1625 if Has_Special_Operator 1626 and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes 1627 then 1628 Emit_Ptr := Emit_Ptr - 1; 1629 Parse_Pos := Start_Pos; 1630 end if; 1631 1632 if Length_Ptr <= PM.Size then 1633 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); 1634 end if; 1635 1636 Expr_Flags.Has_Width := True; 1637 1638 -- Slight optimization when there is a single character 1639 1640 if Emit_Ptr = Length_Ptr + 2 then 1641 Expr_Flags.Simple := True; 1642 end if; 1643 end Parse_Literal; 1644 1645 ----------------- 1646 -- Parse_Piece -- 1647 ----------------- 1648 1649 -- Note that the branching code sequences used for '?' and the 1650 -- general cases of '*' and + are somewhat optimized: they use 1651 -- the same NOTHING node as both the endmarker for their branch 1652 -- list and the body of the last branch. It might seem that 1653 -- this node could be dispensed with entirely, but the endmarker 1654 -- role is not redundant. 1655 1656 procedure Parse_Piece 1657 (Expr_Flags : out Expression_Flags; 1658 IP : out Pointer) 1659 is 1660 Op : Character; 1661 New_Flags : Expression_Flags; 1662 Greedy : Boolean := True; 1663 1664 begin 1665 Parse_Atom (New_Flags, IP); 1666 1667 if IP = 0 then 1668 return; 1669 end if; 1670 1671 if Parse_Pos > Parse_End 1672 or else not Is_Mult (Parse_Pos) 1673 then 1674 Expr_Flags := New_Flags; 1675 return; 1676 end if; 1677 1678 Op := Expression (Parse_Pos); 1679 1680 Expr_Flags := 1681 (if Op /= '+' 1682 then (SP_Start => True, others => False) 1683 else (Has_Width => True, others => False)); 1684 1685 -- Detect non greedy operators in the easy cases 1686 1687 if Op /= '{' 1688 and then Parse_Pos + 1 <= Parse_End 1689 and then Expression (Parse_Pos + 1) = '?' 1690 then 1691 Greedy := False; 1692 Parse_Pos := Parse_Pos + 1; 1693 end if; 1694 1695 -- Generate the byte code 1696 1697 case Op is 1698 when '*' => 1699 1700 if New_Flags.Simple then 1701 Insert_Operator (STAR, IP, Greedy); 1702 else 1703 Link_Tail (IP, Emit_Node (WHILEM)); 1704 Insert_Curly_Operator 1705 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); 1706 Link_Tail (IP, Emit_Node (NOTHING)); 1707 end if; 1708 1709 when '+' => 1710 1711 if New_Flags.Simple then 1712 Insert_Operator (PLUS, IP, Greedy); 1713 else 1714 Link_Tail (IP, Emit_Node (WHILEM)); 1715 Insert_Curly_Operator 1716 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); 1717 Link_Tail (IP, Emit_Node (NOTHING)); 1718 end if; 1719 1720 when '?' => 1721 if New_Flags.Simple then 1722 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); 1723 else 1724 Link_Tail (IP, Emit_Node (WHILEM)); 1725 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); 1726 Link_Tail (IP, Emit_Node (NOTHING)); 1727 end if; 1728 1729 when '{' => 1730 declare 1731 Min, Max : Natural; 1732 1733 begin 1734 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); 1735 1736 if New_Flags.Simple then 1737 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); 1738 else 1739 Link_Tail (IP, Emit_Node (WHILEM)); 1740 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); 1741 Link_Tail (IP, Emit_Node (NOTHING)); 1742 end if; 1743 end; 1744 1745 when others => 1746 null; 1747 end case; 1748 1749 Parse_Pos := Parse_Pos + 1; 1750 1751 if Parse_Pos <= Parse_End 1752 and then Is_Mult (Parse_Pos) 1753 then 1754 Fail ("nested *+{"); 1755 end if; 1756 end Parse_Piece; 1757 1758 --------------------------------- 1759 -- Parse_Posix_Character_Class -- 1760 --------------------------------- 1761 1762 function Parse_Posix_Character_Class return Std_Class is 1763 Invert : Boolean := False; 1764 Class : Std_Class := ANYOF_NONE; 1765 E : String renames Expression; 1766 1767 -- Class names. Note that code assumes that the length of all 1768 -- classes starting with the same letter have the same length. 1769 1770 Alnum : constant String := "alnum:]"; 1771 Alpha : constant String := "alpha:]"; 1772 Ascii_C : constant String := "ascii:]"; 1773 Cntrl : constant String := "cntrl:]"; 1774 Digit : constant String := "digit:]"; 1775 Graph : constant String := "graph:]"; 1776 Lower : constant String := "lower:]"; 1777 Print : constant String := "print:]"; 1778 Punct : constant String := "punct:]"; 1779 Space : constant String := "space:]"; 1780 Upper : constant String := "upper:]"; 1781 Word : constant String := "word:]"; 1782 Xdigit : constant String := "xdigit:]"; 1783 1784 begin 1785 -- Case of character class specified 1786 1787 if Parse_Pos <= Parse_End 1788 and then Expression (Parse_Pos) = ':' 1789 then 1790 Parse_Pos := Parse_Pos + 1; 1791 1792 -- Do we have something like: [[:^alpha:]] 1793 1794 if Parse_Pos <= Parse_End 1795 and then Expression (Parse_Pos) = '^' 1796 then 1797 Invert := True; 1798 Parse_Pos := Parse_Pos + 1; 1799 end if; 1800 1801 -- Check for class names based on first letter 1802 1803 case Expression (Parse_Pos) is 1804 when 'a' => 1805 1806 -- All 'a' classes have the same length (Alnum'Length) 1807 1808 if Parse_Pos + Alnum'Length - 1 <= Parse_End then 1809 if 1810 E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum 1811 then 1812 Class := 1813 (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); 1814 Parse_Pos := Parse_Pos + Alnum'Length; 1815 1816 elsif 1817 E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha 1818 then 1819 Class := 1820 (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); 1821 Parse_Pos := Parse_Pos + Alpha'Length; 1822 1823 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = 1824 Ascii_C 1825 then 1826 Class := 1827 (if Invert then ANYOF_NASCII else ANYOF_ASCII); 1828 Parse_Pos := Parse_Pos + Ascii_C'Length; 1829 else 1830 Fail ("Invalid character class: " & E); 1831 end if; 1832 1833 else 1834 Fail ("Invalid character class: " & E); 1835 end if; 1836 1837 when 'c' => 1838 if Parse_Pos + Cntrl'Length - 1 <= Parse_End 1839 and then 1840 E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl 1841 then 1842 Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); 1843 Parse_Pos := Parse_Pos + Cntrl'Length; 1844 else 1845 Fail ("Invalid character class: " & E); 1846 end if; 1847 1848 when 'd' => 1849 if Parse_Pos + Digit'Length - 1 <= Parse_End 1850 and then 1851 E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit 1852 then 1853 Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); 1854 Parse_Pos := Parse_Pos + Digit'Length; 1855 end if; 1856 1857 when 'g' => 1858 if Parse_Pos + Graph'Length - 1 <= Parse_End 1859 and then 1860 E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph 1861 then 1862 Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); 1863 Parse_Pos := Parse_Pos + Graph'Length; 1864 else 1865 Fail ("Invalid character class: " & E); 1866 end if; 1867 1868 when 'l' => 1869 if Parse_Pos + Lower'Length - 1 <= Parse_End 1870 and then 1871 E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower 1872 then 1873 Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); 1874 Parse_Pos := Parse_Pos + Lower'Length; 1875 else 1876 Fail ("Invalid character class: " & E); 1877 end if; 1878 1879 when 'p' => 1880 1881 -- All 'p' classes have the same length 1882 1883 if Parse_Pos + Print'Length - 1 <= Parse_End then 1884 if 1885 E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print 1886 then 1887 Class := 1888 (if Invert then ANYOF_NPRINT else ANYOF_PRINT); 1889 Parse_Pos := Parse_Pos + Print'Length; 1890 1891 elsif 1892 E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct 1893 then 1894 Class := 1895 (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); 1896 Parse_Pos := Parse_Pos + Punct'Length; 1897 1898 else 1899 Fail ("Invalid character class: " & E); 1900 end if; 1901 1902 else 1903 Fail ("Invalid character class: " & E); 1904 end if; 1905 1906 when 's' => 1907 if Parse_Pos + Space'Length - 1 <= Parse_End 1908 and then 1909 E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space 1910 then 1911 Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); 1912 Parse_Pos := Parse_Pos + Space'Length; 1913 else 1914 Fail ("Invalid character class: " & E); 1915 end if; 1916 1917 when 'u' => 1918 if Parse_Pos + Upper'Length - 1 <= Parse_End 1919 and then 1920 E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper 1921 then 1922 Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); 1923 Parse_Pos := Parse_Pos + Upper'Length; 1924 else 1925 Fail ("Invalid character class: " & E); 1926 end if; 1927 1928 when 'w' => 1929 if Parse_Pos + Word'Length - 1 <= Parse_End 1930 and then 1931 E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word 1932 then 1933 Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); 1934 Parse_Pos := Parse_Pos + Word'Length; 1935 else 1936 Fail ("Invalid character class: " & E); 1937 end if; 1938 1939 when 'x' => 1940 if Parse_Pos + Xdigit'Length - 1 <= Parse_End 1941 and then 1942 E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit 1943 then 1944 Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); 1945 Parse_Pos := Parse_Pos + Xdigit'Length; 1946 1947 else 1948 Fail ("Invalid character class: " & E); 1949 end if; 1950 1951 when others => 1952 Fail ("Invalid character class: " & E); 1953 end case; 1954 1955 -- Character class not specified 1956 1957 else 1958 return ANYOF_NONE; 1959 end if; 1960 1961 return Class; 1962 end Parse_Posix_Character_Class; 1963 1964 -- Local Declarations 1965 1966 Result : Pointer; 1967 1968 Expr_Flags : Expression_Flags; 1969 pragma Unreferenced (Expr_Flags); 1970 1971 -- Start of processing for Compile 1972 1973 begin 1974 Parse (False, Expr_Flags, Result); 1975 1976 if Result = 0 then 1977 Fail ("Couldn't compile expression"); 1978 end if; 1979 1980 Final_Code_Size := Emit_Ptr - 1; 1981 1982 -- Do we want to actually compile the expression, or simply get the 1983 -- code size ??? 1984 1985 if Emit_Ptr <= PM.Size then 1986 Optimize (PM); 1987 end if; 1988 1989 PM.Flags := Flags; 1990 end Compile; 1991 1992 function Compile 1993 (Expression : String; 1994 Flags : Regexp_Flags := No_Flags) return Pattern_Matcher 1995 is 1996 -- Assume the compiled regexp will fit in 1000 chars. If it does not we 1997 -- will have to compile a second time once the correct size is known. If 1998 -- it fits, we save a significant amount of time by avoiding the second 1999 -- compilation. 2000 2001 Dummy : Pattern_Matcher (1000); 2002 Size : Program_Size; 2003 2004 begin 2005 Compile (Dummy, Expression, Size, Flags); 2006 2007 if Size <= Dummy.Size then 2008 return Pattern_Matcher' 2009 (Size => Size, 2010 First => Dummy.First, 2011 Anchored => Dummy.Anchored, 2012 Must_Have => Dummy.Must_Have, 2013 Must_Have_Length => Dummy.Must_Have_Length, 2014 Paren_Count => Dummy.Paren_Count, 2015 Flags => Dummy.Flags, 2016 Program => 2017 Dummy.Program 2018 (Dummy.Program'First .. Dummy.Program'First + Size - 1)); 2019 else 2020 -- We have to recompile now that we know the size 2021 -- ??? Can we use Ada 2005's return construct ? 2022 2023 declare 2024 Result : Pattern_Matcher (Size); 2025 begin 2026 Compile (Result, Expression, Size, Flags); 2027 return Result; 2028 end; 2029 end if; 2030 end Compile; 2031 2032 procedure Compile 2033 (Matcher : out Pattern_Matcher; 2034 Expression : String; 2035 Flags : Regexp_Flags := No_Flags) 2036 is 2037 Size : Program_Size; 2038 2039 begin 2040 Compile (Matcher, Expression, Size, Flags); 2041 2042 if Size > Matcher.Size then 2043 raise Expression_Error with "Pattern_Matcher is too small"; 2044 end if; 2045 end Compile; 2046 2047 -------------------- 2048 -- Dump_Operation -- 2049 -------------------- 2050 2051 procedure Dump_Operation 2052 (Program : Program_Data; 2053 Index : Pointer; 2054 Indent : Natural) 2055 is 2056 Current : Pointer := Index; 2057 begin 2058 Dump_Until (Program, Current, Current + 1, Indent); 2059 end Dump_Operation; 2060 2061 ---------------- 2062 -- Dump_Until -- 2063 ---------------- 2064 2065 procedure Dump_Until 2066 (Program : Program_Data; 2067 Index : in out Pointer; 2068 Till : Pointer; 2069 Indent : Natural; 2070 Do_Print : Boolean := True) 2071 is 2072 function Image (S : String) return String; 2073 -- Remove leading space 2074 2075 ----------- 2076 -- Image -- 2077 ----------- 2078 2079 function Image (S : String) return String is 2080 begin 2081 if S (S'First) = ' ' then 2082 return S (S'First + 1 .. S'Last); 2083 else 2084 return S; 2085 end if; 2086 end Image; 2087 2088 -- Local variables 2089 2090 Op : Opcode; 2091 Next : Pointer; 2092 Length : Pointer; 2093 Local_Indent : Natural := Indent; 2094 2095 -- Start of processing for Dump_Until 2096 2097 begin 2098 while Index < Till loop 2099 Op := Opcode'Val (Character'Pos ((Program (Index)))); 2100 Next := Get_Next (Program, Index); 2101 2102 if Do_Print then 2103 declare 2104 Point : constant String := Pointer'Image (Index); 2105 begin 2106 Put ((1 .. 4 - Point'Length => ' ') 2107 & Point & ":" 2108 & (1 .. Local_Indent * 2 => ' ') & Opcode'Image (Op)); 2109 end; 2110 2111 -- Print the parenthesis number 2112 2113 if Op = OPEN or else Op = CLOSE or else Op = REFF then 2114 Put (Image (Natural'Image 2115 (Character'Pos 2116 (Program (Index + Next_Pointer_Bytes))))); 2117 end if; 2118 2119 if Next = Index then 2120 Put (" (-)"); 2121 else 2122 Put (" (" & Image (Pointer'Image (Next)) & ")"); 2123 end if; 2124 end if; 2125 2126 case Op is 2127 when ANYOF => 2128 declare 2129 Bitmap : Character_Class; 2130 Last : Character := ASCII.NUL; 2131 Current : Natural := 0; 2132 Current_Char : Character; 2133 2134 begin 2135 Bitmap_Operand (Program, Index, Bitmap); 2136 2137 if Do_Print then 2138 Put ("["); 2139 2140 while Current <= 255 loop 2141 Current_Char := Character'Val (Current); 2142 2143 -- First item in a range 2144 2145 if Get_From_Class (Bitmap, Current_Char) then 2146 Last := Current_Char; 2147 2148 -- Search for the last item in the range 2149 2150 loop 2151 Current := Current + 1; 2152 exit when Current > 255; 2153 Current_Char := Character'Val (Current); 2154 exit when 2155 not Get_From_Class (Bitmap, Current_Char); 2156 end loop; 2157 2158 if not Is_Graphic (Last) then 2159 Put (Last'Img); 2160 else 2161 Put (Last); 2162 end if; 2163 2164 if Character'Succ (Last) /= Current_Char then 2165 Put ("\-" & Character'Pred (Current_Char)); 2166 end if; 2167 2168 else 2169 Current := Current + 1; 2170 end if; 2171 end loop; 2172 2173 Put_Line ("]"); 2174 end if; 2175 2176 Index := Index + Next_Pointer_Bytes + Bitmap'Length; 2177 end; 2178 2179 when EXACT | EXACTF => 2180 Length := String_Length (Program, Index); 2181 if Do_Print then 2182 Put (" (" & Image (Program_Size'Image (Length + 1)) 2183 & " chars) <" 2184 & String (Program (String_Operand (Index) 2185 .. String_Operand (Index) 2186 + Length))); 2187 Put_Line (">"); 2188 end if; 2189 2190 Index := String_Operand (Index) + Length + 1; 2191 2192 -- Node operand 2193 2194 when BRANCH | STAR | PLUS => 2195 if Do_Print then 2196 New_Line; 2197 end if; 2198 2199 Index := Index + Next_Pointer_Bytes; 2200 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2201 Local_Indent + 1, Do_Print); 2202 2203 when CURLY | CURLYX => 2204 if Do_Print then 2205 Put_Line 2206 (" {" 2207 & Image (Natural'Image 2208 (Read_Natural (Program, Index + Next_Pointer_Bytes))) 2209 & "," 2210 & Image (Natural'Image (Read_Natural (Program, Index + 5))) 2211 & "}"); 2212 end if; 2213 2214 Index := Index + 7; 2215 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2216 Local_Indent + 1, Do_Print); 2217 2218 when OPEN => 2219 if Do_Print then 2220 New_Line; 2221 end if; 2222 2223 Index := Index + 4; 2224 Local_Indent := Local_Indent + 1; 2225 2226 when CLOSE | REFF => 2227 if Do_Print then 2228 New_Line; 2229 end if; 2230 2231 Index := Index + 4; 2232 2233 if Op = CLOSE then 2234 Local_Indent := Local_Indent - 1; 2235 end if; 2236 2237 when others => 2238 Index := Index + Next_Pointer_Bytes; 2239 2240 if Do_Print then 2241 New_Line; 2242 end if; 2243 2244 exit when Op = EOP; 2245 end case; 2246 end loop; 2247 end Dump_Until; 2248 2249 ---------- 2250 -- Dump -- 2251 ---------- 2252 2253 procedure Dump (Self : Pattern_Matcher) is 2254 Program : Program_Data renames Self.Program; 2255 Index : Pointer := Program'First; 2256 2257 -- Start of processing for Dump 2258 2259 begin 2260 Put_Line ("Must start with (Self.First) = " 2261 & Character'Image (Self.First)); 2262 2263 if (Self.Flags and Case_Insensitive) /= 0 then 2264 Put_Line (" Case_Insensitive mode"); 2265 end if; 2266 2267 if (Self.Flags and Single_Line) /= 0 then 2268 Put_Line (" Single_Line mode"); 2269 end if; 2270 2271 if (Self.Flags and Multiple_Lines) /= 0 then 2272 Put_Line (" Multiple_Lines mode"); 2273 end if; 2274 2275 Dump_Until (Program, Index, Self.Program'Last + 1, 0); 2276 end Dump; 2277 2278 -------------------- 2279 -- Get_From_Class -- 2280 -------------------- 2281 2282 function Get_From_Class 2283 (Bitmap : Character_Class; 2284 C : Character) return Boolean 2285 is 2286 Value : constant Class_Byte := Character'Pos (C); 2287 begin 2288 return 2289 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; 2290 end Get_From_Class; 2291 2292 -------------- 2293 -- Get_Next -- 2294 -------------- 2295 2296 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is 2297 begin 2298 return IP + Pointer (Read_Natural (Program, IP + 1)); 2299 end Get_Next; 2300 2301 -------------- 2302 -- Is_Alnum -- 2303 -------------- 2304 2305 function Is_Alnum (C : Character) return Boolean is 2306 begin 2307 return Is_Alphanumeric (C) or else C = '_'; 2308 end Is_Alnum; 2309 2310 ------------------ 2311 -- Is_Printable -- 2312 ------------------ 2313 2314 function Is_Printable (C : Character) return Boolean is 2315 begin 2316 -- Printable if space or graphic character or other whitespace 2317 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) 2318 2319 return C in Character'Val (32) .. Character'Val (126) 2320 or else C in ASCII.HT .. ASCII.CR; 2321 end Is_Printable; 2322 2323 -------------------- 2324 -- Is_White_Space -- 2325 -------------------- 2326 2327 function Is_White_Space (C : Character) return Boolean is 2328 begin 2329 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 2330 2331 return C = ' ' or else C in ASCII.HT .. ASCII.CR; 2332 end Is_White_Space; 2333 2334 ----------- 2335 -- Match -- 2336 ----------- 2337 2338 procedure Match 2339 (Self : Pattern_Matcher; 2340 Data : String; 2341 Matches : out Match_Array; 2342 Data_First : Integer := -1; 2343 Data_Last : Positive := Positive'Last) 2344 is 2345 Program : Program_Data renames Self.Program; -- Shorter notation 2346 2347 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); 2348 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); 2349 2350 -- Global work variables 2351 2352 Input_Pos : Natural; -- String-input pointer 2353 BOL_Pos : Natural; -- Beginning of input, for ^ check 2354 Matched : Boolean := False; -- Until proven True 2355 2356 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, 2357 Matches'Last)); 2358 -- Stores the value of all the parenthesis pairs. 2359 -- We do not use directly Matches, so that we can also use back 2360 -- references (REFF) even if Matches is too small. 2361 2362 type Natural_Array is array (Match_Count range <>) of Natural; 2363 Matches_Tmp : Natural_Array (Matches_Full'Range); 2364 -- Save the opening position of parenthesis 2365 2366 Last_Paren : Natural := 0; 2367 -- Last parenthesis seen 2368 2369 Greedy : Boolean := True; 2370 -- True if the next operator should be greedy 2371 2372 type Current_Curly_Record; 2373 type Current_Curly_Access is access all Current_Curly_Record; 2374 type Current_Curly_Record is record 2375 Paren_Floor : Natural; -- How far back to strip parenthesis data 2376 Cur : Integer; -- How many instances of scan we've matched 2377 Min : Natural; -- Minimal number of scans to match 2378 Max : Natural; -- Maximal number of scans to match 2379 Greedy : Boolean; -- Whether to work our way up or down 2380 Scan : Pointer; -- The thing to match 2381 Next : Pointer; -- What has to match after it 2382 Lastloc : Natural; -- Where we started matching this scan 2383 Old_Cc : Current_Curly_Access; -- Before we started this one 2384 end record; 2385 -- Data used to handle the curly operator and the plus and star 2386 -- operators for complex expressions. 2387 2388 Current_Curly : Current_Curly_Access := null; 2389 -- The curly currently being processed 2390 2391 ----------------------- 2392 -- Local Subprograms -- 2393 ----------------------- 2394 2395 function Index (Start : Positive; C : Character) return Natural; 2396 -- Find character C in Data starting at Start and return position 2397 2398 function Repeat 2399 (IP : Pointer; 2400 Max : Natural := Natural'Last) return Natural; 2401 -- Repeatedly match something simple, report how many 2402 -- It only matches on things of length 1. 2403 -- Starting from Input_Pos, it matches at most Max CURLY. 2404 2405 function Try (Pos : Positive) return Boolean; 2406 -- Try to match at specific point 2407 2408 function Match (IP : Pointer) return Boolean; 2409 -- This is the main matching routine. Conceptually the strategy 2410 -- is simple: check to see whether the current node matches, 2411 -- call self recursively to see whether the rest matches, 2412 -- and then act accordingly. 2413 -- 2414 -- In practice Match makes some effort to avoid recursion, in 2415 -- particular by going through "ordinary" nodes (that don't 2416 -- need to know whether the rest of the match failed) by 2417 -- using a loop instead of recursion. 2418 -- Why is the above comment part of the spec rather than body ??? 2419 2420 function Match_Whilem return Boolean; 2421 -- Return True if a WHILEM matches the Current_Curly 2422 2423 function Recurse_Match (IP : Pointer; From : Natural) return Boolean; 2424 pragma Inline (Recurse_Match); 2425 -- Calls Match recursively. It saves and restores the parenthesis 2426 -- status and location in the input stream correctly, so that 2427 -- backtracking is possible 2428 2429 function Match_Simple_Operator 2430 (Op : Opcode; 2431 Scan : Pointer; 2432 Next : Pointer; 2433 Greedy : Boolean) return Boolean; 2434 -- Return True it the simple operator (possibly non-greedy) matches 2435 2436 Dump_Indent : Integer := -1; 2437 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); 2438 procedure Dump_Error (Msg : String); 2439 -- Debug: print the current context 2440 2441 pragma Inline (Index); 2442 pragma Inline (Repeat); 2443 2444 -- These are two complex functions, but used only once 2445 2446 pragma Inline (Match_Whilem); 2447 pragma Inline (Match_Simple_Operator); 2448 2449 ----------- 2450 -- Index -- 2451 ----------- 2452 2453 function Index (Start : Positive; C : Character) return Natural is 2454 begin 2455 for J in Start .. Last_In_Data loop 2456 if Data (J) = C then 2457 return J; 2458 end if; 2459 end loop; 2460 2461 return 0; 2462 end Index; 2463 2464 ------------------- 2465 -- Recurse_Match -- 2466 ------------------- 2467 2468 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is 2469 L : constant Natural := Last_Paren; 2470 Tmp_F : constant Match_Array := 2471 Matches_Full (From + 1 .. Matches_Full'Last); 2472 Start : constant Natural_Array := 2473 Matches_Tmp (From + 1 .. Matches_Tmp'Last); 2474 Input : constant Natural := Input_Pos; 2475 2476 Dump_Indent_Save : constant Integer := Dump_Indent; 2477 2478 begin 2479 if Match (IP) then 2480 return True; 2481 end if; 2482 2483 Last_Paren := L; 2484 Matches_Full (Tmp_F'Range) := Tmp_F; 2485 Matches_Tmp (Start'Range) := Start; 2486 Input_Pos := Input; 2487 Dump_Indent := Dump_Indent_Save; 2488 return False; 2489 end Recurse_Match; 2490 2491 ------------------ 2492 -- Dump_Current -- 2493 ------------------ 2494 2495 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is 2496 Length : constant := 10; 2497 Pos : constant String := Integer'Image (Input_Pos); 2498 2499 begin 2500 if Prefix then 2501 Put ((1 .. 5 - Pos'Length => ' ')); 2502 Put (Pos & " <" 2503 & Data (Input_Pos 2504 .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); 2505 Put ((1 .. Length - 1 - Last_In_Data + Input_Pos => ' ')); 2506 Put ("> |"); 2507 2508 else 2509 Put (" "); 2510 end if; 2511 2512 Dump_Operation (Program, Scan, Indent => Dump_Indent); 2513 end Dump_Current; 2514 2515 ---------------- 2516 -- Dump_Error -- 2517 ---------------- 2518 2519 procedure Dump_Error (Msg : String) is 2520 begin 2521 Put (" | "); 2522 Put ((1 .. Dump_Indent * 2 => ' ')); 2523 Put_Line (Msg); 2524 end Dump_Error; 2525 2526 ----------- 2527 -- Match -- 2528 ----------- 2529 2530 function Match (IP : Pointer) return Boolean is 2531 Scan : Pointer := IP; 2532 Next : Pointer; 2533 Op : Opcode; 2534 Result : Boolean; 2535 2536 begin 2537 Dump_Indent := Dump_Indent + 1; 2538 2539 State_Machine : 2540 loop 2541 pragma Assert (Scan /= 0); 2542 2543 -- Determine current opcode and count its usage in debug mode 2544 2545 Op := Opcode'Val (Character'Pos (Program (Scan))); 2546 2547 -- Calculate offset of next instruction. Second character is most 2548 -- significant in Program_Data. 2549 2550 Next := Get_Next (Program, Scan); 2551 2552 if Debug then 2553 Dump_Current (Scan); 2554 end if; 2555 2556 case Op is 2557 when EOP => 2558 Dump_Indent := Dump_Indent - 1; 2559 return True; -- Success 2560 2561 when BRANCH => 2562 if Program (Next) /= BRANCH then 2563 Next := Operand (Scan); -- No choice, avoid recursion 2564 2565 else 2566 loop 2567 if Recurse_Match (Operand (Scan), 0) then 2568 Dump_Indent := Dump_Indent - 1; 2569 return True; 2570 end if; 2571 2572 Scan := Get_Next (Program, Scan); 2573 exit when Scan = 0 or else Program (Scan) /= BRANCH; 2574 end loop; 2575 2576 exit State_Machine; 2577 end if; 2578 2579 when NOTHING => 2580 null; 2581 2582 when BOL => 2583 exit State_Machine when Input_Pos /= BOL_Pos 2584 and then ((Self.Flags and Multiple_Lines) = 0 2585 or else Data (Input_Pos - 1) /= ASCII.LF); 2586 2587 when MBOL => 2588 exit State_Machine when Input_Pos /= BOL_Pos 2589 and then Data (Input_Pos - 1) /= ASCII.LF; 2590 2591 when SBOL => 2592 exit State_Machine when Input_Pos /= BOL_Pos; 2593 2594 when EOL => 2595 exit State_Machine when Input_Pos <= Data'Last 2596 and then ((Self.Flags and Multiple_Lines) = 0 2597 or else Data (Input_Pos) /= ASCII.LF); 2598 2599 when MEOL => 2600 exit State_Machine when Input_Pos <= Data'Last 2601 and then Data (Input_Pos) /= ASCII.LF; 2602 2603 when SEOL => 2604 exit State_Machine when Input_Pos <= Data'Last; 2605 2606 when BOUND | NBOUND => 2607 2608 -- Was last char in word ? 2609 2610 declare 2611 N : Boolean := False; 2612 Ln : Boolean := False; 2613 2614 begin 2615 if Input_Pos /= First_In_Data then 2616 N := Is_Alnum (Data (Input_Pos - 1)); 2617 end if; 2618 2619 Ln := 2620 (if Input_Pos > Last_In_Data 2621 then False 2622 else Is_Alnum (Data (Input_Pos))); 2623 2624 if Op = BOUND then 2625 if N = Ln then 2626 exit State_Machine; 2627 end if; 2628 else 2629 if N /= Ln then 2630 exit State_Machine; 2631 end if; 2632 end if; 2633 end; 2634 2635 when SPACE => 2636 exit State_Machine when Input_Pos > Last_In_Data 2637 or else not Is_White_Space (Data (Input_Pos)); 2638 Input_Pos := Input_Pos + 1; 2639 2640 when NSPACE => 2641 exit State_Machine when Input_Pos > Last_In_Data 2642 or else Is_White_Space (Data (Input_Pos)); 2643 Input_Pos := Input_Pos + 1; 2644 2645 when DIGIT => 2646 exit State_Machine when Input_Pos > Last_In_Data 2647 or else not Is_Digit (Data (Input_Pos)); 2648 Input_Pos := Input_Pos + 1; 2649 2650 when NDIGIT => 2651 exit State_Machine when Input_Pos > Last_In_Data 2652 or else Is_Digit (Data (Input_Pos)); 2653 Input_Pos := Input_Pos + 1; 2654 2655 when ALNUM => 2656 exit State_Machine when Input_Pos > Last_In_Data 2657 or else not Is_Alnum (Data (Input_Pos)); 2658 Input_Pos := Input_Pos + 1; 2659 2660 when NALNUM => 2661 exit State_Machine when Input_Pos > Last_In_Data 2662 or else Is_Alnum (Data (Input_Pos)); 2663 Input_Pos := Input_Pos + 1; 2664 2665 when ANY => 2666 exit State_Machine when Input_Pos > Last_In_Data 2667 or else Data (Input_Pos) = ASCII.LF; 2668 Input_Pos := Input_Pos + 1; 2669 2670 when SANY => 2671 exit State_Machine when Input_Pos > Last_In_Data; 2672 Input_Pos := Input_Pos + 1; 2673 2674 when EXACT => 2675 declare 2676 Opnd : Pointer := String_Operand (Scan); 2677 Current : Positive := Input_Pos; 2678 Last : constant Pointer := 2679 Opnd + String_Length (Program, Scan); 2680 2681 begin 2682 while Opnd <= Last loop 2683 exit State_Machine when Current > Last_In_Data 2684 or else Program (Opnd) /= Data (Current); 2685 Current := Current + 1; 2686 Opnd := Opnd + 1; 2687 end loop; 2688 2689 Input_Pos := Current; 2690 end; 2691 2692 when EXACTF => 2693 declare 2694 Opnd : Pointer := String_Operand (Scan); 2695 Current : Positive := Input_Pos; 2696 2697 Last : constant Pointer := 2698 Opnd + String_Length (Program, Scan); 2699 2700 begin 2701 while Opnd <= Last loop 2702 exit State_Machine when Current > Last_In_Data 2703 or else Program (Opnd) /= To_Lower (Data (Current)); 2704 Current := Current + 1; 2705 Opnd := Opnd + 1; 2706 end loop; 2707 2708 Input_Pos := Current; 2709 end; 2710 2711 when ANYOF => 2712 declare 2713 Bitmap : Character_Class; 2714 begin 2715 Bitmap_Operand (Program, Scan, Bitmap); 2716 exit State_Machine when Input_Pos > Last_In_Data 2717 or else not Get_From_Class (Bitmap, Data (Input_Pos)); 2718 Input_Pos := Input_Pos + 1; 2719 end; 2720 2721 when OPEN => 2722 declare 2723 No : constant Natural := 2724 Character'Pos (Program (Operand (Scan))); 2725 begin 2726 Matches_Tmp (No) := Input_Pos; 2727 end; 2728 2729 when CLOSE => 2730 declare 2731 No : constant Natural := 2732 Character'Pos (Program (Operand (Scan))); 2733 2734 begin 2735 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); 2736 2737 if Last_Paren < No then 2738 Last_Paren := No; 2739 end if; 2740 end; 2741 2742 when REFF => 2743 declare 2744 No : constant Natural := 2745 Character'Pos (Program (Operand (Scan))); 2746 2747 Data_Pos : Natural; 2748 2749 begin 2750 -- If we haven't seen that parenthesis yet 2751 2752 if Last_Paren < No then 2753 Dump_Indent := Dump_Indent - 1; 2754 2755 if Debug then 2756 Dump_Error ("REFF: No match, backtracking"); 2757 end if; 2758 2759 return False; 2760 end if; 2761 2762 Data_Pos := Matches_Full (No).First; 2763 2764 while Data_Pos <= Matches_Full (No).Last loop 2765 if Input_Pos > Last_In_Data 2766 or else Data (Input_Pos) /= Data (Data_Pos) 2767 then 2768 Dump_Indent := Dump_Indent - 1; 2769 2770 if Debug then 2771 Dump_Error ("REFF: No match, backtracking"); 2772 end if; 2773 2774 return False; 2775 end if; 2776 2777 Input_Pos := Input_Pos + 1; 2778 Data_Pos := Data_Pos + 1; 2779 end loop; 2780 end; 2781 2782 when MINMOD => 2783 Greedy := False; 2784 2785 when STAR | PLUS | CURLY => 2786 declare 2787 Greed : constant Boolean := Greedy; 2788 begin 2789 Greedy := True; 2790 Result := Match_Simple_Operator (Op, Scan, Next, Greed); 2791 Dump_Indent := Dump_Indent - 1; 2792 return Result; 2793 end; 2794 2795 when CURLYX => 2796 2797 -- Looking at something like: 2798 2799 -- 1: CURLYX {n,m} (->4) 2800 -- 2: code for complex thing (->3) 2801 -- 3: WHILEM (->0) 2802 -- 4: NOTHING 2803 2804 declare 2805 Min : constant Natural := 2806 Read_Natural (Program, Scan + Next_Pointer_Bytes); 2807 Max : constant Natural := 2808 Read_Natural 2809 (Program, Scan + Next_Pointer_Bytes + 2); 2810 Cc : aliased Current_Curly_Record; 2811 2812 Has_Match : Boolean; 2813 2814 begin 2815 Cc := (Paren_Floor => Last_Paren, 2816 Cur => -1, 2817 Min => Min, 2818 Max => Max, 2819 Greedy => Greedy, 2820 Scan => Scan + 7, 2821 Next => Next, 2822 Lastloc => 0, 2823 Old_Cc => Current_Curly); 2824 Greedy := True; 2825 Current_Curly := Cc'Unchecked_Access; 2826 2827 Has_Match := Match (Next - Next_Pointer_Bytes); 2828 2829 -- Start on the WHILEM 2830 2831 Current_Curly := Cc.Old_Cc; 2832 Dump_Indent := Dump_Indent - 1; 2833 2834 if not Has_Match then 2835 if Debug then 2836 Dump_Error ("CURLYX failed..."); 2837 end if; 2838 end if; 2839 2840 return Has_Match; 2841 end; 2842 2843 when WHILEM => 2844 Result := Match_Whilem; 2845 Dump_Indent := Dump_Indent - 1; 2846 2847 if Debug and then not Result then 2848 Dump_Error ("WHILEM: no match, backtracking"); 2849 end if; 2850 2851 return Result; 2852 end case; 2853 2854 Scan := Next; 2855 end loop State_Machine; 2856 2857 if Debug then 2858 Dump_Error ("failed..."); 2859 Dump_Indent := Dump_Indent - 1; 2860 end if; 2861 2862 -- If we get here, there is no match. For successful matches when EOP 2863 -- is the terminating point. 2864 2865 return False; 2866 end Match; 2867 2868 --------------------------- 2869 -- Match_Simple_Operator -- 2870 --------------------------- 2871 2872 function Match_Simple_Operator 2873 (Op : Opcode; 2874 Scan : Pointer; 2875 Next : Pointer; 2876 Greedy : Boolean) return Boolean 2877 is 2878 Next_Char : Character := ASCII.NUL; 2879 Next_Char_Known : Boolean := False; 2880 No : Integer; -- Can be negative 2881 Min : Natural; 2882 Max : Natural := Natural'Last; 2883 Operand_Code : Pointer; 2884 Old : Natural; 2885 Last_Pos : Natural; 2886 Save : constant Natural := Input_Pos; 2887 2888 begin 2889 -- Lookahead to avoid useless match attempts when we know what 2890 -- character comes next. 2891 2892 if Program (Next) = EXACT then 2893 Next_Char := Program (String_Operand (Next)); 2894 Next_Char_Known := True; 2895 end if; 2896 2897 -- Find the minimal and maximal values for the operator 2898 2899 case Op is 2900 when STAR => 2901 Min := 0; 2902 Operand_Code := Operand (Scan); 2903 2904 when PLUS => 2905 Min := 1; 2906 Operand_Code := Operand (Scan); 2907 2908 when others => 2909 Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); 2910 Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); 2911 Operand_Code := Scan + 7; 2912 end case; 2913 2914 if Debug then 2915 Dump_Current (Operand_Code, Prefix => False); 2916 end if; 2917 2918 -- Non greedy operators 2919 2920 if not Greedy then 2921 2922 -- Test we can repeat at least Min times 2923 2924 if Min /= 0 then 2925 No := Repeat (Operand_Code, Min); 2926 2927 if No < Min then 2928 if Debug then 2929 Dump_Error ("failed... matched" & No'Img & " times"); 2930 end if; 2931 2932 return False; 2933 end if; 2934 end if; 2935 2936 Old := Input_Pos; 2937 2938 -- Find the place where 'next' could work 2939 2940 if Next_Char_Known then 2941 2942 -- Last position to check 2943 2944 if Max = Natural'Last then 2945 Last_Pos := Last_In_Data; 2946 else 2947 Last_Pos := Input_Pos + Max; 2948 2949 if Last_Pos > Last_In_Data then 2950 Last_Pos := Last_In_Data; 2951 end if; 2952 end if; 2953 2954 -- Look for the first possible opportunity 2955 2956 if Debug then 2957 Dump_Error ("Next_Char must be " & Next_Char); 2958 end if; 2959 2960 loop 2961 -- Find the next possible position 2962 2963 while Input_Pos <= Last_Pos 2964 and then Data (Input_Pos) /= Next_Char 2965 loop 2966 Input_Pos := Input_Pos + 1; 2967 end loop; 2968 2969 if Input_Pos > Last_Pos then 2970 return False; 2971 end if; 2972 2973 -- Check that we still match if we stop at the position we 2974 -- just found. 2975 2976 declare 2977 Num : constant Natural := Input_Pos - Old; 2978 2979 begin 2980 Input_Pos := Old; 2981 2982 if Debug then 2983 Dump_Error ("Would we still match at that position?"); 2984 end if; 2985 2986 if Repeat (Operand_Code, Num) < Num then 2987 return False; 2988 end if; 2989 end; 2990 2991 -- Input_Pos now points to the new position 2992 2993 if Match (Get_Next (Program, Scan)) then 2994 return True; 2995 end if; 2996 2997 Old := Input_Pos; 2998 Input_Pos := Input_Pos + 1; 2999 end loop; 3000 3001 -- We do not know what the next character is 3002 3003 else 3004 while Max >= Min loop 3005 if Debug then 3006 Dump_Error ("Non-greedy repeat, N=" & Min'Img); 3007 Dump_Error ("Do we still match Next if we stop here?"); 3008 end if; 3009 3010 -- If the next character matches 3011 3012 if Recurse_Match (Next, 1) then 3013 return True; 3014 end if; 3015 3016 Input_Pos := Save + Min; 3017 3018 -- Could not or did not match -- move forward 3019 3020 if Repeat (Operand_Code, 1) /= 0 then 3021 Min := Min + 1; 3022 else 3023 if Debug then 3024 Dump_Error ("Non-greedy repeat failed..."); 3025 end if; 3026 3027 return False; 3028 end if; 3029 end loop; 3030 end if; 3031 3032 return False; 3033 3034 -- Greedy operators 3035 3036 else 3037 No := Repeat (Operand_Code, Max); 3038 3039 if Debug and then No < Min then 3040 Dump_Error ("failed... matched" & No'Img & " times"); 3041 end if; 3042 3043 -- ??? Perl has some special code here in case the next 3044 -- instruction is of type EOL, since $ and \Z can match before 3045 -- *and* after newline at the end. 3046 3047 -- ??? Perl has some special code here in case (paren) is True 3048 3049 -- Else, if we don't have any parenthesis 3050 3051 while No >= Min loop 3052 if not Next_Char_Known 3053 or else (Input_Pos <= Last_In_Data 3054 and then Data (Input_Pos) = Next_Char) 3055 then 3056 if Match (Next) then 3057 return True; 3058 end if; 3059 end if; 3060 3061 -- Could not or did not work, we back up 3062 3063 No := No - 1; 3064 Input_Pos := Save + No; 3065 end loop; 3066 3067 return False; 3068 end if; 3069 end Match_Simple_Operator; 3070 3071 ------------------ 3072 -- Match_Whilem -- 3073 ------------------ 3074 3075 -- This is really hard to understand, because after we match what we 3076 -- are trying to match, we must make sure the rest of the REx is going 3077 -- to match for sure, and to do that we have to go back UP the parse 3078 -- tree by recursing ever deeper. And if it fails, we have to reset 3079 -- our parent's current state that we can try again after backing off. 3080 3081 function Match_Whilem return Boolean is 3082 Cc : constant Current_Curly_Access := Current_Curly; 3083 3084 N : constant Natural := Cc.Cur + 1; 3085 Ln : Natural := 0; 3086 3087 Lastloc : constant Natural := Cc.Lastloc; 3088 -- Detection of 0-len 3089 3090 begin 3091 -- If degenerate scan matches "", assume scan done 3092 3093 if Input_Pos = Cc.Lastloc 3094 and then N >= Cc.Min 3095 then 3096 -- Temporarily restore the old context, and check that we 3097 -- match was comes after CURLYX. 3098 3099 Current_Curly := Cc.Old_Cc; 3100 3101 if Current_Curly /= null then 3102 Ln := Current_Curly.Cur; 3103 end if; 3104 3105 if Match (Cc.Next) then 3106 return True; 3107 end if; 3108 3109 if Current_Curly /= null then 3110 Current_Curly.Cur := Ln; 3111 end if; 3112 3113 Current_Curly := Cc; 3114 return False; 3115 end if; 3116 3117 -- First, just match a string of min scans 3118 3119 if N < Cc.Min then 3120 Cc.Cur := N; 3121 Cc.Lastloc := Input_Pos; 3122 3123 if Debug then 3124 Dump_Error 3125 ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); 3126 end if; 3127 3128 if Match (Cc.Scan) then 3129 return True; 3130 end if; 3131 3132 Cc.Cur := N - 1; 3133 Cc.Lastloc := Lastloc; 3134 3135 if Debug then 3136 Dump_Error ("failed..."); 3137 end if; 3138 3139 return False; 3140 end if; 3141 3142 -- Prefer next over scan for minimal matching 3143 3144 if not Cc.Greedy then 3145 Current_Curly := Cc.Old_Cc; 3146 3147 if Current_Curly /= null then 3148 Ln := Current_Curly.Cur; 3149 end if; 3150 3151 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then 3152 return True; 3153 end if; 3154 3155 if Current_Curly /= null then 3156 Current_Curly.Cur := Ln; 3157 end if; 3158 3159 Current_Curly := Cc; 3160 3161 -- Maximum greed exceeded ? 3162 3163 if N >= Cc.Max then 3164 if Debug then 3165 Dump_Error ("failed..."); 3166 end if; 3167 return False; 3168 end if; 3169 3170 -- Try scanning more and see if it helps 3171 Cc.Cur := N; 3172 Cc.Lastloc := Input_Pos; 3173 3174 if Debug then 3175 Dump_Error ("Next failed, what about Current?"); 3176 end if; 3177 3178 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3179 return True; 3180 end if; 3181 3182 Cc.Cur := N - 1; 3183 Cc.Lastloc := Lastloc; 3184 return False; 3185 end if; 3186 3187 -- Prefer scan over next for maximal matching 3188 3189 if N < Cc.Max then -- more greed allowed ? 3190 Cc.Cur := N; 3191 Cc.Lastloc := Input_Pos; 3192 3193 if Debug then 3194 Dump_Error ("Recurse at current position"); 3195 end if; 3196 3197 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3198 return True; 3199 end if; 3200 end if; 3201 3202 -- Failed deeper matches of scan, so see if this one works 3203 3204 Current_Curly := Cc.Old_Cc; 3205 3206 if Current_Curly /= null then 3207 Ln := Current_Curly.Cur; 3208 end if; 3209 3210 if Debug then 3211 Dump_Error ("Failed matching for later positions"); 3212 end if; 3213 3214 if Match (Cc.Next) then 3215 return True; 3216 end if; 3217 3218 if Current_Curly /= null then 3219 Current_Curly.Cur := Ln; 3220 end if; 3221 3222 Current_Curly := Cc; 3223 Cc.Cur := N - 1; 3224 Cc.Lastloc := Lastloc; 3225 3226 if Debug then 3227 Dump_Error ("failed..."); 3228 end if; 3229 3230 return False; 3231 end Match_Whilem; 3232 3233 ------------ 3234 -- Repeat -- 3235 ------------ 3236 3237 function Repeat 3238 (IP : Pointer; 3239 Max : Natural := Natural'Last) return Natural 3240 is 3241 Scan : Natural := Input_Pos; 3242 Last : Natural; 3243 Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP))); 3244 Count : Natural; 3245 C : Character; 3246 Is_First : Boolean := True; 3247 Bitmap : Character_Class; 3248 3249 begin 3250 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then 3251 Last := Last_In_Data; 3252 else 3253 Last := Scan + Max - 1; 3254 end if; 3255 3256 case Op is 3257 when ANY => 3258 while Scan <= Last 3259 and then Data (Scan) /= ASCII.LF 3260 loop 3261 Scan := Scan + 1; 3262 end loop; 3263 3264 when SANY => 3265 Scan := Last + 1; 3266 3267 when EXACT => 3268 3269 -- The string has only one character if Repeat was called 3270 3271 C := Program (String_Operand (IP)); 3272 while Scan <= Last 3273 and then C = Data (Scan) 3274 loop 3275 Scan := Scan + 1; 3276 end loop; 3277 3278 when EXACTF => 3279 3280 -- The string has only one character if Repeat was called 3281 3282 C := Program (String_Operand (IP)); 3283 while Scan <= Last 3284 and then To_Lower (C) = Data (Scan) 3285 loop 3286 Scan := Scan + 1; 3287 end loop; 3288 3289 when ANYOF => 3290 if Is_First then 3291 Bitmap_Operand (Program, IP, Bitmap); 3292 Is_First := False; 3293 end if; 3294 3295 while Scan <= Last 3296 and then Get_From_Class (Bitmap, Data (Scan)) 3297 loop 3298 Scan := Scan + 1; 3299 end loop; 3300 3301 when ALNUM => 3302 while Scan <= Last 3303 and then Is_Alnum (Data (Scan)) 3304 loop 3305 Scan := Scan + 1; 3306 end loop; 3307 3308 when NALNUM => 3309 while Scan <= Last 3310 and then not Is_Alnum (Data (Scan)) 3311 loop 3312 Scan := Scan + 1; 3313 end loop; 3314 3315 when SPACE => 3316 while Scan <= Last 3317 and then Is_White_Space (Data (Scan)) 3318 loop 3319 Scan := Scan + 1; 3320 end loop; 3321 3322 when NSPACE => 3323 while Scan <= Last 3324 and then not Is_White_Space (Data (Scan)) 3325 loop 3326 Scan := Scan + 1; 3327 end loop; 3328 3329 when DIGIT => 3330 while Scan <= Last 3331 and then Is_Digit (Data (Scan)) 3332 loop 3333 Scan := Scan + 1; 3334 end loop; 3335 3336 when NDIGIT => 3337 while Scan <= Last 3338 and then not Is_Digit (Data (Scan)) 3339 loop 3340 Scan := Scan + 1; 3341 end loop; 3342 3343 when others => 3344 raise Program_Error; 3345 end case; 3346 3347 Count := Scan - Input_Pos; 3348 Input_Pos := Scan; 3349 return Count; 3350 end Repeat; 3351 3352 --------- 3353 -- Try -- 3354 --------- 3355 3356 function Try (Pos : Positive) return Boolean is 3357 begin 3358 Input_Pos := Pos; 3359 Last_Paren := 0; 3360 Matches_Full := (others => No_Match); 3361 3362 if Match (Program_First) then 3363 Matches_Full (0) := (Pos, Input_Pos - 1); 3364 return True; 3365 end if; 3366 3367 return False; 3368 end Try; 3369 3370 -- Start of processing for Match 3371 3372 begin 3373 -- Do we have the regexp Never_Match? 3374 3375 if Self.Size = 0 then 3376 Matches := (others => No_Match); 3377 return; 3378 end if; 3379 3380 -- If there is a "must appear" string, look for it 3381 3382 if Self.Must_Have_Length > 0 then 3383 declare 3384 First : constant Character := Program (Self.Must_Have); 3385 Must_First : constant Pointer := Self.Must_Have; 3386 Must_Last : constant Pointer := 3387 Must_First + Pointer (Self.Must_Have_Length - 1); 3388 Next_Try : Natural := Index (First_In_Data, First); 3389 3390 begin 3391 while Next_Try /= 0 3392 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) 3393 = String (Program (Must_First .. Must_Last)) 3394 loop 3395 Next_Try := Index (Next_Try + 1, First); 3396 end loop; 3397 3398 if Next_Try = 0 then 3399 Matches := (others => No_Match); 3400 return; -- Not present 3401 end if; 3402 end; 3403 end if; 3404 3405 -- Mark beginning of line for ^ 3406 3407 BOL_Pos := Data'First; 3408 3409 -- Simplest case first: an anchored match need be tried only once 3410 3411 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then 3412 Matched := Try (First_In_Data); 3413 3414 elsif Self.Anchored then 3415 declare 3416 Next_Try : Natural := First_In_Data; 3417 begin 3418 -- Test the first position in the buffer 3419 Matched := Try (Next_Try); 3420 3421 -- Else only test after newlines 3422 3423 if not Matched then 3424 while Next_Try <= Last_In_Data loop 3425 while Next_Try <= Last_In_Data 3426 and then Data (Next_Try) /= ASCII.LF 3427 loop 3428 Next_Try := Next_Try + 1; 3429 end loop; 3430 3431 Next_Try := Next_Try + 1; 3432 3433 if Next_Try <= Last_In_Data then 3434 Matched := Try (Next_Try); 3435 exit when Matched; 3436 end if; 3437 end loop; 3438 end if; 3439 end; 3440 3441 elsif Self.First /= ASCII.NUL then 3442 -- We know what char it must start with 3443 3444 declare 3445 Next_Try : Natural := Index (First_In_Data, Self.First); 3446 3447 begin 3448 while Next_Try /= 0 loop 3449 Matched := Try (Next_Try); 3450 exit when Matched; 3451 Next_Try := Index (Next_Try + 1, Self.First); 3452 end loop; 3453 end; 3454 3455 else 3456 -- Messy cases: try all locations (including for the empty string) 3457 3458 Matched := Try (First_In_Data); 3459 3460 if not Matched then 3461 for S in First_In_Data + 1 .. Last_In_Data loop 3462 Matched := Try (S); 3463 exit when Matched; 3464 end loop; 3465 end if; 3466 end if; 3467 3468 -- Matched has its value 3469 3470 for J in Last_Paren + 1 .. Matches'Last loop 3471 Matches_Full (J) := No_Match; 3472 end loop; 3473 3474 Matches := Matches_Full (Matches'Range); 3475 end Match; 3476 3477 ----------- 3478 -- Match -- 3479 ----------- 3480 3481 function Match 3482 (Self : Pattern_Matcher; 3483 Data : String; 3484 Data_First : Integer := -1; 3485 Data_Last : Positive := Positive'Last) return Natural 3486 is 3487 Matches : Match_Array (0 .. 0); 3488 3489 begin 3490 Match (Self, Data, Matches, Data_First, Data_Last); 3491 if Matches (0) = No_Match then 3492 return Data'First - 1; 3493 else 3494 return Matches (0).First; 3495 end if; 3496 end Match; 3497 3498 function Match 3499 (Self : Pattern_Matcher; 3500 Data : String; 3501 Data_First : Integer := -1; 3502 Data_Last : Positive := Positive'Last) return Boolean 3503 is 3504 Matches : Match_Array (0 .. 0); 3505 3506 begin 3507 Match (Self, Data, Matches, Data_First, Data_Last); 3508 return Matches (0).First >= Data'First; 3509 end Match; 3510 3511 procedure Match 3512 (Expression : String; 3513 Data : String; 3514 Matches : out Match_Array; 3515 Size : Program_Size := Auto_Size; 3516 Data_First : Integer := -1; 3517 Data_Last : Positive := Positive'Last) 3518 is 3519 PM : Pattern_Matcher (Size); 3520 Finalize_Size : Program_Size; 3521 pragma Unreferenced (Finalize_Size); 3522 begin 3523 if Size = 0 then 3524 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3525 else 3526 Compile (PM, Expression, Finalize_Size); 3527 Match (PM, Data, Matches, Data_First, Data_Last); 3528 end if; 3529 end Match; 3530 3531 ----------- 3532 -- Match -- 3533 ----------- 3534 3535 function Match 3536 (Expression : String; 3537 Data : String; 3538 Size : Program_Size := Auto_Size; 3539 Data_First : Integer := -1; 3540 Data_Last : Positive := Positive'Last) return Natural 3541 is 3542 PM : Pattern_Matcher (Size); 3543 Final_Size : Program_Size; 3544 pragma Unreferenced (Final_Size); 3545 begin 3546 if Size = 0 then 3547 return Match (Compile (Expression), Data, Data_First, Data_Last); 3548 else 3549 Compile (PM, Expression, Final_Size); 3550 return Match (PM, Data, Data_First, Data_Last); 3551 end if; 3552 end Match; 3553 3554 ----------- 3555 -- Match -- 3556 ----------- 3557 3558 function Match 3559 (Expression : String; 3560 Data : String; 3561 Size : Program_Size := Auto_Size; 3562 Data_First : Integer := -1; 3563 Data_Last : Positive := Positive'Last) return Boolean 3564 is 3565 Matches : Match_Array (0 .. 0); 3566 PM : Pattern_Matcher (Size); 3567 Final_Size : Program_Size; 3568 pragma Unreferenced (Final_Size); 3569 begin 3570 if Size = 0 then 3571 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3572 else 3573 Compile (PM, Expression, Final_Size); 3574 Match (PM, Data, Matches, Data_First, Data_Last); 3575 end if; 3576 3577 return Matches (0).First >= Data'First; 3578 end Match; 3579 3580 ------------- 3581 -- Operand -- 3582 ------------- 3583 3584 function Operand (P : Pointer) return Pointer is 3585 begin 3586 return P + Next_Pointer_Bytes; 3587 end Operand; 3588 3589 -------------- 3590 -- Optimize -- 3591 -------------- 3592 3593 procedure Optimize (Self : in out Pattern_Matcher) is 3594 Scan : Pointer; 3595 Program : Program_Data renames Self.Program; 3596 3597 begin 3598 -- Start with safe defaults (no optimization): 3599 -- * No known first character of match 3600 -- * Does not necessarily start at beginning of line 3601 -- * No string known that has to appear in data 3602 3603 Self.First := ASCII.NUL; 3604 Self.Anchored := False; 3605 Self.Must_Have := Program'Last + 1; 3606 Self.Must_Have_Length := 0; 3607 3608 Scan := Program_First; -- First instruction (can be anything) 3609 3610 if Program (Scan) = EXACT then 3611 Self.First := Program (String_Operand (Scan)); 3612 3613 elsif Program (Scan) = BOL 3614 or else Program (Scan) = SBOL 3615 or else Program (Scan) = MBOL 3616 then 3617 Self.Anchored := True; 3618 end if; 3619 end Optimize; 3620 3621 ----------------- 3622 -- Paren_Count -- 3623 ----------------- 3624 3625 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is 3626 begin 3627 return Regexp.Paren_Count; 3628 end Paren_Count; 3629 3630 ----------- 3631 -- Quote -- 3632 ----------- 3633 3634 function Quote (Str : String) return String is 3635 S : String (1 .. Str'Length * 2); 3636 Last : Natural := 0; 3637 3638 begin 3639 for J in Str'Range loop 3640 case Str (J) is 3641 when '^' | '$' | '|' | '*' | '+' | '?' | '{' | 3642 '}' | '[' | ']' | '(' | ')' | '\' | '.' => 3643 3644 S (Last + 1) := '\'; 3645 S (Last + 2) := Str (J); 3646 Last := Last + 2; 3647 3648 when others => 3649 S (Last + 1) := Str (J); 3650 Last := Last + 1; 3651 end case; 3652 end loop; 3653 3654 return S (1 .. Last); 3655 end Quote; 3656 3657 ------------------ 3658 -- Read_Natural -- 3659 ------------------ 3660 3661 function Read_Natural 3662 (Program : Program_Data; 3663 IP : Pointer) return Natural 3664 is 3665 begin 3666 return Character'Pos (Program (IP)) + 3667 256 * Character'Pos (Program (IP + 1)); 3668 end Read_Natural; 3669 3670 ----------------- 3671 -- Reset_Class -- 3672 ----------------- 3673 3674 procedure Reset_Class (Bitmap : out Character_Class) is 3675 begin 3676 Bitmap := (others => 0); 3677 end Reset_Class; 3678 3679 ------------------ 3680 -- Set_In_Class -- 3681 ------------------ 3682 3683 procedure Set_In_Class 3684 (Bitmap : in out Character_Class; 3685 C : Character) 3686 is 3687 Value : constant Class_Byte := Character'Pos (C); 3688 begin 3689 Bitmap (Value / 8) := Bitmap (Value / 8) 3690 or Bit_Conversion (Value mod 8); 3691 end Set_In_Class; 3692 3693 ------------------- 3694 -- String_Length -- 3695 ------------------- 3696 3697 function String_Length 3698 (Program : Program_Data; 3699 P : Pointer) return Program_Size 3700 is 3701 begin 3702 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); 3703 return Character'Pos (Program (P + Next_Pointer_Bytes)); 3704 end String_Length; 3705 3706 -------------------- 3707 -- String_Operand -- 3708 -------------------- 3709 3710 function String_Operand (P : Pointer) return Pointer is 3711 begin 3712 return P + 4; 3713 end String_Operand; 3714 3715end System.Regpat; 3716