1------------------------------------------------------------------------------ 2-- -- 3-- GNAT LIBRARY COMPONENTS -- 4-- -- 5-- G N A T . R E G P A T -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1986 by University of Toronto. -- 10-- Copyright (C) 1999-2018, AdaCore -- 11-- -- 12-- GNAT is free software; you can redistribute it and/or modify it under -- 13-- terms of the GNU General Public License as published by the Free Soft- -- 14-- ware Foundation; either version 3, or (at your option) any later ver- -- 15-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 16-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 17-- or FITNESS FOR A PARTICULAR PURPOSE. -- 18-- -- 19-- As a special exception under Section 7 of GPL version 3, you are granted -- 20-- additional permissions described in the GCC Runtime Library Exception, -- 21-- version 3.1, as published by the Free Software Foundation. -- 22-- -- 23-- You should have received a copy of the GNU General Public License and -- 24-- a copy of the GCC Runtime Library Exception along with this program; -- 25-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 26-- <http://www.gnu.org/licenses/>. -- 27-- -- 28-- GNAT was originally developed by the GNAT team at New York University. -- 29-- Extensive contributions were provided by Ada Core Technologies Inc. -- 30-- -- 31------------------------------------------------------------------------------ 32 33-- This is an altered Ada 95 version of the original V8 style regular 34-- expression library written in C by Henry Spencer. Apart from the 35-- translation to Ada, the interface has been considerably changed to 36-- use the Ada String type instead of C-style nul-terminated strings. 37 38-- Beware that some of this code is subtly aware of the way operator 39-- precedence is structured in regular expressions. Serious changes in 40-- regular-expression syntax might require a total rethink. 41 42with System.IO; use System.IO; 43with Ada.Characters.Handling; use Ada.Characters.Handling; 44with Ada.Unchecked_Conversion; 45 46package body System.Regpat is 47 48 Debug : constant Boolean := False; 49 -- Set to True to activate debug traces. This is normally set to constant 50 -- False to simply delete all the trace code. It is to be edited to True 51 -- for internal debugging of the package. 52 53 ---------------------------- 54 -- Implementation details -- 55 ---------------------------- 56 57 -- This is essentially a linear encoding of a nondeterministic 58 -- finite-state machine, also known as syntax charts or 59 -- "railroad normal form" in parsing technology. 60 61 -- Each node is an opcode plus a "next" pointer, possibly plus an 62 -- operand. "Next" pointers of all nodes except BRANCH implement 63 -- concatenation; a "next" pointer with a BRANCH on both ends of it 64 -- is connecting two alternatives. 65 66 -- The operand of some types of node is a literal string; for others, 67 -- it is a node leading into a sub-FSM. In particular, the operand of 68 -- a BRANCH node is the first node of the branch. 69 -- (NB this is *not* a tree structure: the tail of the branch connects 70 -- to the thing following the set of BRANCHes). 71 72 -- You can see the exact byte-compiled version by using the Dump 73 -- subprogram. However, here are a few examples: 74 75 -- (a|b): 1 : BRANCH (next at 9) 76 -- 4 : EXACT (next at 17) operand=a 77 -- 9 : BRANCH (next at 17) 78 -- 12 : EXACT (next at 17) operand=b 79 -- 17 : EOP (next at 0) 80 -- 81 -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} 82 -- 8 : OPEN 1 (next at 12) 83 -- 12 : EXACT (next at 18) operand=ab 84 -- 18 : CLOSE 1 (next at 22) 85 -- 22 : WHILEM (next at 0) 86 -- 25 : NOTHING (next at 28) 87 -- 28 : EOP (next at 0) 88 89 -- The opcodes are: 90 91 type Opcode is 92 93 -- Name Operand? Meaning 94 95 (EOP, -- no End of program 96 MINMOD, -- no Next operator is not greedy 97 98 -- Classes of characters 99 100 ANY, -- no Match any one character except newline 101 SANY, -- no Match any character, including new line 102 ANYOF, -- class Match any character in this class 103 EXACT, -- str Match this string exactly 104 EXACTF, -- str Match this string (case-folding is one) 105 NOTHING, -- no Match empty string 106 SPACE, -- no Match any whitespace character 107 NSPACE, -- no Match any non-whitespace character 108 DIGIT, -- no Match any numeric character 109 NDIGIT, -- no Match any non-numeric character 110 ALNUM, -- no Match any alphanumeric character 111 NALNUM, -- no Match any non-alphanumeric character 112 113 -- Branches 114 115 BRANCH, -- node Match this alternative, or the next 116 117 -- Simple loops (when the following node is one character in length) 118 119 STAR, -- node Match this simple thing 0 or more times 120 PLUS, -- node Match this simple thing 1 or more times 121 CURLY, -- 2num node Match this simple thing between n and m times. 122 123 -- Complex loops 124 125 CURLYX, -- 2num node Match this complex thing {n,m} times 126 -- The nums are coded on two characters each 127 128 WHILEM, -- no Do curly processing and see if rest matches 129 130 -- Matches after or before a word 131 132 BOL, -- no Match "" at beginning of line 133 MBOL, -- no Same, assuming multiline (match after \n) 134 SBOL, -- no Same, assuming single line (don't match at \n) 135 EOL, -- no Match "" at end of line 136 MEOL, -- no Same, assuming multiline (match before \n) 137 SEOL, -- no Same, assuming single line (don't match at \n) 138 139 BOUND, -- no Match "" at any word boundary 140 NBOUND, -- no Match "" at any word non-boundary 141 142 -- Parenthesis groups handling 143 144 REFF, -- num Match some already matched string, folded 145 OPEN, -- num Mark this point in input as start of #n 146 CLOSE); -- num Analogous to OPEN 147 148 for Opcode'Size use 8; 149 150 -- Opcode notes: 151 152 -- BRANCH 153 -- The set of branches constituting a single choice are hooked 154 -- together with their "next" pointers, since precedence prevents 155 -- anything being concatenated to any individual branch. The 156 -- "next" pointer of the last BRANCH in a choice points to the 157 -- thing following the whole choice. This is also where the 158 -- final "next" pointer of each individual branch points; each 159 -- branch starts with the operand node of a BRANCH node. 160 161 -- STAR,PLUS 162 -- '?', and complex '*' and '+', are implemented with CURLYX. 163 -- branches. Simple cases (one character per match) are implemented with 164 -- STAR and PLUS for speed and to minimize recursive plunges. 165 166 -- OPEN,CLOSE 167 -- ...are numbered at compile time. 168 169 -- EXACT, EXACTF 170 -- There are in fact two arguments, the first one is the length (minus 171 -- one of the string argument), coded on one character, the second 172 -- argument is the string itself, coded on length + 1 characters. 173 174 -- A node is one char of opcode followed by two chars of "next" pointer. 175 -- "Next" pointers are stored as two 8-bit pieces, high order first. The 176 -- value is a positive offset from the opcode of the node containing it. 177 -- An operand, if any, simply follows the node. (Note that much of the 178 -- code generation knows about this implicit relationship.) 179 180 -- Using two bytes for the "next" pointer is vast overkill for most 181 -- things, but allows patterns to get big without disasters. 182 183 Next_Pointer_Bytes : constant := 3; 184 -- Points after the "next pointer" data. An instruction is therefore: 185 -- 1 byte: instruction opcode 186 -- 2 bytes: pointer to next instruction 187 -- * bytes: optional data for the instruction 188 189 ----------------------- 190 -- Character classes -- 191 ----------------------- 192 -- This is the implementation for character classes ([...]) in the 193 -- syntax for regular expressions. Each character (0..256) has an 194 -- entry into the table. This makes for a very fast matching 195 -- algorithm. 196 197 type Class_Byte is mod 256; 198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; 199 200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; 201 Bit_Conversion : constant Bit_Conversion_Array := 202 (1, 2, 4, 8, 16, 32, 64, 128); 203 204 type Std_Class is (ANYOF_NONE, 205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] 206 ANYOF_NALNUM, 207 ANYOF_SPACE, -- Space class [ \t\n\r\f] 208 ANYOF_NSPACE, 209 ANYOF_DIGIT, -- Digit class [0-9] 210 ANYOF_NDIGIT, 211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] 212 ANYOF_NALNUMC, 213 ANYOF_ALPHA, -- Alpha class [a-zA-Z] 214 ANYOF_NALPHA, 215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127 216 ANYOF_NASCII, 217 ANYOF_CNTRL, -- Control class 218 ANYOF_NCNTRL, 219 ANYOF_GRAPH, -- Graphic class 220 ANYOF_NGRAPH, 221 ANYOF_LOWER, -- Lower case class [a-z] 222 ANYOF_NLOWER, 223 ANYOF_PRINT, -- printable class 224 ANYOF_NPRINT, 225 ANYOF_PUNCT, -- 226 ANYOF_NPUNCT, 227 ANYOF_UPPER, -- Upper case class [A-Z] 228 ANYOF_NUPPER, 229 ANYOF_XDIGIT, -- Hexadecimal digit 230 ANYOF_NXDIGIT 231 ); 232 233 procedure Set_In_Class 234 (Bitmap : in out Character_Class; 235 C : Character); 236 -- Set the entry to True for C in the class Bitmap 237 238 function Get_From_Class 239 (Bitmap : Character_Class; 240 C : Character) return Boolean; 241 -- Return True if the entry is set for C in the class Bitmap 242 243 procedure Reset_Class (Bitmap : out Character_Class); 244 -- Clear all the entries in the class Bitmap 245 246 pragma Inline (Set_In_Class); 247 pragma Inline (Get_From_Class); 248 pragma Inline (Reset_Class); 249 250 ----------------------- 251 -- Local Subprograms -- 252 ----------------------- 253 254 function "=" (Left : Character; Right : Opcode) return Boolean; 255 256 function Is_Alnum (C : Character) return Boolean; 257 -- Return True if C is an alphanum character or an underscore ('_') 258 259 function Is_White_Space (C : Character) return Boolean; 260 -- Return True if C is a whitespace character 261 262 function Is_Printable (C : Character) return Boolean; 263 -- Return True if C is a printable character 264 265 function Operand (P : Pointer) return Pointer; 266 -- Return a pointer to the first operand of the node at P 267 268 function String_Length 269 (Program : Program_Data; 270 P : Pointer) return Program_Size; 271 -- Return the length of the string argument of the node at P 272 273 function String_Operand (P : Pointer) return Pointer; 274 -- Return a pointer to the string argument of the node at P 275 276 procedure Bitmap_Operand 277 (Program : Program_Data; 278 P : Pointer; 279 Op : out Character_Class); 280 -- Return a pointer to the string argument of the node at P 281 282 function Get_Next 283 (Program : Program_Data; 284 IP : Pointer) return Pointer; 285 -- Dig the next instruction pointer out of a node 286 287 procedure Optimize (Self : in out Pattern_Matcher); 288 -- Optimize a Pattern_Matcher by noting certain special cases 289 290 function Read_Natural 291 (Program : Program_Data; 292 IP : Pointer) return Natural; 293 -- Return the 2-byte natural coded at position IP 294 295 -- All of the subprograms above are tiny and should be inlined 296 297 pragma Inline ("="); 298 pragma Inline (Is_Alnum); 299 pragma Inline (Is_White_Space); 300 pragma Inline (Get_Next); 301 pragma Inline (Operand); 302 pragma Inline (Read_Natural); 303 pragma Inline (String_Length); 304 pragma Inline (String_Operand); 305 306 type Expression_Flags is record 307 Has_Width, -- Known never to match null string 308 Simple, -- Simple enough to be STAR/PLUS operand 309 SP_Start : Boolean; -- Starts with * or + 310 end record; 311 312 Worst_Expression : constant Expression_Flags := (others => False); 313 -- Worst case 314 315 procedure Dump_Until 316 (Program : Program_Data; 317 Index : in out Pointer; 318 Till : Pointer; 319 Indent : Natural; 320 Do_Print : Boolean := True); 321 -- Dump the program until the node Till (not included) is met. Every line 322 -- is indented with Index spaces at the beginning Dumps till the end if 323 -- Till is 0. 324 325 procedure Dump_Operation 326 (Program : Program_Data; 327 Index : Pointer; 328 Indent : Natural); 329 -- Same as above, but only dumps a single operation, and compute its 330 -- indentation from the program. 331 332 --------- 333 -- "=" -- 334 --------- 335 336 function "=" (Left : Character; Right : Opcode) return Boolean is 337 begin 338 return Character'Pos (Left) = Opcode'Pos (Right); 339 end "="; 340 341 -------------------- 342 -- Bitmap_Operand -- 343 -------------------- 344 345 procedure Bitmap_Operand 346 (Program : Program_Data; 347 P : Pointer; 348 Op : out Character_Class) 349 is 350 function Convert is new Ada.Unchecked_Conversion 351 (Program_Data, Character_Class); 352 353 begin 354 Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); 355 end Bitmap_Operand; 356 357 ------------- 358 -- Compile -- 359 ------------- 360 361 procedure Compile 362 (Matcher : out Pattern_Matcher; 363 Expression : String; 364 Final_Code_Size : out Program_Size; 365 Flags : Regexp_Flags := No_Flags) 366 is 367 -- We can't allocate space until we know how big the compiled form 368 -- will be, but we can't compile it (and thus know how big it is) 369 -- until we've got a place to put the code. So we cheat: we compile 370 -- it twice, once with code generation turned off and size counting 371 -- turned on, and once "for real". 372 373 -- This also means that we don't allocate space until we are sure 374 -- that the thing really will compile successfully, and we never 375 -- have to move the code and thus invalidate pointers into it. 376 377 -- Beware that the optimization-preparation code in here knows 378 -- about some of the structure of the compiled regexp. 379 380 PM : Pattern_Matcher renames Matcher; 381 Program : Program_Data renames PM.Program; 382 383 Emit_Ptr : Pointer := Program_First; 384 385 Parse_Pos : Natural := Expression'First; -- Input-scan pointer 386 Parse_End : constant Natural := Expression'Last; 387 388 ---------------------------- 389 -- Subprograms for Create -- 390 ---------------------------- 391 392 procedure Emit (B : Character); 393 -- Output the Character B to the Program. If code-generation is 394 -- disabled, simply increments the program counter. 395 396 function Emit_Node (Op : Opcode) return Pointer; 397 -- If code-generation is enabled, Emit_Node outputs the 398 -- opcode Op and reserves space for a pointer to the next node. 399 -- Return value is the location of new opcode, i.e. old Emit_Ptr. 400 401 procedure Emit_Natural (IP : Pointer; N : Natural); 402 -- Split N on two characters at position IP 403 404 procedure Emit_Class (Bitmap : Character_Class); 405 -- Emits a character class 406 407 procedure Case_Emit (C : Character); 408 -- Emit C, after converting is to lower-case if the regular 409 -- expression is case insensitive. 410 411 procedure Parse 412 (Parenthesized : Boolean; 413 Capturing : Boolean; 414 Flags : out Expression_Flags; 415 IP : out Pointer); 416 -- Parse regular expression, i.e. main body or parenthesized thing. 417 -- Caller must absorb opening parenthesis. Capturing should be set to 418 -- True when we have an open parenthesis from which we want the user 419 -- to extra text. 420 421 procedure Parse_Branch 422 (Flags : out Expression_Flags; 423 First : Boolean; 424 IP : out Pointer); 425 -- Implements the concatenation operator and handles '|'. 426 -- First should be true if this is the first item of the alternative. 427 428 procedure Parse_Piece 429 (Expr_Flags : out Expression_Flags; 430 IP : out Pointer); 431 -- Parse something followed by possible [*+?] 432 433 procedure Parse_Atom 434 (Expr_Flags : out Expression_Flags; 435 IP : out Pointer); 436 -- Parse_Atom is the lowest level parse procedure. 437 -- 438 -- Optimization: Gobbles an entire sequence of ordinary characters so 439 -- that it can turn them into a single node, which is smaller to store 440 -- and faster to run. Backslashed characters are exceptions, each 441 -- becoming a separate node; the code is simpler that way and it's 442 -- not worth fixing. 443 444 procedure Insert_Operator 445 (Op : Opcode; 446 Operand : Pointer; 447 Greedy : Boolean := True); 448 -- Insert_Operator inserts an operator in front of an already-emitted 449 -- operand and relocates the operand. This applies to PLUS and STAR. 450 -- If Minmod is True, then the operator is non-greedy. 451 452 function Insert_Operator_Before 453 (Op : Opcode; 454 Operand : Pointer; 455 Greedy : Boolean; 456 Opsize : Pointer) return Pointer; 457 -- Insert an operator before Operand (and move the latter forward in the 458 -- program). Opsize is the size needed to represent the operator. This 459 -- returns the position at which the operator was inserted, and moves 460 -- Emit_Ptr after the new position of the operand. 461 462 procedure Insert_Curly_Operator 463 (Op : Opcode; 464 Min : Natural; 465 Max : Natural; 466 Operand : Pointer; 467 Greedy : Boolean := True); 468 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). 469 -- If Minmod is True, then the operator is non-greedy. 470 471 procedure Link_Tail (P, Val : Pointer); 472 -- Link_Tail sets the next-pointer at the end of a node chain 473 474 procedure Link_Operand_Tail (P, Val : Pointer); 475 -- Link_Tail on operand of first argument; noop if operand-less 476 477 procedure Fail (M : String); 478 pragma No_Return (Fail); 479 -- Fail with a diagnostic message, if possible 480 481 function Is_Curly_Operator (IP : Natural) return Boolean; 482 -- Return True if IP is looking at a '{' that is the beginning 483 -- of a curly operator, i.e. it matches {\d+,?\d*} 484 485 function Is_Mult (IP : Natural) return Boolean; 486 -- Return True if C is a regexp multiplier: '+', '*' or '?' 487 488 procedure Get_Curly_Arguments 489 (IP : Natural; 490 Min : out Natural; 491 Max : out Natural; 492 Greedy : out Boolean); 493 -- Parse the argument list for a curly operator. 494 -- It is assumed that IP is indeed pointing at a valid operator. 495 -- So what is IP and how come IP is not referenced in the body ??? 496 497 procedure Parse_Character_Class (IP : out Pointer); 498 -- Parse a character class. 499 -- The calling subprogram should consume the opening '[' before. 500 501 procedure Parse_Literal 502 (Expr_Flags : out Expression_Flags; 503 IP : out Pointer); 504 -- Parse_Literal encodes a string of characters to be matched exactly 505 506 function Parse_Posix_Character_Class return Std_Class; 507 -- Parse a posix character class, like [:alpha:] or [:^alpha:]. 508 -- The caller is supposed to absorb the opening [. 509 510 pragma Inline (Is_Mult); 511 pragma Inline (Emit_Natural); 512 pragma Inline (Parse_Character_Class); -- since used only once 513 514 --------------- 515 -- Case_Emit -- 516 --------------- 517 518 procedure Case_Emit (C : Character) is 519 begin 520 if (Flags and Case_Insensitive) /= 0 then 521 Emit (To_Lower (C)); 522 523 else 524 -- Dump current character 525 526 Emit (C); 527 end if; 528 end Case_Emit; 529 530 ---------- 531 -- Emit -- 532 ---------- 533 534 procedure Emit (B : Character) is 535 begin 536 if Emit_Ptr <= PM.Size then 537 Program (Emit_Ptr) := B; 538 end if; 539 540 Emit_Ptr := Emit_Ptr + 1; 541 end Emit; 542 543 ---------------- 544 -- Emit_Class -- 545 ---------------- 546 547 procedure Emit_Class (Bitmap : Character_Class) is 548 subtype Program31 is Program_Data (0 .. 31); 549 550 function Convert is new Ada.Unchecked_Conversion 551 (Character_Class, Program31); 552 553 begin 554 -- What is the mysterious constant 31 here??? Can't it be expressed 555 -- symbolically (size of integer - 1 or some such???). In any case 556 -- it should be declared as a constant (and referenced presumably 557 -- as this constant + 1 below. 558 559 if Emit_Ptr + 31 <= PM.Size then 560 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); 561 end if; 562 563 Emit_Ptr := Emit_Ptr + 32; 564 end Emit_Class; 565 566 ------------------ 567 -- Emit_Natural -- 568 ------------------ 569 570 procedure Emit_Natural (IP : Pointer; N : Natural) is 571 begin 572 if IP + 1 <= PM.Size then 573 Program (IP + 1) := Character'Val (N / 256); 574 Program (IP) := Character'Val (N mod 256); 575 end if; 576 end Emit_Natural; 577 578 --------------- 579 -- Emit_Node -- 580 --------------- 581 582 function Emit_Node (Op : Opcode) return Pointer is 583 Result : constant Pointer := Emit_Ptr; 584 585 begin 586 if Emit_Ptr + 2 <= PM.Size then 587 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); 588 Program (Emit_Ptr + 1) := ASCII.NUL; 589 Program (Emit_Ptr + 2) := ASCII.NUL; 590 end if; 591 592 Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; 593 return Result; 594 end Emit_Node; 595 596 ---------- 597 -- Fail -- 598 ---------- 599 600 procedure Fail (M : String) is 601 begin 602 raise Expression_Error with M; 603 end Fail; 604 605 ------------------------- 606 -- Get_Curly_Arguments -- 607 ------------------------- 608 609 procedure Get_Curly_Arguments 610 (IP : Natural; 611 Min : out Natural; 612 Max : out Natural; 613 Greedy : out Boolean) 614 is 615 pragma Unreferenced (IP); 616 617 Save_Pos : Natural := Parse_Pos + 1; 618 619 begin 620 Min := 0; 621 Max := Max_Curly_Repeat; 622 623 while Expression (Parse_Pos) /= '}' 624 and then Expression (Parse_Pos) /= ',' 625 loop 626 Parse_Pos := Parse_Pos + 1; 627 end loop; 628 629 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 630 631 if Expression (Parse_Pos) = ',' then 632 Save_Pos := Parse_Pos + 1; 633 while Expression (Parse_Pos) /= '}' loop 634 Parse_Pos := Parse_Pos + 1; 635 end loop; 636 637 if Save_Pos /= Parse_Pos then 638 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 639 end if; 640 641 else 642 Max := Min; 643 end if; 644 645 if Parse_Pos < Expression'Last 646 and then Expression (Parse_Pos + 1) = '?' 647 then 648 Greedy := False; 649 Parse_Pos := Parse_Pos + 1; 650 651 else 652 Greedy := True; 653 end if; 654 end Get_Curly_Arguments; 655 656 --------------------------- 657 -- Insert_Curly_Operator -- 658 --------------------------- 659 660 procedure Insert_Curly_Operator 661 (Op : Opcode; 662 Min : Natural; 663 Max : Natural; 664 Operand : Pointer; 665 Greedy : Boolean := True) 666 is 667 Old : Pointer; 668 begin 669 Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); 670 Emit_Natural (Old + Next_Pointer_Bytes, Min); 671 Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); 672 end Insert_Curly_Operator; 673 674 ---------------------------- 675 -- Insert_Operator_Before -- 676 ---------------------------- 677 678 function Insert_Operator_Before 679 (Op : Opcode; 680 Operand : Pointer; 681 Greedy : Boolean; 682 Opsize : Pointer) return Pointer 683 is 684 Dest : constant Pointer := Emit_Ptr; 685 Old : Pointer; 686 Size : Pointer := Opsize; 687 688 begin 689 -- If not greedy, we have to emit another opcode first 690 691 if not Greedy then 692 Size := Size + Next_Pointer_Bytes; 693 end if; 694 695 -- Move the operand in the byte-compilation, so that we can insert 696 -- the operator before it. 697 698 if Emit_Ptr + Size <= PM.Size then 699 Program (Operand + Size .. Emit_Ptr + Size) := 700 Program (Operand .. Emit_Ptr); 701 end if; 702 703 -- Insert the operator at the position previously occupied by the 704 -- operand. 705 706 Emit_Ptr := Operand; 707 708 if not Greedy then 709 Old := Emit_Node (MINMOD); 710 Link_Tail (Old, Old + Next_Pointer_Bytes); 711 end if; 712 713 Old := Emit_Node (Op); 714 Emit_Ptr := Dest + Size; 715 return Old; 716 end Insert_Operator_Before; 717 718 --------------------- 719 -- Insert_Operator -- 720 --------------------- 721 722 procedure Insert_Operator 723 (Op : Opcode; 724 Operand : Pointer; 725 Greedy : Boolean := True) 726 is 727 Discard : Pointer; 728 pragma Warnings (Off, Discard); 729 begin 730 Discard := Insert_Operator_Before 731 (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); 732 end Insert_Operator; 733 734 ----------------------- 735 -- Is_Curly_Operator -- 736 ----------------------- 737 738 function Is_Curly_Operator (IP : Natural) return Boolean is 739 Scan : Natural := IP; 740 741 begin 742 if Expression (Scan) /= '{' 743 or else Scan + 2 > Expression'Last 744 or else not Is_Digit (Expression (Scan + 1)) 745 then 746 return False; 747 end if; 748 749 Scan := Scan + 1; 750 751 -- The first digit 752 753 loop 754 Scan := Scan + 1; 755 756 if Scan > Expression'Last then 757 return False; 758 end if; 759 760 exit when not Is_Digit (Expression (Scan)); 761 end loop; 762 763 if Expression (Scan) = ',' then 764 loop 765 Scan := Scan + 1; 766 767 if Scan > Expression'Last then 768 return False; 769 end if; 770 771 exit when not Is_Digit (Expression (Scan)); 772 end loop; 773 end if; 774 775 return Expression (Scan) = '}'; 776 end Is_Curly_Operator; 777 778 ------------- 779 -- Is_Mult -- 780 ------------- 781 782 function Is_Mult (IP : Natural) return Boolean is 783 C : constant Character := Expression (IP); 784 785 begin 786 return C = '*' 787 or else C = '+' 788 or else C = '?' 789 or else (C = '{' and then Is_Curly_Operator (IP)); 790 end Is_Mult; 791 792 ----------------------- 793 -- Link_Operand_Tail -- 794 ----------------------- 795 796 procedure Link_Operand_Tail (P, Val : Pointer) is 797 begin 798 if P <= PM.Size and then Program (P) = BRANCH then 799 Link_Tail (Operand (P), Val); 800 end if; 801 end Link_Operand_Tail; 802 803 --------------- 804 -- Link_Tail -- 805 --------------- 806 807 procedure Link_Tail (P, Val : Pointer) is 808 Scan : Pointer; 809 Temp : Pointer; 810 Offset : Pointer; 811 812 begin 813 -- Find last node (the size of the pattern matcher might be too 814 -- small, so don't try to read past its end). 815 816 Scan := P; 817 while Scan + Next_Pointer_Bytes <= PM.Size loop 818 Temp := Get_Next (Program, Scan); 819 exit when Temp = Scan; 820 Scan := Temp; 821 end loop; 822 823 Offset := Val - Scan; 824 825 Emit_Natural (Scan + 1, Natural (Offset)); 826 end Link_Tail; 827 828 ----------- 829 -- Parse -- 830 ----------- 831 832 -- Combining parenthesis handling with the base level of regular 833 -- expression is a trifle forced, but the need to tie the tails of the 834 -- the branches to what follows makes it hard to avoid. 835 836 procedure Parse 837 (Parenthesized : Boolean; 838 Capturing : Boolean; 839 Flags : out Expression_Flags; 840 IP : out Pointer) 841 is 842 E : String renames Expression; 843 Br, Br2 : Pointer; 844 Ender : Pointer; 845 Par_No : Natural; 846 New_Flags : Expression_Flags; 847 Have_Branch : Boolean := False; 848 849 begin 850 Flags := (Has_Width => True, others => False); -- Tentatively 851 852 -- Make an OPEN node, if parenthesized 853 854 if Parenthesized and then Capturing then 855 if Matcher.Paren_Count > Max_Paren_Count then 856 Fail ("too many ()"); 857 end if; 858 859 Par_No := Matcher.Paren_Count + 1; 860 Matcher.Paren_Count := Matcher.Paren_Count + 1; 861 IP := Emit_Node (OPEN); 862 Emit (Character'Val (Par_No)); 863 else 864 IP := 0; 865 Par_No := 0; 866 end if; 867 868 -- Pick up the branches, linking them together 869 870 Parse_Branch (New_Flags, True, Br); 871 872 if Br = 0 then 873 IP := 0; 874 return; 875 end if; 876 877 if Parse_Pos <= Parse_End 878 and then E (Parse_Pos) = '|' 879 then 880 Insert_Operator (BRANCH, Br); 881 Have_Branch := True; 882 end if; 883 884 if IP /= 0 then 885 Link_Tail (IP, Br); -- OPEN -> first 886 else 887 IP := Br; 888 end if; 889 890 if not New_Flags.Has_Width then 891 Flags.Has_Width := False; 892 end if; 893 894 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 895 896 while Parse_Pos <= Parse_End 897 and then (E (Parse_Pos) = '|') 898 loop 899 Parse_Pos := Parse_Pos + 1; 900 Parse_Branch (New_Flags, False, Br); 901 902 if Br = 0 then 903 IP := 0; 904 return; 905 end if; 906 907 Link_Tail (IP, Br); -- BRANCH -> BRANCH 908 909 if not New_Flags.Has_Width then 910 Flags.Has_Width := False; 911 end if; 912 913 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 914 end loop; 915 916 -- Make a closing node, and hook it on the end 917 918 if Parenthesized then 919 if Capturing then 920 Ender := Emit_Node (CLOSE); 921 Emit (Character'Val (Par_No)); 922 Link_Tail (IP, Ender); 923 924 else 925 -- Need to keep looking after the closing parenthesis 926 Ender := Emit_Ptr; 927 end if; 928 929 else 930 Ender := Emit_Node (EOP); 931 Link_Tail (IP, Ender); 932 end if; 933 934 if Have_Branch and then Emit_Ptr <= PM.Size + 1 then 935 936 -- Hook the tails of the branches to the closing node 937 938 Br := IP; 939 loop 940 Link_Operand_Tail (Br, Ender); 941 Br2 := Get_Next (Program, Br); 942 exit when Br2 = Br; 943 Br := Br2; 944 end loop; 945 end if; 946 947 -- Check for proper termination 948 949 if Parenthesized then 950 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then 951 Fail ("unmatched ()"); 952 end if; 953 954 Parse_Pos := Parse_Pos + 1; 955 956 elsif Parse_Pos <= Parse_End then 957 if E (Parse_Pos) = ')' then 958 Fail ("unmatched ')'"); 959 else 960 Fail ("junk on end"); -- "Can't happen" 961 end if; 962 end if; 963 end Parse; 964 965 ---------------- 966 -- Parse_Atom -- 967 ---------------- 968 969 procedure Parse_Atom 970 (Expr_Flags : out Expression_Flags; 971 IP : out Pointer) 972 is 973 C : Character; 974 975 begin 976 -- Tentatively set worst expression case 977 978 Expr_Flags := Worst_Expression; 979 980 C := Expression (Parse_Pos); 981 Parse_Pos := Parse_Pos + 1; 982 983 case (C) is 984 when '^' => 985 IP := 986 Emit_Node 987 (if (Flags and Multiple_Lines) /= 0 then MBOL 988 elsif (Flags and Single_Line) /= 0 then SBOL 989 else BOL); 990 991 when '$' => 992 IP := 993 Emit_Node 994 (if (Flags and Multiple_Lines) /= 0 then MEOL 995 elsif (Flags and Single_Line) /= 0 then SEOL 996 else EOL); 997 998 when '.' => 999 IP := 1000 Emit_Node 1001 (if (Flags and Single_Line) /= 0 then SANY else ANY); 1002 1003 Expr_Flags.Has_Width := True; 1004 Expr_Flags.Simple := True; 1005 1006 when '[' => 1007 Parse_Character_Class (IP); 1008 Expr_Flags.Has_Width := True; 1009 Expr_Flags.Simple := True; 1010 1011 when '(' => 1012 declare 1013 New_Flags : Expression_Flags; 1014 1015 begin 1016 if Parse_Pos <= Parse_End - 1 1017 and then Expression (Parse_Pos) = '?' 1018 and then Expression (Parse_Pos + 1) = ':' 1019 then 1020 Parse_Pos := Parse_Pos + 2; 1021 1022 -- Non-capturing parenthesis 1023 1024 Parse (True, False, New_Flags, IP); 1025 1026 else 1027 -- Capturing parenthesis 1028 1029 Parse (True, True, New_Flags, IP); 1030 Expr_Flags.Has_Width := 1031 Expr_Flags.Has_Width or else New_Flags.Has_Width; 1032 Expr_Flags.SP_Start := 1033 Expr_Flags.SP_Start or else New_Flags.SP_Start; 1034 if IP = 0 then 1035 return; 1036 end if; 1037 end if; 1038 end; 1039 1040 when '|' | ASCII.LF | ')' => 1041 Fail ("internal urp"); -- Supposed to be caught earlier 1042 1043 when '?' | '+' | '*' => 1044 Fail (C & " follows nothing"); 1045 1046 when '{' => 1047 if Is_Curly_Operator (Parse_Pos - 1) then 1048 Fail (C & " follows nothing"); 1049 else 1050 Parse_Literal (Expr_Flags, IP); 1051 end if; 1052 1053 when '\' => 1054 if Parse_Pos > Parse_End then 1055 Fail ("trailing \"); 1056 end if; 1057 1058 Parse_Pos := Parse_Pos + 1; 1059 1060 case Expression (Parse_Pos - 1) is 1061 when 'b' => 1062 IP := Emit_Node (BOUND); 1063 1064 when 'B' => 1065 IP := Emit_Node (NBOUND); 1066 1067 when 's' => 1068 IP := Emit_Node (SPACE); 1069 Expr_Flags.Simple := True; 1070 Expr_Flags.Has_Width := True; 1071 1072 when 'S' => 1073 IP := Emit_Node (NSPACE); 1074 Expr_Flags.Simple := True; 1075 Expr_Flags.Has_Width := True; 1076 1077 when 'd' => 1078 IP := Emit_Node (DIGIT); 1079 Expr_Flags.Simple := True; 1080 Expr_Flags.Has_Width := True; 1081 1082 when 'D' => 1083 IP := Emit_Node (NDIGIT); 1084 Expr_Flags.Simple := True; 1085 Expr_Flags.Has_Width := True; 1086 1087 when 'w' => 1088 IP := Emit_Node (ALNUM); 1089 Expr_Flags.Simple := True; 1090 Expr_Flags.Has_Width := True; 1091 1092 when 'W' => 1093 IP := Emit_Node (NALNUM); 1094 Expr_Flags.Simple := True; 1095 Expr_Flags.Has_Width := True; 1096 1097 when 'A' => 1098 IP := Emit_Node (SBOL); 1099 1100 when 'G' => 1101 IP := Emit_Node (SEOL); 1102 1103 when '0' .. '9' => 1104 IP := Emit_Node (REFF); 1105 1106 declare 1107 Save : constant Natural := Parse_Pos - 1; 1108 1109 begin 1110 while Parse_Pos <= Expression'Last 1111 and then Is_Digit (Expression (Parse_Pos)) 1112 loop 1113 Parse_Pos := Parse_Pos + 1; 1114 end loop; 1115 1116 Emit (Character'Val (Natural'Value 1117 (Expression (Save .. Parse_Pos - 1)))); 1118 end; 1119 1120 when others => 1121 Parse_Pos := Parse_Pos - 1; 1122 Parse_Literal (Expr_Flags, IP); 1123 end case; 1124 1125 when others => 1126 Parse_Literal (Expr_Flags, IP); 1127 end case; 1128 end Parse_Atom; 1129 1130 ------------------ 1131 -- Parse_Branch -- 1132 ------------------ 1133 1134 procedure Parse_Branch 1135 (Flags : out Expression_Flags; 1136 First : Boolean; 1137 IP : out Pointer) 1138 is 1139 E : String renames Expression; 1140 Chain : Pointer; 1141 Last : Pointer; 1142 New_Flags : Expression_Flags; 1143 1144 Discard : Pointer; 1145 pragma Warnings (Off, Discard); 1146 1147 begin 1148 Flags := Worst_Expression; -- Tentatively 1149 IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); 1150 1151 Chain := 0; 1152 while Parse_Pos <= Parse_End 1153 and then E (Parse_Pos) /= ')' 1154 and then E (Parse_Pos) /= ASCII.LF 1155 and then E (Parse_Pos) /= '|' 1156 loop 1157 Parse_Piece (New_Flags, Last); 1158 1159 if Last = 0 then 1160 IP := 0; 1161 return; 1162 end if; 1163 1164 Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; 1165 1166 if Chain = 0 then -- First piece 1167 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 1168 else 1169 Link_Tail (Chain, Last); 1170 end if; 1171 1172 Chain := Last; 1173 end loop; 1174 1175 -- Case where loop ran zero CURLY 1176 1177 if Chain = 0 then 1178 Discard := Emit_Node (NOTHING); 1179 end if; 1180 end Parse_Branch; 1181 1182 --------------------------- 1183 -- Parse_Character_Class -- 1184 --------------------------- 1185 1186 procedure Parse_Character_Class (IP : out Pointer) is 1187 Bitmap : Character_Class; 1188 Invert : Boolean := False; 1189 In_Range : Boolean := False; 1190 Named_Class : Std_Class := ANYOF_NONE; 1191 Value : Character; 1192 Last_Value : Character := ASCII.NUL; 1193 1194 begin 1195 Reset_Class (Bitmap); 1196 1197 -- Do we have an invert character class ? 1198 1199 if Parse_Pos <= Parse_End 1200 and then Expression (Parse_Pos) = '^' 1201 then 1202 Invert := True; 1203 Parse_Pos := Parse_Pos + 1; 1204 end if; 1205 1206 -- First character can be ] or - without closing the class 1207 1208 if Parse_Pos <= Parse_End 1209 and then (Expression (Parse_Pos) = ']' 1210 or else Expression (Parse_Pos) = '-') 1211 then 1212 Set_In_Class (Bitmap, Expression (Parse_Pos)); 1213 Parse_Pos := Parse_Pos + 1; 1214 end if; 1215 1216 -- While we don't have the end of the class 1217 1218 while Parse_Pos <= Parse_End 1219 and then Expression (Parse_Pos) /= ']' 1220 loop 1221 Named_Class := ANYOF_NONE; 1222 Value := Expression (Parse_Pos); 1223 Parse_Pos := Parse_Pos + 1; 1224 1225 -- Do we have a Posix character class 1226 if Value = '[' then 1227 Named_Class := Parse_Posix_Character_Class; 1228 1229 elsif Value = '\' then 1230 if Parse_Pos = Parse_End then 1231 Fail ("Trailing \"); 1232 end if; 1233 Value := Expression (Parse_Pos); 1234 Parse_Pos := Parse_Pos + 1; 1235 1236 case Value is 1237 when 'w' => Named_Class := ANYOF_ALNUM; 1238 when 'W' => Named_Class := ANYOF_NALNUM; 1239 when 's' => Named_Class := ANYOF_SPACE; 1240 when 'S' => Named_Class := ANYOF_NSPACE; 1241 when 'd' => Named_Class := ANYOF_DIGIT; 1242 when 'D' => Named_Class := ANYOF_NDIGIT; 1243 when 'n' => Value := ASCII.LF; 1244 when 'r' => Value := ASCII.CR; 1245 when 't' => Value := ASCII.HT; 1246 when 'f' => Value := ASCII.FF; 1247 when 'e' => Value := ASCII.ESC; 1248 when 'a' => Value := ASCII.BEL; 1249 1250 -- when 'x' => ??? hexadecimal value 1251 -- when 'c' => ??? control character 1252 -- when '0'..'9' => ??? octal character 1253 1254 when others => null; 1255 end case; 1256 end if; 1257 1258 -- Do we have a character class? 1259 1260 if Named_Class /= ANYOF_NONE then 1261 1262 -- A range like 'a-\d' or 'a-[:digit:] is not a range 1263 1264 if In_Range then 1265 Set_In_Class (Bitmap, Last_Value); 1266 Set_In_Class (Bitmap, '-'); 1267 In_Range := False; 1268 end if; 1269 1270 -- Expand the range 1271 1272 case Named_Class is 1273 when ANYOF_NONE => null; 1274 1275 when ANYOF_ALNUM | ANYOF_ALNUMC => 1276 for Value in Class_Byte'Range loop 1277 if Is_Alnum (Character'Val (Value)) then 1278 Set_In_Class (Bitmap, Character'Val (Value)); 1279 end if; 1280 end loop; 1281 1282 when ANYOF_NALNUM | ANYOF_NALNUMC => 1283 for Value in Class_Byte'Range loop 1284 if not Is_Alnum (Character'Val (Value)) then 1285 Set_In_Class (Bitmap, Character'Val (Value)); 1286 end if; 1287 end loop; 1288 1289 when ANYOF_SPACE => 1290 for Value in Class_Byte'Range loop 1291 if Is_White_Space (Character'Val (Value)) then 1292 Set_In_Class (Bitmap, Character'Val (Value)); 1293 end if; 1294 end loop; 1295 1296 when ANYOF_NSPACE => 1297 for Value in Class_Byte'Range loop 1298 if not Is_White_Space (Character'Val (Value)) then 1299 Set_In_Class (Bitmap, Character'Val (Value)); 1300 end if; 1301 end loop; 1302 1303 when ANYOF_DIGIT => 1304 for Value in Class_Byte'Range loop 1305 if Is_Digit (Character'Val (Value)) then 1306 Set_In_Class (Bitmap, Character'Val (Value)); 1307 end if; 1308 end loop; 1309 1310 when ANYOF_NDIGIT => 1311 for Value in Class_Byte'Range loop 1312 if not Is_Digit (Character'Val (Value)) then 1313 Set_In_Class (Bitmap, Character'Val (Value)); 1314 end if; 1315 end loop; 1316 1317 when ANYOF_ALPHA => 1318 for Value in Class_Byte'Range loop 1319 if Is_Letter (Character'Val (Value)) then 1320 Set_In_Class (Bitmap, Character'Val (Value)); 1321 end if; 1322 end loop; 1323 1324 when ANYOF_NALPHA => 1325 for Value in Class_Byte'Range loop 1326 if not Is_Letter (Character'Val (Value)) then 1327 Set_In_Class (Bitmap, Character'Val (Value)); 1328 end if; 1329 end loop; 1330 1331 when ANYOF_ASCII => 1332 for Value in 0 .. 127 loop 1333 Set_In_Class (Bitmap, Character'Val (Value)); 1334 end loop; 1335 1336 when ANYOF_NASCII => 1337 for Value in 128 .. 255 loop 1338 Set_In_Class (Bitmap, Character'Val (Value)); 1339 end loop; 1340 1341 when ANYOF_CNTRL => 1342 for Value in Class_Byte'Range loop 1343 if Is_Control (Character'Val (Value)) then 1344 Set_In_Class (Bitmap, Character'Val (Value)); 1345 end if; 1346 end loop; 1347 1348 when ANYOF_NCNTRL => 1349 for Value in Class_Byte'Range loop 1350 if not Is_Control (Character'Val (Value)) then 1351 Set_In_Class (Bitmap, Character'Val (Value)); 1352 end if; 1353 end loop; 1354 1355 when ANYOF_GRAPH => 1356 for Value in Class_Byte'Range loop 1357 if Is_Graphic (Character'Val (Value)) then 1358 Set_In_Class (Bitmap, Character'Val (Value)); 1359 end if; 1360 end loop; 1361 1362 when ANYOF_NGRAPH => 1363 for Value in Class_Byte'Range loop 1364 if not Is_Graphic (Character'Val (Value)) then 1365 Set_In_Class (Bitmap, Character'Val (Value)); 1366 end if; 1367 end loop; 1368 1369 when ANYOF_LOWER => 1370 for Value in Class_Byte'Range loop 1371 if Is_Lower (Character'Val (Value)) then 1372 Set_In_Class (Bitmap, Character'Val (Value)); 1373 end if; 1374 end loop; 1375 1376 when ANYOF_NLOWER => 1377 for Value in Class_Byte'Range loop 1378 if not Is_Lower (Character'Val (Value)) then 1379 Set_In_Class (Bitmap, Character'Val (Value)); 1380 end if; 1381 end loop; 1382 1383 when ANYOF_PRINT => 1384 for Value in Class_Byte'Range loop 1385 if Is_Printable (Character'Val (Value)) then 1386 Set_In_Class (Bitmap, Character'Val (Value)); 1387 end if; 1388 end loop; 1389 1390 when ANYOF_NPRINT => 1391 for Value in Class_Byte'Range loop 1392 if not Is_Printable (Character'Val (Value)) then 1393 Set_In_Class (Bitmap, Character'Val (Value)); 1394 end if; 1395 end loop; 1396 1397 when ANYOF_PUNCT => 1398 for Value in Class_Byte'Range loop 1399 if Is_Printable (Character'Val (Value)) 1400 and then not Is_White_Space (Character'Val (Value)) 1401 and then not Is_Alnum (Character'Val (Value)) 1402 then 1403 Set_In_Class (Bitmap, Character'Val (Value)); 1404 end if; 1405 end loop; 1406 1407 when ANYOF_NPUNCT => 1408 for Value in Class_Byte'Range loop 1409 if not Is_Printable (Character'Val (Value)) 1410 or else Is_White_Space (Character'Val (Value)) 1411 or else Is_Alnum (Character'Val (Value)) 1412 then 1413 Set_In_Class (Bitmap, Character'Val (Value)); 1414 end if; 1415 end loop; 1416 1417 when ANYOF_UPPER => 1418 for Value in Class_Byte'Range loop 1419 if Is_Upper (Character'Val (Value)) then 1420 Set_In_Class (Bitmap, Character'Val (Value)); 1421 end if; 1422 end loop; 1423 1424 when ANYOF_NUPPER => 1425 for Value in Class_Byte'Range loop 1426 if not Is_Upper (Character'Val (Value)) then 1427 Set_In_Class (Bitmap, Character'Val (Value)); 1428 end if; 1429 end loop; 1430 1431 when ANYOF_XDIGIT => 1432 for Value in Class_Byte'Range loop 1433 if Is_Hexadecimal_Digit (Character'Val (Value)) then 1434 Set_In_Class (Bitmap, Character'Val (Value)); 1435 end if; 1436 end loop; 1437 1438 when ANYOF_NXDIGIT => 1439 for Value in Class_Byte'Range loop 1440 if not Is_Hexadecimal_Digit 1441 (Character'Val (Value)) 1442 then 1443 Set_In_Class (Bitmap, Character'Val (Value)); 1444 end if; 1445 end loop; 1446 1447 end case; 1448 1449 -- Not a character range 1450 1451 elsif not In_Range then 1452 Last_Value := Value; 1453 1454 if Parse_Pos > Expression'Last then 1455 Fail ("Empty character class []"); 1456 end if; 1457 1458 if Expression (Parse_Pos) = '-' 1459 and then Parse_Pos < Parse_End 1460 and then Expression (Parse_Pos + 1) /= ']' 1461 then 1462 Parse_Pos := Parse_Pos + 1; 1463 1464 -- Do we have a range like '\d-a' and '[:space:]-a' 1465 -- which is not a real range 1466 1467 if Named_Class /= ANYOF_NONE then 1468 Set_In_Class (Bitmap, '-'); 1469 else 1470 In_Range := True; 1471 end if; 1472 1473 else 1474 Set_In_Class (Bitmap, Value); 1475 1476 end if; 1477 1478 -- Else in a character range 1479 1480 else 1481 if Last_Value > Value then 1482 Fail ("Invalid Range [" & Last_Value'Img 1483 & "-" & Value'Img & "]"); 1484 end if; 1485 1486 while Last_Value <= Value loop 1487 Set_In_Class (Bitmap, Last_Value); 1488 Last_Value := Character'Succ (Last_Value); 1489 end loop; 1490 1491 In_Range := False; 1492 1493 end if; 1494 1495 end loop; 1496 1497 -- Optimize case-insensitive ranges (put the upper case or lower 1498 -- case character into the bitmap) 1499 1500 if (Flags and Case_Insensitive) /= 0 then 1501 for C in Character'Range loop 1502 if Get_From_Class (Bitmap, C) then 1503 Set_In_Class (Bitmap, To_Lower (C)); 1504 Set_In_Class (Bitmap, To_Upper (C)); 1505 end if; 1506 end loop; 1507 end if; 1508 1509 -- Optimize inverted classes 1510 1511 if Invert then 1512 for J in Bitmap'Range loop 1513 Bitmap (J) := not Bitmap (J); 1514 end loop; 1515 end if; 1516 1517 Parse_Pos := Parse_Pos + 1; 1518 1519 -- Emit the class 1520 1521 IP := Emit_Node (ANYOF); 1522 Emit_Class (Bitmap); 1523 end Parse_Character_Class; 1524 1525 ------------------- 1526 -- Parse_Literal -- 1527 ------------------- 1528 1529 -- This is a bit tricky due to quoted chars and due to 1530 -- the multiplier characters '*', '+', and '?' that 1531 -- take the SINGLE char previous as their operand. 1532 1533 -- On entry, the character at Parse_Pos - 1 is going to go 1534 -- into the string, no matter what it is. It could be 1535 -- following a \ if Parse_Atom was entered from the '\' case. 1536 1537 -- Basic idea is to pick up a good char in C and examine 1538 -- the next char. If Is_Mult (C) then twiddle, if it's a \ 1539 -- then frozzle and if it's another magic char then push C and 1540 -- terminate the string. If none of the above, push C on the 1541 -- string and go around again. 1542 1543 -- Start_Pos is used to remember where "the current character" 1544 -- starts in the string, if due to an Is_Mult we need to back 1545 -- up and put the current char in a separate 1-character string. 1546 -- When Start_Pos is 0, C is the only char in the string; 1547 -- this is used in Is_Mult handling, and in setting the SIMPLE 1548 -- flag at the end. 1549 1550 procedure Parse_Literal 1551 (Expr_Flags : out Expression_Flags; 1552 IP : out Pointer) 1553 is 1554 Start_Pos : Natural := 0; 1555 C : Character; 1556 Length_Ptr : Pointer; 1557 1558 Has_Special_Operator : Boolean := False; 1559 1560 begin 1561 Parse_Pos := Parse_Pos - 1; -- Look at current character 1562 1563 IP := 1564 Emit_Node 1565 (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); 1566 1567 Length_Ptr := Emit_Ptr; 1568 Emit_Ptr := String_Operand (IP); 1569 1570 Parse_Loop : 1571 loop 1572 C := Expression (Parse_Pos); -- Get current character 1573 1574 case C is 1575 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => 1576 1577 if Start_Pos = 0 then 1578 Start_Pos := Parse_Pos; 1579 Emit (C); -- First character is always emitted 1580 else 1581 exit Parse_Loop; -- Else we are done 1582 end if; 1583 1584 when '?' | '+' | '*' | '{' => 1585 1586 if Start_Pos = 0 then 1587 Start_Pos := Parse_Pos; 1588 Emit (C); -- First character is always emitted 1589 1590 -- Are we looking at an operator, or is this 1591 -- simply a normal character ? 1592 1593 elsif not Is_Mult (Parse_Pos) then 1594 Start_Pos := Parse_Pos; 1595 Case_Emit (C); 1596 1597 else 1598 -- We've got something like "abc?d". Mark this as a 1599 -- special case. What we want to emit is a first 1600 -- constant string for "ab", then one for "c" that will 1601 -- ultimately be transformed with a CURLY operator, A 1602 -- special case has to be handled for "a?", since there 1603 -- is no initial string to emit. 1604 1605 Has_Special_Operator := True; 1606 exit Parse_Loop; 1607 end if; 1608 1609 when '\' => 1610 Start_Pos := Parse_Pos; 1611 1612 if Parse_Pos = Parse_End then 1613 Fail ("Trailing \"); 1614 1615 else 1616 case Expression (Parse_Pos + 1) is 1617 when 'b' | 'B' | 's' | 'S' | 'd' | 'D' 1618 | 'w' | 'W' | '0' .. '9' | 'G' | 'A' 1619 => exit Parse_Loop; 1620 when 'n' => Emit (ASCII.LF); 1621 when 't' => Emit (ASCII.HT); 1622 when 'r' => Emit (ASCII.CR); 1623 when 'f' => Emit (ASCII.FF); 1624 when 'e' => Emit (ASCII.ESC); 1625 when 'a' => Emit (ASCII.BEL); 1626 when others => Emit (Expression (Parse_Pos + 1)); 1627 end case; 1628 1629 Parse_Pos := Parse_Pos + 1; 1630 end if; 1631 1632 when others => 1633 Start_Pos := Parse_Pos; 1634 Case_Emit (C); 1635 end case; 1636 1637 Parse_Pos := Parse_Pos + 1; 1638 exit Parse_Loop when Parse_Pos > Parse_End 1639 or else Emit_Ptr - Length_Ptr = 254; 1640 end loop Parse_Loop; 1641 1642 -- Is the string followed by a '*+?{' operator ? If yes, and if there 1643 -- is an initial string to emit, do it now. 1644 1645 if Has_Special_Operator 1646 and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes 1647 then 1648 Emit_Ptr := Emit_Ptr - 1; 1649 Parse_Pos := Start_Pos; 1650 end if; 1651 1652 if Length_Ptr <= PM.Size then 1653 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); 1654 end if; 1655 1656 Expr_Flags.Has_Width := True; 1657 1658 -- Slight optimization when there is a single character 1659 1660 if Emit_Ptr = Length_Ptr + 2 then 1661 Expr_Flags.Simple := True; 1662 end if; 1663 end Parse_Literal; 1664 1665 ----------------- 1666 -- Parse_Piece -- 1667 ----------------- 1668 1669 -- Note that the branching code sequences used for '?' and the 1670 -- general cases of '*' and + are somewhat optimized: they use 1671 -- the same NOTHING node as both the endmarker for their branch 1672 -- list and the body of the last branch. It might seem that 1673 -- this node could be dispensed with entirely, but the endmarker 1674 -- role is not redundant. 1675 1676 procedure Parse_Piece 1677 (Expr_Flags : out Expression_Flags; 1678 IP : out Pointer) 1679 is 1680 Op : Character; 1681 New_Flags : Expression_Flags; 1682 Greedy : Boolean := True; 1683 1684 begin 1685 Parse_Atom (New_Flags, IP); 1686 1687 if IP = 0 then 1688 return; 1689 end if; 1690 1691 if Parse_Pos > Parse_End 1692 or else not Is_Mult (Parse_Pos) 1693 then 1694 Expr_Flags := New_Flags; 1695 return; 1696 end if; 1697 1698 Op := Expression (Parse_Pos); 1699 1700 Expr_Flags := 1701 (if Op /= '+' 1702 then (SP_Start => True, others => False) 1703 else (Has_Width => True, others => False)); 1704 1705 -- Detect non greedy operators in the easy cases 1706 1707 if Op /= '{' 1708 and then Parse_Pos + 1 <= Parse_End 1709 and then Expression (Parse_Pos + 1) = '?' 1710 then 1711 Greedy := False; 1712 Parse_Pos := Parse_Pos + 1; 1713 end if; 1714 1715 -- Generate the byte code 1716 1717 case Op is 1718 when '*' => 1719 1720 if New_Flags.Simple then 1721 Insert_Operator (STAR, IP, Greedy); 1722 else 1723 Link_Tail (IP, Emit_Node (WHILEM)); 1724 Insert_Curly_Operator 1725 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); 1726 Link_Tail (IP, Emit_Node (NOTHING)); 1727 end if; 1728 1729 when '+' => 1730 1731 if New_Flags.Simple then 1732 Insert_Operator (PLUS, IP, Greedy); 1733 else 1734 Link_Tail (IP, Emit_Node (WHILEM)); 1735 Insert_Curly_Operator 1736 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); 1737 Link_Tail (IP, Emit_Node (NOTHING)); 1738 end if; 1739 1740 when '?' => 1741 if New_Flags.Simple then 1742 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); 1743 else 1744 Link_Tail (IP, Emit_Node (WHILEM)); 1745 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); 1746 Link_Tail (IP, Emit_Node (NOTHING)); 1747 end if; 1748 1749 when '{' => 1750 declare 1751 Min, Max : Natural; 1752 1753 begin 1754 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); 1755 1756 if New_Flags.Simple then 1757 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); 1758 else 1759 Link_Tail (IP, Emit_Node (WHILEM)); 1760 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); 1761 Link_Tail (IP, Emit_Node (NOTHING)); 1762 end if; 1763 end; 1764 1765 when others => 1766 null; 1767 end case; 1768 1769 Parse_Pos := Parse_Pos + 1; 1770 1771 if Parse_Pos <= Parse_End 1772 and then Is_Mult (Parse_Pos) 1773 then 1774 Fail ("nested *+{"); 1775 end if; 1776 end Parse_Piece; 1777 1778 --------------------------------- 1779 -- Parse_Posix_Character_Class -- 1780 --------------------------------- 1781 1782 function Parse_Posix_Character_Class return Std_Class is 1783 Invert : Boolean := False; 1784 Class : Std_Class := ANYOF_NONE; 1785 E : String renames Expression; 1786 1787 -- Class names. Note that code assumes that the length of all 1788 -- classes starting with the same letter have the same length. 1789 1790 Alnum : constant String := "alnum:]"; 1791 Alpha : constant String := "alpha:]"; 1792 Ascii_C : constant String := "ascii:]"; 1793 Cntrl : constant String := "cntrl:]"; 1794 Digit : constant String := "digit:]"; 1795 Graph : constant String := "graph:]"; 1796 Lower : constant String := "lower:]"; 1797 Print : constant String := "print:]"; 1798 Punct : constant String := "punct:]"; 1799 Space : constant String := "space:]"; 1800 Upper : constant String := "upper:]"; 1801 Word : constant String := "word:]"; 1802 Xdigit : constant String := "xdigit:]"; 1803 1804 begin 1805 -- Case of character class specified 1806 1807 if Parse_Pos <= Parse_End 1808 and then Expression (Parse_Pos) = ':' 1809 then 1810 Parse_Pos := Parse_Pos + 1; 1811 1812 -- Do we have something like: [[:^alpha:]] 1813 1814 if Parse_Pos <= Parse_End 1815 and then Expression (Parse_Pos) = '^' 1816 then 1817 Invert := True; 1818 Parse_Pos := Parse_Pos + 1; 1819 end if; 1820 1821 -- Check for class names based on first letter 1822 1823 case Expression (Parse_Pos) is 1824 when 'a' => 1825 1826 -- All 'a' classes have the same length (Alnum'Length) 1827 1828 if Parse_Pos + Alnum'Length - 1 <= Parse_End then 1829 if 1830 E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum 1831 then 1832 Class := 1833 (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); 1834 Parse_Pos := Parse_Pos + Alnum'Length; 1835 1836 elsif 1837 E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha 1838 then 1839 Class := 1840 (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); 1841 Parse_Pos := Parse_Pos + Alpha'Length; 1842 1843 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = 1844 Ascii_C 1845 then 1846 Class := 1847 (if Invert then ANYOF_NASCII else ANYOF_ASCII); 1848 Parse_Pos := Parse_Pos + Ascii_C'Length; 1849 else 1850 Fail ("Invalid character class: " & E); 1851 end if; 1852 1853 else 1854 Fail ("Invalid character class: " & E); 1855 end if; 1856 1857 when 'c' => 1858 if Parse_Pos + Cntrl'Length - 1 <= Parse_End 1859 and then 1860 E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl 1861 then 1862 Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); 1863 Parse_Pos := Parse_Pos + Cntrl'Length; 1864 else 1865 Fail ("Invalid character class: " & E); 1866 end if; 1867 1868 when 'd' => 1869 if Parse_Pos + Digit'Length - 1 <= Parse_End 1870 and then 1871 E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit 1872 then 1873 Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); 1874 Parse_Pos := Parse_Pos + Digit'Length; 1875 end if; 1876 1877 when 'g' => 1878 if Parse_Pos + Graph'Length - 1 <= Parse_End 1879 and then 1880 E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph 1881 then 1882 Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); 1883 Parse_Pos := Parse_Pos + Graph'Length; 1884 else 1885 Fail ("Invalid character class: " & E); 1886 end if; 1887 1888 when 'l' => 1889 if Parse_Pos + Lower'Length - 1 <= Parse_End 1890 and then 1891 E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower 1892 then 1893 Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); 1894 Parse_Pos := Parse_Pos + Lower'Length; 1895 else 1896 Fail ("Invalid character class: " & E); 1897 end if; 1898 1899 when 'p' => 1900 1901 -- All 'p' classes have the same length 1902 1903 if Parse_Pos + Print'Length - 1 <= Parse_End then 1904 if 1905 E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print 1906 then 1907 Class := 1908 (if Invert then ANYOF_NPRINT else ANYOF_PRINT); 1909 Parse_Pos := Parse_Pos + Print'Length; 1910 1911 elsif 1912 E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct 1913 then 1914 Class := 1915 (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); 1916 Parse_Pos := Parse_Pos + Punct'Length; 1917 1918 else 1919 Fail ("Invalid character class: " & E); 1920 end if; 1921 1922 else 1923 Fail ("Invalid character class: " & E); 1924 end if; 1925 1926 when 's' => 1927 if Parse_Pos + Space'Length - 1 <= Parse_End 1928 and then 1929 E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space 1930 then 1931 Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); 1932 Parse_Pos := Parse_Pos + Space'Length; 1933 else 1934 Fail ("Invalid character class: " & E); 1935 end if; 1936 1937 when 'u' => 1938 if Parse_Pos + Upper'Length - 1 <= Parse_End 1939 and then 1940 E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper 1941 then 1942 Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); 1943 Parse_Pos := Parse_Pos + Upper'Length; 1944 else 1945 Fail ("Invalid character class: " & E); 1946 end if; 1947 1948 when 'w' => 1949 if Parse_Pos + Word'Length - 1 <= Parse_End 1950 and then 1951 E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word 1952 then 1953 Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); 1954 Parse_Pos := Parse_Pos + Word'Length; 1955 else 1956 Fail ("Invalid character class: " & E); 1957 end if; 1958 1959 when 'x' => 1960 if Parse_Pos + Xdigit'Length - 1 <= Parse_End 1961 and then 1962 E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit 1963 then 1964 Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); 1965 Parse_Pos := Parse_Pos + Xdigit'Length; 1966 1967 else 1968 Fail ("Invalid character class: " & E); 1969 end if; 1970 1971 when others => 1972 Fail ("Invalid character class: " & E); 1973 end case; 1974 1975 -- Character class not specified 1976 1977 else 1978 return ANYOF_NONE; 1979 end if; 1980 1981 return Class; 1982 end Parse_Posix_Character_Class; 1983 1984 -- Local Declarations 1985 1986 Result : Pointer; 1987 1988 Expr_Flags : Expression_Flags; 1989 pragma Unreferenced (Expr_Flags); 1990 1991 -- Start of processing for Compile 1992 1993 begin 1994 Parse (False, False, Expr_Flags, Result); 1995 1996 if Result = 0 then 1997 Fail ("Couldn't compile expression"); 1998 end if; 1999 2000 Final_Code_Size := Emit_Ptr - 1; 2001 2002 -- Do we want to actually compile the expression, or simply get the 2003 -- code size ??? 2004 2005 if Emit_Ptr <= PM.Size then 2006 Optimize (PM); 2007 end if; 2008 2009 PM.Flags := Flags; 2010 end Compile; 2011 2012 function Compile 2013 (Expression : String; 2014 Flags : Regexp_Flags := No_Flags) return Pattern_Matcher 2015 is 2016 -- Assume the compiled regexp will fit in 1000 chars. If it does not we 2017 -- will have to compile a second time once the correct size is known. If 2018 -- it fits, we save a significant amount of time by avoiding the second 2019 -- compilation. 2020 2021 Dummy : Pattern_Matcher (1000); 2022 Size : Program_Size; 2023 2024 begin 2025 Compile (Dummy, Expression, Size, Flags); 2026 2027 if Size <= Dummy.Size then 2028 return Pattern_Matcher' 2029 (Size => Size, 2030 First => Dummy.First, 2031 Anchored => Dummy.Anchored, 2032 Must_Have => Dummy.Must_Have, 2033 Must_Have_Length => Dummy.Must_Have_Length, 2034 Paren_Count => Dummy.Paren_Count, 2035 Flags => Dummy.Flags, 2036 Program => 2037 Dummy.Program 2038 (Dummy.Program'First .. Dummy.Program'First + Size - 1)); 2039 else 2040 -- We have to recompile now that we know the size 2041 -- ??? Can we use Ada 2005's return construct ? 2042 2043 declare 2044 Result : Pattern_Matcher (Size); 2045 begin 2046 Compile (Result, Expression, Size, Flags); 2047 return Result; 2048 end; 2049 end if; 2050 end Compile; 2051 2052 procedure Compile 2053 (Matcher : out Pattern_Matcher; 2054 Expression : String; 2055 Flags : Regexp_Flags := No_Flags) 2056 is 2057 Size : Program_Size; 2058 2059 begin 2060 Compile (Matcher, Expression, Size, Flags); 2061 2062 if Size > Matcher.Size then 2063 raise Expression_Error with "Pattern_Matcher is too small"; 2064 end if; 2065 end Compile; 2066 2067 -------------------- 2068 -- Dump_Operation -- 2069 -------------------- 2070 2071 procedure Dump_Operation 2072 (Program : Program_Data; 2073 Index : Pointer; 2074 Indent : Natural) 2075 is 2076 Current : Pointer := Index; 2077 begin 2078 Dump_Until (Program, Current, Current + 1, Indent); 2079 end Dump_Operation; 2080 2081 ---------------- 2082 -- Dump_Until -- 2083 ---------------- 2084 2085 procedure Dump_Until 2086 (Program : Program_Data; 2087 Index : in out Pointer; 2088 Till : Pointer; 2089 Indent : Natural; 2090 Do_Print : Boolean := True) 2091 is 2092 function Image (S : String) return String; 2093 -- Remove leading space 2094 2095 ----------- 2096 -- Image -- 2097 ----------- 2098 2099 function Image (S : String) return String is 2100 begin 2101 if S (S'First) = ' ' then 2102 return S (S'First + 1 .. S'Last); 2103 else 2104 return S; 2105 end if; 2106 end Image; 2107 2108 -- Local variables 2109 2110 Op : Opcode; 2111 Next : Pointer; 2112 Length : Pointer; 2113 Local_Indent : Natural := Indent; 2114 2115 -- Start of processing for Dump_Until 2116 2117 begin 2118 while Index < Till loop 2119 Op := Opcode'Val (Character'Pos ((Program (Index)))); 2120 Next := Get_Next (Program, Index); 2121 2122 if Do_Print then 2123 declare 2124 Point : constant String := Pointer'Image (Index); 2125 begin 2126 Put ((1 .. 4 - Point'Length => ' ') 2127 & Point & ":" 2128 & (1 .. Local_Indent * 2 => ' ') & Opcode'Image (Op)); 2129 end; 2130 2131 -- Print the parenthesis number 2132 2133 if Op = OPEN or else Op = CLOSE or else Op = REFF then 2134 Put (Image (Natural'Image 2135 (Character'Pos 2136 (Program (Index + Next_Pointer_Bytes))))); 2137 end if; 2138 2139 if Next = Index then 2140 Put (" (-)"); 2141 else 2142 Put (" (" & Image (Pointer'Image (Next)) & ")"); 2143 end if; 2144 end if; 2145 2146 case Op is 2147 when ANYOF => 2148 declare 2149 Bitmap : Character_Class; 2150 Last : Character := ASCII.NUL; 2151 Current : Natural := 0; 2152 Current_Char : Character; 2153 2154 begin 2155 Bitmap_Operand (Program, Index, Bitmap); 2156 2157 if Do_Print then 2158 Put ("["); 2159 2160 while Current <= 255 loop 2161 Current_Char := Character'Val (Current); 2162 2163 -- First item in a range 2164 2165 if Get_From_Class (Bitmap, Current_Char) then 2166 Last := Current_Char; 2167 2168 -- Search for the last item in the range 2169 2170 loop 2171 Current := Current + 1; 2172 exit when Current > 255; 2173 Current_Char := Character'Val (Current); 2174 exit when 2175 not Get_From_Class (Bitmap, Current_Char); 2176 end loop; 2177 2178 if not Is_Graphic (Last) then 2179 Put (Last'Img); 2180 else 2181 Put (Last); 2182 end if; 2183 2184 if Character'Succ (Last) /= Current_Char then 2185 Put ("\-" & Character'Pred (Current_Char)); 2186 end if; 2187 2188 else 2189 Current := Current + 1; 2190 end if; 2191 end loop; 2192 2193 Put_Line ("]"); 2194 end if; 2195 2196 Index := Index + Next_Pointer_Bytes + Bitmap'Length; 2197 end; 2198 2199 when EXACT | EXACTF => 2200 Length := String_Length (Program, Index); 2201 if Do_Print then 2202 Put (" (" & Image (Program_Size'Image (Length + 1)) 2203 & " chars) <" 2204 & String (Program (String_Operand (Index) 2205 .. String_Operand (Index) 2206 + Length))); 2207 Put_Line (">"); 2208 end if; 2209 2210 Index := String_Operand (Index) + Length + 1; 2211 2212 -- Node operand 2213 2214 when BRANCH | STAR | PLUS => 2215 if Do_Print then 2216 New_Line; 2217 end if; 2218 2219 Index := Index + Next_Pointer_Bytes; 2220 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2221 Local_Indent + 1, Do_Print); 2222 2223 when CURLY | CURLYX => 2224 if Do_Print then 2225 Put_Line 2226 (" {" 2227 & Image (Natural'Image 2228 (Read_Natural (Program, Index + Next_Pointer_Bytes))) 2229 & "," 2230 & Image (Natural'Image (Read_Natural (Program, Index + 5))) 2231 & "}"); 2232 end if; 2233 2234 Index := Index + 7; 2235 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2236 Local_Indent + 1, Do_Print); 2237 2238 when OPEN => 2239 if Do_Print then 2240 New_Line; 2241 end if; 2242 2243 Index := Index + 4; 2244 Local_Indent := Local_Indent + 1; 2245 2246 when CLOSE | REFF => 2247 if Do_Print then 2248 New_Line; 2249 end if; 2250 2251 Index := Index + 4; 2252 2253 if Op = CLOSE then 2254 Local_Indent := Local_Indent - 1; 2255 end if; 2256 2257 when others => 2258 Index := Index + Next_Pointer_Bytes; 2259 2260 if Do_Print then 2261 New_Line; 2262 end if; 2263 2264 exit when Op = EOP; 2265 end case; 2266 end loop; 2267 end Dump_Until; 2268 2269 ---------- 2270 -- Dump -- 2271 ---------- 2272 2273 procedure Dump (Self : Pattern_Matcher) is 2274 Program : Program_Data renames Self.Program; 2275 Index : Pointer := Program'First; 2276 2277 -- Start of processing for Dump 2278 2279 begin 2280 Put_Line ("Must start with (Self.First) = " 2281 & Character'Image (Self.First)); 2282 2283 if (Self.Flags and Case_Insensitive) /= 0 then 2284 Put_Line (" Case_Insensitive mode"); 2285 end if; 2286 2287 if (Self.Flags and Single_Line) /= 0 then 2288 Put_Line (" Single_Line mode"); 2289 end if; 2290 2291 if (Self.Flags and Multiple_Lines) /= 0 then 2292 Put_Line (" Multiple_Lines mode"); 2293 end if; 2294 2295 Dump_Until (Program, Index, Self.Program'Last + 1, 0); 2296 end Dump; 2297 2298 -------------------- 2299 -- Get_From_Class -- 2300 -------------------- 2301 2302 function Get_From_Class 2303 (Bitmap : Character_Class; 2304 C : Character) return Boolean 2305 is 2306 Value : constant Class_Byte := Character'Pos (C); 2307 begin 2308 return 2309 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; 2310 end Get_From_Class; 2311 2312 -------------- 2313 -- Get_Next -- 2314 -------------- 2315 2316 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is 2317 begin 2318 return IP + Pointer (Read_Natural (Program, IP + 1)); 2319 end Get_Next; 2320 2321 -------------- 2322 -- Is_Alnum -- 2323 -------------- 2324 2325 function Is_Alnum (C : Character) return Boolean is 2326 begin 2327 return Is_Alphanumeric (C) or else C = '_'; 2328 end Is_Alnum; 2329 2330 ------------------ 2331 -- Is_Printable -- 2332 ------------------ 2333 2334 function Is_Printable (C : Character) return Boolean is 2335 begin 2336 -- Printable if space or graphic character or other whitespace 2337 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) 2338 2339 return C in Character'Val (32) .. Character'Val (126) 2340 or else C in ASCII.HT .. ASCII.CR; 2341 end Is_Printable; 2342 2343 -------------------- 2344 -- Is_White_Space -- 2345 -------------------- 2346 2347 function Is_White_Space (C : Character) return Boolean is 2348 begin 2349 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 2350 2351 return C = ' ' or else C in ASCII.HT .. ASCII.CR; 2352 end Is_White_Space; 2353 2354 ----------- 2355 -- Match -- 2356 ----------- 2357 2358 procedure Match 2359 (Self : Pattern_Matcher; 2360 Data : String; 2361 Matches : out Match_Array; 2362 Data_First : Integer := -1; 2363 Data_Last : Positive := Positive'Last) 2364 is 2365 Program : Program_Data renames Self.Program; -- Shorter notation 2366 2367 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); 2368 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); 2369 2370 -- Global work variables 2371 2372 Input_Pos : Natural; -- String-input pointer 2373 BOL_Pos : Natural; -- Beginning of input, for ^ check 2374 Matched : Boolean := False; -- Until proven True 2375 2376 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, 2377 Matches'Last)); 2378 -- Stores the value of all the parenthesis pairs. 2379 -- We do not use directly Matches, so that we can also use back 2380 -- references (REFF) even if Matches is too small. 2381 2382 type Natural_Array is array (Match_Count range <>) of Natural; 2383 Matches_Tmp : Natural_Array (Matches_Full'Range); 2384 -- Save the opening position of parenthesis 2385 2386 Last_Paren : Natural := 0; 2387 -- Last parenthesis seen 2388 2389 Greedy : Boolean := True; 2390 -- True if the next operator should be greedy 2391 2392 type Current_Curly_Record; 2393 type Current_Curly_Access is access all Current_Curly_Record; 2394 type Current_Curly_Record is record 2395 Paren_Floor : Natural; -- How far back to strip parenthesis data 2396 Cur : Integer; -- How many instances of scan we've matched 2397 Min : Natural; -- Minimal number of scans to match 2398 Max : Natural; -- Maximal number of scans to match 2399 Greedy : Boolean; -- Whether to work our way up or down 2400 Scan : Pointer; -- The thing to match 2401 Next : Pointer; -- What has to match after it 2402 Lastloc : Natural; -- Where we started matching this scan 2403 Old_Cc : Current_Curly_Access; -- Before we started this one 2404 end record; 2405 -- Data used to handle the curly operator and the plus and star 2406 -- operators for complex expressions. 2407 2408 Current_Curly : Current_Curly_Access := null; 2409 -- The curly currently being processed 2410 2411 ----------------------- 2412 -- Local Subprograms -- 2413 ----------------------- 2414 2415 function Index (Start : Positive; C : Character) return Natural; 2416 -- Find character C in Data starting at Start and return position 2417 2418 function Repeat 2419 (IP : Pointer; 2420 Max : Natural := Natural'Last) return Natural; 2421 -- Repeatedly match something simple, report how many 2422 -- It only matches on things of length 1. 2423 -- Starting from Input_Pos, it matches at most Max CURLY. 2424 2425 function Try (Pos : Positive) return Boolean; 2426 -- Try to match at specific point 2427 2428 function Match (IP : Pointer) return Boolean; 2429 -- This is the main matching routine. Conceptually the strategy 2430 -- is simple: check to see whether the current node matches, 2431 -- call self recursively to see whether the rest matches, 2432 -- and then act accordingly. 2433 -- 2434 -- In practice Match makes some effort to avoid recursion, in 2435 -- particular by going through "ordinary" nodes (that don't 2436 -- need to know whether the rest of the match failed) by 2437 -- using a loop instead of recursion. 2438 -- Why is the above comment part of the spec rather than body ??? 2439 2440 function Match_Whilem return Boolean; 2441 -- Return True if a WHILEM matches the Current_Curly 2442 2443 function Recurse_Match (IP : Pointer; From : Natural) return Boolean; 2444 pragma Inline (Recurse_Match); 2445 -- Calls Match recursively. It saves and restores the parenthesis 2446 -- status and location in the input stream correctly, so that 2447 -- backtracking is possible 2448 2449 function Match_Simple_Operator 2450 (Op : Opcode; 2451 Scan : Pointer; 2452 Next : Pointer; 2453 Greedy : Boolean) return Boolean; 2454 -- Return True it the simple operator (possibly non-greedy) matches 2455 2456 Dump_Indent : Integer := -1; 2457 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); 2458 procedure Dump_Error (Msg : String); 2459 -- Debug: print the current context 2460 2461 pragma Inline (Index); 2462 pragma Inline (Repeat); 2463 2464 -- These are two complex functions, but used only once 2465 2466 pragma Inline (Match_Whilem); 2467 pragma Inline (Match_Simple_Operator); 2468 2469 ----------- 2470 -- Index -- 2471 ----------- 2472 2473 function Index (Start : Positive; C : Character) return Natural is 2474 begin 2475 for J in Start .. Last_In_Data loop 2476 if Data (J) = C then 2477 return J; 2478 end if; 2479 end loop; 2480 2481 return 0; 2482 end Index; 2483 2484 ------------------- 2485 -- Recurse_Match -- 2486 ------------------- 2487 2488 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is 2489 L : constant Natural := Last_Paren; 2490 Tmp_F : constant Match_Array := 2491 Matches_Full (From + 1 .. Matches_Full'Last); 2492 Start : constant Natural_Array := 2493 Matches_Tmp (From + 1 .. Matches_Tmp'Last); 2494 Input : constant Natural := Input_Pos; 2495 2496 Dump_Indent_Save : constant Integer := Dump_Indent; 2497 2498 begin 2499 if Match (IP) then 2500 return True; 2501 end if; 2502 2503 Last_Paren := L; 2504 Matches_Full (Tmp_F'Range) := Tmp_F; 2505 Matches_Tmp (Start'Range) := Start; 2506 Input_Pos := Input; 2507 Dump_Indent := Dump_Indent_Save; 2508 return False; 2509 end Recurse_Match; 2510 2511 ------------------ 2512 -- Dump_Current -- 2513 ------------------ 2514 2515 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is 2516 Length : constant := 10; 2517 Pos : constant String := Integer'Image (Input_Pos); 2518 2519 begin 2520 if Prefix then 2521 Put ((1 .. 5 - Pos'Length => ' ')); 2522 Put (Pos & " <" 2523 & Data (Input_Pos 2524 .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); 2525 Put ((1 .. Length - 1 - Last_In_Data + Input_Pos => ' ')); 2526 Put ("> |"); 2527 2528 else 2529 Put (" "); 2530 end if; 2531 2532 Dump_Operation (Program, Scan, Indent => Dump_Indent); 2533 end Dump_Current; 2534 2535 ---------------- 2536 -- Dump_Error -- 2537 ---------------- 2538 2539 procedure Dump_Error (Msg : String) is 2540 begin 2541 Put (" | "); 2542 Put ((1 .. Dump_Indent * 2 => ' ')); 2543 Put_Line (Msg); 2544 end Dump_Error; 2545 2546 ----------- 2547 -- Match -- 2548 ----------- 2549 2550 function Match (IP : Pointer) return Boolean is 2551 Scan : Pointer := IP; 2552 Next : Pointer; 2553 Op : Opcode; 2554 Result : Boolean; 2555 2556 begin 2557 Dump_Indent := Dump_Indent + 1; 2558 2559 State_Machine : 2560 loop 2561 pragma Assert (Scan /= 0); 2562 2563 -- Determine current opcode and count its usage in debug mode 2564 2565 Op := Opcode'Val (Character'Pos (Program (Scan))); 2566 2567 -- Calculate offset of next instruction. Second character is most 2568 -- significant in Program_Data. 2569 2570 Next := Get_Next (Program, Scan); 2571 2572 if Debug then 2573 Dump_Current (Scan); 2574 end if; 2575 2576 case Op is 2577 when EOP => 2578 Dump_Indent := Dump_Indent - 1; 2579 return True; -- Success 2580 2581 when BRANCH => 2582 if Program (Next) /= BRANCH then 2583 Next := Operand (Scan); -- No choice, avoid recursion 2584 2585 else 2586 loop 2587 if Recurse_Match (Operand (Scan), 0) then 2588 Dump_Indent := Dump_Indent - 1; 2589 return True; 2590 end if; 2591 2592 Scan := Get_Next (Program, Scan); 2593 exit when Scan = 0 or else Program (Scan) /= BRANCH; 2594 end loop; 2595 2596 exit State_Machine; 2597 end if; 2598 2599 when NOTHING => 2600 null; 2601 2602 when BOL => 2603 exit State_Machine when Input_Pos /= BOL_Pos 2604 and then ((Self.Flags and Multiple_Lines) = 0 2605 or else Data (Input_Pos - 1) /= ASCII.LF); 2606 2607 when MBOL => 2608 exit State_Machine when Input_Pos /= BOL_Pos 2609 and then Data (Input_Pos - 1) /= ASCII.LF; 2610 2611 when SBOL => 2612 exit State_Machine when Input_Pos /= BOL_Pos; 2613 2614 when EOL => 2615 2616 -- A combination of MEOL and SEOL 2617 2618 if (Self.Flags and Multiple_Lines) = 0 then 2619 2620 -- Single line mode 2621 2622 exit State_Machine when Input_Pos <= Data'Last; 2623 2624 elsif Input_Pos <= Last_In_Data then 2625 exit State_Machine when Data (Input_Pos) /= ASCII.LF; 2626 else 2627 exit State_Machine when Last_In_Data /= Data'Last; 2628 end if; 2629 2630 when MEOL => 2631 if Input_Pos <= Last_In_Data then 2632 exit State_Machine when Data (Input_Pos) /= ASCII.LF; 2633 else 2634 exit State_Machine when Last_In_Data /= Data'Last; 2635 end if; 2636 2637 when SEOL => 2638 2639 -- If there is a character before Data'Last (even if 2640 -- Last_In_Data stops before then), we can't have the 2641 -- end of the line. 2642 2643 exit State_Machine when Input_Pos <= Data'Last; 2644 2645 when BOUND | NBOUND => 2646 2647 -- Was last char in word ? 2648 2649 declare 2650 N : Boolean := False; 2651 Ln : Boolean := False; 2652 2653 begin 2654 if Input_Pos /= First_In_Data then 2655 N := Is_Alnum (Data (Input_Pos - 1)); 2656 end if; 2657 2658 Ln := 2659 (if Input_Pos > Last_In_Data 2660 then False 2661 else Is_Alnum (Data (Input_Pos))); 2662 2663 if Op = BOUND then 2664 if N = Ln then 2665 exit State_Machine; 2666 end if; 2667 else 2668 if N /= Ln then 2669 exit State_Machine; 2670 end if; 2671 end if; 2672 end; 2673 2674 when SPACE => 2675 exit State_Machine when Input_Pos > Last_In_Data 2676 or else not Is_White_Space (Data (Input_Pos)); 2677 Input_Pos := Input_Pos + 1; 2678 2679 when NSPACE => 2680 exit State_Machine when Input_Pos > Last_In_Data 2681 or else Is_White_Space (Data (Input_Pos)); 2682 Input_Pos := Input_Pos + 1; 2683 2684 when DIGIT => 2685 exit State_Machine when Input_Pos > Last_In_Data 2686 or else not Is_Digit (Data (Input_Pos)); 2687 Input_Pos := Input_Pos + 1; 2688 2689 when NDIGIT => 2690 exit State_Machine when Input_Pos > Last_In_Data 2691 or else Is_Digit (Data (Input_Pos)); 2692 Input_Pos := Input_Pos + 1; 2693 2694 when ALNUM => 2695 exit State_Machine when Input_Pos > Last_In_Data 2696 or else not Is_Alnum (Data (Input_Pos)); 2697 Input_Pos := Input_Pos + 1; 2698 2699 when NALNUM => 2700 exit State_Machine when Input_Pos > Last_In_Data 2701 or else Is_Alnum (Data (Input_Pos)); 2702 Input_Pos := Input_Pos + 1; 2703 2704 when ANY => 2705 exit State_Machine when Input_Pos > Last_In_Data 2706 or else Data (Input_Pos) = ASCII.LF; 2707 Input_Pos := Input_Pos + 1; 2708 2709 when SANY => 2710 exit State_Machine when Input_Pos > Last_In_Data; 2711 Input_Pos := Input_Pos + 1; 2712 2713 when EXACT => 2714 declare 2715 Opnd : Pointer := String_Operand (Scan); 2716 Current : Positive := Input_Pos; 2717 Last : constant Pointer := 2718 Opnd + String_Length (Program, Scan); 2719 2720 begin 2721 while Opnd <= Last loop 2722 exit State_Machine when Current > Last_In_Data 2723 or else Program (Opnd) /= Data (Current); 2724 Current := Current + 1; 2725 Opnd := Opnd + 1; 2726 end loop; 2727 2728 Input_Pos := Current; 2729 end; 2730 2731 when EXACTF => 2732 declare 2733 Opnd : Pointer := String_Operand (Scan); 2734 Current : Positive := Input_Pos; 2735 2736 Last : constant Pointer := 2737 Opnd + String_Length (Program, Scan); 2738 2739 begin 2740 while Opnd <= Last loop 2741 exit State_Machine when Current > Last_In_Data 2742 or else Program (Opnd) /= To_Lower (Data (Current)); 2743 Current := Current + 1; 2744 Opnd := Opnd + 1; 2745 end loop; 2746 2747 Input_Pos := Current; 2748 end; 2749 2750 when ANYOF => 2751 declare 2752 Bitmap : Character_Class; 2753 begin 2754 Bitmap_Operand (Program, Scan, Bitmap); 2755 exit State_Machine when Input_Pos > Last_In_Data 2756 or else not Get_From_Class (Bitmap, Data (Input_Pos)); 2757 Input_Pos := Input_Pos + 1; 2758 end; 2759 2760 when OPEN => 2761 declare 2762 No : constant Natural := 2763 Character'Pos (Program (Operand (Scan))); 2764 begin 2765 Matches_Tmp (No) := Input_Pos; 2766 end; 2767 2768 when CLOSE => 2769 declare 2770 No : constant Natural := 2771 Character'Pos (Program (Operand (Scan))); 2772 2773 begin 2774 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); 2775 2776 if Last_Paren < No then 2777 Last_Paren := No; 2778 end if; 2779 end; 2780 2781 when REFF => 2782 declare 2783 No : constant Natural := 2784 Character'Pos (Program (Operand (Scan))); 2785 2786 Data_Pos : Natural; 2787 2788 begin 2789 -- If we haven't seen that parenthesis yet 2790 2791 if Last_Paren < No then 2792 Dump_Indent := Dump_Indent - 1; 2793 2794 if Debug then 2795 Dump_Error ("REFF: No match, backtracking"); 2796 end if; 2797 2798 return False; 2799 end if; 2800 2801 Data_Pos := Matches_Full (No).First; 2802 2803 while Data_Pos <= Matches_Full (No).Last loop 2804 if Input_Pos > Last_In_Data 2805 or else Data (Input_Pos) /= Data (Data_Pos) 2806 then 2807 Dump_Indent := Dump_Indent - 1; 2808 2809 if Debug then 2810 Dump_Error ("REFF: No match, backtracking"); 2811 end if; 2812 2813 return False; 2814 end if; 2815 2816 Input_Pos := Input_Pos + 1; 2817 Data_Pos := Data_Pos + 1; 2818 end loop; 2819 end; 2820 2821 when MINMOD => 2822 Greedy := False; 2823 2824 when STAR | PLUS | CURLY => 2825 declare 2826 Greed : constant Boolean := Greedy; 2827 begin 2828 Greedy := True; 2829 Result := Match_Simple_Operator (Op, Scan, Next, Greed); 2830 Dump_Indent := Dump_Indent - 1; 2831 return Result; 2832 end; 2833 2834 when CURLYX => 2835 2836 -- Looking at something like: 2837 2838 -- 1: CURLYX {n,m} (->4) 2839 -- 2: code for complex thing (->3) 2840 -- 3: WHILEM (->0) 2841 -- 4: NOTHING 2842 2843 declare 2844 Min : constant Natural := 2845 Read_Natural (Program, Scan + Next_Pointer_Bytes); 2846 Max : constant Natural := 2847 Read_Natural 2848 (Program, Scan + Next_Pointer_Bytes + 2); 2849 Cc : aliased Current_Curly_Record; 2850 2851 Has_Match : Boolean; 2852 2853 begin 2854 Cc := (Paren_Floor => Last_Paren, 2855 Cur => -1, 2856 Min => Min, 2857 Max => Max, 2858 Greedy => Greedy, 2859 Scan => Scan + 7, 2860 Next => Next, 2861 Lastloc => 0, 2862 Old_Cc => Current_Curly); 2863 Greedy := True; 2864 Current_Curly := Cc'Unchecked_Access; 2865 2866 Has_Match := Match (Next - Next_Pointer_Bytes); 2867 2868 -- Start on the WHILEM 2869 2870 Current_Curly := Cc.Old_Cc; 2871 Dump_Indent := Dump_Indent - 1; 2872 2873 if not Has_Match then 2874 if Debug then 2875 Dump_Error ("CURLYX failed..."); 2876 end if; 2877 end if; 2878 2879 return Has_Match; 2880 end; 2881 2882 when WHILEM => 2883 Result := Match_Whilem; 2884 Dump_Indent := Dump_Indent - 1; 2885 2886 if Debug and then not Result then 2887 Dump_Error ("WHILEM: no match, backtracking"); 2888 end if; 2889 2890 return Result; 2891 end case; 2892 2893 Scan := Next; 2894 end loop State_Machine; 2895 2896 if Debug then 2897 Dump_Error ("failed..."); 2898 Dump_Indent := Dump_Indent - 1; 2899 end if; 2900 2901 -- If we get here, there is no match. For successful matches when EOP 2902 -- is the terminating point. 2903 2904 return False; 2905 end Match; 2906 2907 --------------------------- 2908 -- Match_Simple_Operator -- 2909 --------------------------- 2910 2911 function Match_Simple_Operator 2912 (Op : Opcode; 2913 Scan : Pointer; 2914 Next : Pointer; 2915 Greedy : Boolean) return Boolean 2916 is 2917 Next_Char : Character := ASCII.NUL; 2918 Next_Char_Known : Boolean := False; 2919 No : Integer; -- Can be negative 2920 Min : Natural; 2921 Max : Natural := Natural'Last; 2922 Operand_Code : Pointer; 2923 Old : Natural; 2924 Last_Pos : Natural; 2925 Save : constant Natural := Input_Pos; 2926 2927 begin 2928 -- Lookahead to avoid useless match attempts when we know what 2929 -- character comes next. 2930 2931 if Program (Next) = EXACT then 2932 Next_Char := Program (String_Operand (Next)); 2933 Next_Char_Known := True; 2934 end if; 2935 2936 -- Find the minimal and maximal values for the operator 2937 2938 case Op is 2939 when STAR => 2940 Min := 0; 2941 Operand_Code := Operand (Scan); 2942 2943 when PLUS => 2944 Min := 1; 2945 Operand_Code := Operand (Scan); 2946 2947 when others => 2948 Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); 2949 Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); 2950 Operand_Code := Scan + 7; 2951 end case; 2952 2953 if Debug then 2954 Dump_Current (Operand_Code, Prefix => False); 2955 end if; 2956 2957 -- Non greedy operators 2958 2959 if not Greedy then 2960 2961 -- Test we can repeat at least Min times 2962 2963 if Min /= 0 then 2964 No := Repeat (Operand_Code, Min); 2965 2966 if No < Min then 2967 if Debug then 2968 Dump_Error ("failed... matched" & No'Img & " times"); 2969 end if; 2970 2971 return False; 2972 end if; 2973 end if; 2974 2975 Old := Input_Pos; 2976 2977 -- Find the place where 'next' could work 2978 2979 if Next_Char_Known then 2980 2981 -- Last position to check 2982 2983 if Max = Natural'Last then 2984 Last_Pos := Last_In_Data; 2985 else 2986 Last_Pos := Input_Pos + Max; 2987 2988 if Last_Pos > Last_In_Data then 2989 Last_Pos := Last_In_Data; 2990 end if; 2991 end if; 2992 2993 -- Look for the first possible opportunity 2994 2995 if Debug then 2996 Dump_Error ("Next_Char must be " & Next_Char); 2997 end if; 2998 2999 loop 3000 -- Find the next possible position 3001 3002 while Input_Pos <= Last_Pos 3003 and then Data (Input_Pos) /= Next_Char 3004 loop 3005 Input_Pos := Input_Pos + 1; 3006 end loop; 3007 3008 if Input_Pos > Last_Pos then 3009 return False; 3010 end if; 3011 3012 -- Check that we still match if we stop at the position we 3013 -- just found. 3014 3015 declare 3016 Num : constant Natural := Input_Pos - Old; 3017 3018 begin 3019 Input_Pos := Old; 3020 3021 if Debug then 3022 Dump_Error ("Would we still match at that position?"); 3023 end if; 3024 3025 if Repeat (Operand_Code, Num) < Num then 3026 return False; 3027 end if; 3028 end; 3029 3030 -- Input_Pos now points to the new position 3031 3032 if Match (Get_Next (Program, Scan)) then 3033 return True; 3034 end if; 3035 3036 Old := Input_Pos; 3037 Input_Pos := Input_Pos + 1; 3038 end loop; 3039 3040 -- We do not know what the next character is 3041 3042 else 3043 while Max >= Min loop 3044 if Debug then 3045 Dump_Error ("Non-greedy repeat, N=" & Min'Img); 3046 Dump_Error ("Do we still match Next if we stop here?"); 3047 end if; 3048 3049 -- If the next character matches 3050 3051 if Recurse_Match (Next, 1) then 3052 return True; 3053 end if; 3054 3055 Input_Pos := Save + Min; 3056 3057 -- Could not or did not match -- move forward 3058 3059 if Repeat (Operand_Code, 1) /= 0 then 3060 Min := Min + 1; 3061 else 3062 if Debug then 3063 Dump_Error ("Non-greedy repeat failed..."); 3064 end if; 3065 3066 return False; 3067 end if; 3068 end loop; 3069 end if; 3070 3071 return False; 3072 3073 -- Greedy operators 3074 3075 else 3076 No := Repeat (Operand_Code, Max); 3077 3078 if Debug and then No < Min then 3079 Dump_Error ("failed... matched" & No'Img & " times"); 3080 end if; 3081 3082 -- ??? Perl has some special code here in case the next 3083 -- instruction is of type EOL, since $ and \Z can match before 3084 -- *and* after newline at the end. 3085 3086 -- ??? Perl has some special code here in case (paren) is True 3087 3088 -- Else, if we don't have any parenthesis 3089 3090 while No >= Min loop 3091 if not Next_Char_Known 3092 or else (Input_Pos <= Last_In_Data 3093 and then Data (Input_Pos) = Next_Char) 3094 then 3095 if Match (Next) then 3096 return True; 3097 end if; 3098 end if; 3099 3100 -- Could not or did not work, we back up 3101 3102 No := No - 1; 3103 Input_Pos := Save + No; 3104 end loop; 3105 3106 return False; 3107 end if; 3108 end Match_Simple_Operator; 3109 3110 ------------------ 3111 -- Match_Whilem -- 3112 ------------------ 3113 3114 -- This is really hard to understand, because after we match what we 3115 -- are trying to match, we must make sure the rest of the REx is going 3116 -- to match for sure, and to do that we have to go back UP the parse 3117 -- tree by recursing ever deeper. And if it fails, we have to reset 3118 -- our parent's current state that we can try again after backing off. 3119 3120 function Match_Whilem return Boolean is 3121 Cc : constant Current_Curly_Access := Current_Curly; 3122 3123 N : constant Natural := Cc.Cur + 1; 3124 Ln : Natural := 0; 3125 3126 Lastloc : constant Natural := Cc.Lastloc; 3127 -- Detection of 0-len 3128 3129 begin 3130 -- If degenerate scan matches "", assume scan done 3131 3132 if Input_Pos = Cc.Lastloc 3133 and then N >= Cc.Min 3134 then 3135 -- Temporarily restore the old context, and check that we 3136 -- match was comes after CURLYX. 3137 3138 Current_Curly := Cc.Old_Cc; 3139 3140 if Current_Curly /= null then 3141 Ln := Current_Curly.Cur; 3142 end if; 3143 3144 if Match (Cc.Next) then 3145 return True; 3146 end if; 3147 3148 if Current_Curly /= null then 3149 Current_Curly.Cur := Ln; 3150 end if; 3151 3152 Current_Curly := Cc; 3153 return False; 3154 end if; 3155 3156 -- First, just match a string of min scans 3157 3158 if N < Cc.Min then 3159 Cc.Cur := N; 3160 Cc.Lastloc := Input_Pos; 3161 3162 if Debug then 3163 Dump_Error 3164 ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); 3165 end if; 3166 3167 if Match (Cc.Scan) then 3168 return True; 3169 end if; 3170 3171 Cc.Cur := N - 1; 3172 Cc.Lastloc := Lastloc; 3173 3174 if Debug then 3175 Dump_Error ("failed..."); 3176 end if; 3177 3178 return False; 3179 end if; 3180 3181 -- Prefer next over scan for minimal matching 3182 3183 if not Cc.Greedy then 3184 Current_Curly := Cc.Old_Cc; 3185 3186 if Current_Curly /= null then 3187 Ln := Current_Curly.Cur; 3188 end if; 3189 3190 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then 3191 return True; 3192 end if; 3193 3194 if Current_Curly /= null then 3195 Current_Curly.Cur := Ln; 3196 end if; 3197 3198 Current_Curly := Cc; 3199 3200 -- Maximum greed exceeded ? 3201 3202 if N >= Cc.Max then 3203 if Debug then 3204 Dump_Error ("failed..."); 3205 end if; 3206 return False; 3207 end if; 3208 3209 -- Try scanning more and see if it helps 3210 Cc.Cur := N; 3211 Cc.Lastloc := Input_Pos; 3212 3213 if Debug then 3214 Dump_Error ("Next failed, what about Current?"); 3215 end if; 3216 3217 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3218 return True; 3219 end if; 3220 3221 Cc.Cur := N - 1; 3222 Cc.Lastloc := Lastloc; 3223 return False; 3224 end if; 3225 3226 -- Prefer scan over next for maximal matching 3227 3228 if N < Cc.Max then -- more greed allowed ? 3229 Cc.Cur := N; 3230 Cc.Lastloc := Input_Pos; 3231 3232 if Debug then 3233 Dump_Error ("Recurse at current position"); 3234 end if; 3235 3236 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3237 return True; 3238 end if; 3239 end if; 3240 3241 -- Failed deeper matches of scan, so see if this one works 3242 3243 Current_Curly := Cc.Old_Cc; 3244 3245 if Current_Curly /= null then 3246 Ln := Current_Curly.Cur; 3247 end if; 3248 3249 if Debug then 3250 Dump_Error ("Failed matching for later positions"); 3251 end if; 3252 3253 if Match (Cc.Next) then 3254 return True; 3255 end if; 3256 3257 if Current_Curly /= null then 3258 Current_Curly.Cur := Ln; 3259 end if; 3260 3261 Current_Curly := Cc; 3262 Cc.Cur := N - 1; 3263 Cc.Lastloc := Lastloc; 3264 3265 if Debug then 3266 Dump_Error ("failed..."); 3267 end if; 3268 3269 return False; 3270 end Match_Whilem; 3271 3272 ------------ 3273 -- Repeat -- 3274 ------------ 3275 3276 function Repeat 3277 (IP : Pointer; 3278 Max : Natural := Natural'Last) return Natural 3279 is 3280 Scan : Natural := Input_Pos; 3281 Last : Natural; 3282 Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP))); 3283 Count : Natural; 3284 C : Character; 3285 Is_First : Boolean := True; 3286 Bitmap : Character_Class; 3287 3288 begin 3289 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then 3290 Last := Last_In_Data; 3291 else 3292 Last := Scan + Max - 1; 3293 end if; 3294 3295 case Op is 3296 when ANY => 3297 while Scan <= Last 3298 and then Data (Scan) /= ASCII.LF 3299 loop 3300 Scan := Scan + 1; 3301 end loop; 3302 3303 when SANY => 3304 Scan := Last + 1; 3305 3306 when EXACT => 3307 3308 -- The string has only one character if Repeat was called 3309 3310 C := Program (String_Operand (IP)); 3311 while Scan <= Last 3312 and then C = Data (Scan) 3313 loop 3314 Scan := Scan + 1; 3315 end loop; 3316 3317 when EXACTF => 3318 3319 -- The string has only one character if Repeat was called 3320 3321 C := Program (String_Operand (IP)); 3322 while Scan <= Last 3323 and then To_Lower (C) = Data (Scan) 3324 loop 3325 Scan := Scan + 1; 3326 end loop; 3327 3328 when ANYOF => 3329 if Is_First then 3330 Bitmap_Operand (Program, IP, Bitmap); 3331 Is_First := False; 3332 end if; 3333 3334 while Scan <= Last 3335 and then Get_From_Class (Bitmap, Data (Scan)) 3336 loop 3337 Scan := Scan + 1; 3338 end loop; 3339 3340 when ALNUM => 3341 while Scan <= Last 3342 and then Is_Alnum (Data (Scan)) 3343 loop 3344 Scan := Scan + 1; 3345 end loop; 3346 3347 when NALNUM => 3348 while Scan <= Last 3349 and then not Is_Alnum (Data (Scan)) 3350 loop 3351 Scan := Scan + 1; 3352 end loop; 3353 3354 when SPACE => 3355 while Scan <= Last 3356 and then Is_White_Space (Data (Scan)) 3357 loop 3358 Scan := Scan + 1; 3359 end loop; 3360 3361 when NSPACE => 3362 while Scan <= Last 3363 and then not Is_White_Space (Data (Scan)) 3364 loop 3365 Scan := Scan + 1; 3366 end loop; 3367 3368 when DIGIT => 3369 while Scan <= Last 3370 and then Is_Digit (Data (Scan)) 3371 loop 3372 Scan := Scan + 1; 3373 end loop; 3374 3375 when NDIGIT => 3376 while Scan <= Last 3377 and then not Is_Digit (Data (Scan)) 3378 loop 3379 Scan := Scan + 1; 3380 end loop; 3381 3382 when others => 3383 raise Program_Error; 3384 end case; 3385 3386 Count := Scan - Input_Pos; 3387 Input_Pos := Scan; 3388 return Count; 3389 end Repeat; 3390 3391 --------- 3392 -- Try -- 3393 --------- 3394 3395 function Try (Pos : Positive) return Boolean is 3396 begin 3397 Input_Pos := Pos; 3398 Last_Paren := 0; 3399 Matches_Full := (others => No_Match); 3400 3401 if Match (Program_First) then 3402 Matches_Full (0) := (Pos, Input_Pos - 1); 3403 return True; 3404 end if; 3405 3406 return False; 3407 end Try; 3408 3409 -- Start of processing for Match 3410 3411 begin 3412 -- Do we have the regexp Never_Match? 3413 3414 if Self.Size = 0 then 3415 Matches := (others => No_Match); 3416 return; 3417 end if; 3418 3419 -- If there is a "must appear" string, look for it 3420 3421 if Self.Must_Have_Length > 0 then 3422 declare 3423 First : constant Character := Program (Self.Must_Have); 3424 Must_First : constant Pointer := Self.Must_Have; 3425 Must_Last : constant Pointer := 3426 Must_First + Pointer (Self.Must_Have_Length - 1); 3427 Next_Try : Natural := Index (First_In_Data, First); 3428 3429 begin 3430 while Next_Try /= 0 3431 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) 3432 = String (Program (Must_First .. Must_Last)) 3433 loop 3434 Next_Try := Index (Next_Try + 1, First); 3435 end loop; 3436 3437 if Next_Try = 0 then 3438 Matches := (others => No_Match); 3439 return; -- Not present 3440 end if; 3441 end; 3442 end if; 3443 3444 -- Mark beginning of line for ^ 3445 3446 BOL_Pos := Data'First; 3447 3448 -- Simplest case first: an anchored match need be tried only once 3449 3450 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then 3451 Matched := Try (First_In_Data); 3452 3453 elsif Self.Anchored then 3454 declare 3455 Next_Try : Natural := First_In_Data; 3456 begin 3457 -- Test the first position in the buffer 3458 Matched := Try (Next_Try); 3459 3460 -- Else only test after newlines 3461 3462 if not Matched then 3463 while Next_Try <= Last_In_Data loop 3464 while Next_Try <= Last_In_Data 3465 and then Data (Next_Try) /= ASCII.LF 3466 loop 3467 Next_Try := Next_Try + 1; 3468 end loop; 3469 3470 Next_Try := Next_Try + 1; 3471 3472 if Next_Try <= Last_In_Data then 3473 Matched := Try (Next_Try); 3474 exit when Matched; 3475 end if; 3476 end loop; 3477 end if; 3478 end; 3479 3480 elsif Self.First /= ASCII.NUL then 3481 -- We know what char it must start with 3482 3483 declare 3484 Next_Try : Natural := Index (First_In_Data, Self.First); 3485 3486 begin 3487 while Next_Try /= 0 loop 3488 Matched := Try (Next_Try); 3489 exit when Matched; 3490 Next_Try := Index (Next_Try + 1, Self.First); 3491 end loop; 3492 end; 3493 3494 else 3495 -- Messy cases: try all locations (including for the empty string) 3496 3497 Matched := Try (First_In_Data); 3498 3499 if not Matched then 3500 for S in First_In_Data + 1 .. Last_In_Data loop 3501 Matched := Try (S); 3502 exit when Matched; 3503 end loop; 3504 end if; 3505 end if; 3506 3507 -- Matched has its value 3508 3509 for J in Last_Paren + 1 .. Matches'Last loop 3510 Matches_Full (J) := No_Match; 3511 end loop; 3512 3513 Matches := Matches_Full (Matches'Range); 3514 end Match; 3515 3516 ----------- 3517 -- Match -- 3518 ----------- 3519 3520 function Match 3521 (Self : Pattern_Matcher; 3522 Data : String; 3523 Data_First : Integer := -1; 3524 Data_Last : Positive := Positive'Last) return Natural 3525 is 3526 Matches : Match_Array (0 .. 0); 3527 3528 begin 3529 Match (Self, Data, Matches, Data_First, Data_Last); 3530 if Matches (0) = No_Match then 3531 return Data'First - 1; 3532 else 3533 return Matches (0).First; 3534 end if; 3535 end Match; 3536 3537 function Match 3538 (Self : Pattern_Matcher; 3539 Data : String; 3540 Data_First : Integer := -1; 3541 Data_Last : Positive := Positive'Last) return Boolean 3542 is 3543 Matches : Match_Array (0 .. 0); 3544 3545 begin 3546 Match (Self, Data, Matches, Data_First, Data_Last); 3547 return Matches (0).First >= Data'First; 3548 end Match; 3549 3550 procedure Match 3551 (Expression : String; 3552 Data : String; 3553 Matches : out Match_Array; 3554 Size : Program_Size := Auto_Size; 3555 Data_First : Integer := -1; 3556 Data_Last : Positive := Positive'Last) 3557 is 3558 PM : Pattern_Matcher (Size); 3559 Finalize_Size : Program_Size; 3560 pragma Unreferenced (Finalize_Size); 3561 begin 3562 if Size = 0 then 3563 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3564 else 3565 Compile (PM, Expression, Finalize_Size); 3566 Match (PM, Data, Matches, Data_First, Data_Last); 3567 end if; 3568 end Match; 3569 3570 ----------- 3571 -- Match -- 3572 ----------- 3573 3574 function Match 3575 (Expression : String; 3576 Data : String; 3577 Size : Program_Size := Auto_Size; 3578 Data_First : Integer := -1; 3579 Data_Last : Positive := Positive'Last) return Natural 3580 is 3581 PM : Pattern_Matcher (Size); 3582 Final_Size : Program_Size; 3583 pragma Unreferenced (Final_Size); 3584 begin 3585 if Size = 0 then 3586 return Match (Compile (Expression), Data, Data_First, Data_Last); 3587 else 3588 Compile (PM, Expression, Final_Size); 3589 return Match (PM, Data, Data_First, Data_Last); 3590 end if; 3591 end Match; 3592 3593 ----------- 3594 -- Match -- 3595 ----------- 3596 3597 function Match 3598 (Expression : String; 3599 Data : String; 3600 Size : Program_Size := Auto_Size; 3601 Data_First : Integer := -1; 3602 Data_Last : Positive := Positive'Last) return Boolean 3603 is 3604 Matches : Match_Array (0 .. 0); 3605 PM : Pattern_Matcher (Size); 3606 Final_Size : Program_Size; 3607 pragma Unreferenced (Final_Size); 3608 begin 3609 if Size = 0 then 3610 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3611 else 3612 Compile (PM, Expression, Final_Size); 3613 Match (PM, Data, Matches, Data_First, Data_Last); 3614 end if; 3615 3616 return Matches (0).First >= Data'First; 3617 end Match; 3618 3619 ------------- 3620 -- Operand -- 3621 ------------- 3622 3623 function Operand (P : Pointer) return Pointer is 3624 begin 3625 return P + Next_Pointer_Bytes; 3626 end Operand; 3627 3628 -------------- 3629 -- Optimize -- 3630 -------------- 3631 3632 procedure Optimize (Self : in out Pattern_Matcher) is 3633 Scan : Pointer; 3634 Program : Program_Data renames Self.Program; 3635 3636 begin 3637 -- Start with safe defaults (no optimization): 3638 -- * No known first character of match 3639 -- * Does not necessarily start at beginning of line 3640 -- * No string known that has to appear in data 3641 3642 Self.First := ASCII.NUL; 3643 Self.Anchored := False; 3644 Self.Must_Have := Program'Last + 1; 3645 Self.Must_Have_Length := 0; 3646 3647 Scan := Program_First; -- First instruction (can be anything) 3648 3649 if Program (Scan) = EXACT then 3650 Self.First := Program (String_Operand (Scan)); 3651 3652 elsif Program (Scan) = BOL 3653 or else Program (Scan) = SBOL 3654 or else Program (Scan) = MBOL 3655 then 3656 Self.Anchored := True; 3657 end if; 3658 end Optimize; 3659 3660 ----------------- 3661 -- Paren_Count -- 3662 ----------------- 3663 3664 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is 3665 begin 3666 return Regexp.Paren_Count; 3667 end Paren_Count; 3668 3669 ----------- 3670 -- Quote -- 3671 ----------- 3672 3673 function Quote (Str : String) return String is 3674 S : String (1 .. Str'Length * 2); 3675 Last : Natural := 0; 3676 3677 begin 3678 for J in Str'Range loop 3679 case Str (J) is 3680 when '^' | '$' | '|' | '*' | '+' | '?' | '{' | 3681 '}' | '[' | ']' | '(' | ')' | '\' | '.' => 3682 3683 S (Last + 1) := '\'; 3684 S (Last + 2) := Str (J); 3685 Last := Last + 2; 3686 3687 when others => 3688 S (Last + 1) := Str (J); 3689 Last := Last + 1; 3690 end case; 3691 end loop; 3692 3693 return S (1 .. Last); 3694 end Quote; 3695 3696 ------------------ 3697 -- Read_Natural -- 3698 ------------------ 3699 3700 function Read_Natural 3701 (Program : Program_Data; 3702 IP : Pointer) return Natural 3703 is 3704 begin 3705 return Character'Pos (Program (IP)) + 3706 256 * Character'Pos (Program (IP + 1)); 3707 end Read_Natural; 3708 3709 ----------------- 3710 -- Reset_Class -- 3711 ----------------- 3712 3713 procedure Reset_Class (Bitmap : out Character_Class) is 3714 begin 3715 Bitmap := (others => 0); 3716 end Reset_Class; 3717 3718 ------------------ 3719 -- Set_In_Class -- 3720 ------------------ 3721 3722 procedure Set_In_Class 3723 (Bitmap : in out Character_Class; 3724 C : Character) 3725 is 3726 Value : constant Class_Byte := Character'Pos (C); 3727 begin 3728 Bitmap (Value / 8) := Bitmap (Value / 8) 3729 or Bit_Conversion (Value mod 8); 3730 end Set_In_Class; 3731 3732 ------------------- 3733 -- String_Length -- 3734 ------------------- 3735 3736 function String_Length 3737 (Program : Program_Data; 3738 P : Pointer) return Program_Size 3739 is 3740 begin 3741 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); 3742 return Character'Pos (Program (P + Next_Pointer_Bytes)); 3743 end String_Length; 3744 3745 -------------------- 3746 -- String_Operand -- 3747 -------------------- 3748 3749 function String_Operand (P : Pointer) return Pointer is 3750 begin 3751 return P + 4; 3752 end String_Operand; 3753 3754end System.Regpat; 3755