1------------------------------------------------------------------------------ 2-- -- 3-- GNAT LIBRARY COMPONENTS -- 4-- -- 5-- G N A T . R E G P A T -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1986 by University of Toronto. -- 10-- Copyright (C) 1999-2020, AdaCore -- 11-- -- 12-- GNAT is free software; you can redistribute it and/or modify it under -- 13-- terms of the GNU General Public License as published by the Free Soft- -- 14-- ware Foundation; either version 3, or (at your option) any later ver- -- 15-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 16-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 17-- or FITNESS FOR A PARTICULAR PURPOSE. -- 18-- -- 19-- As a special exception under Section 7 of GPL version 3, you are granted -- 20-- additional permissions described in the GCC Runtime Library Exception, -- 21-- version 3.1, as published by the Free Software Foundation. -- 22-- -- 23-- You should have received a copy of the GNU General Public License and -- 24-- a copy of the GCC Runtime Library Exception along with this program; -- 25-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 26-- <http://www.gnu.org/licenses/>. -- 27-- -- 28-- GNAT was originally developed by the GNAT team at New York University. -- 29-- Extensive contributions were provided by Ada Core Technologies Inc. -- 30-- -- 31------------------------------------------------------------------------------ 32 33-- This is an altered Ada 95 version of the original V8 style regular 34-- expression library written in C by Henry Spencer. Apart from the 35-- translation to Ada, the interface has been considerably changed to 36-- use the Ada String type instead of C-style nul-terminated strings. 37 38-- Beware that some of this code is subtly aware of the way operator 39-- precedence is structured in regular expressions. Serious changes in 40-- regular-expression syntax might require a total rethink. 41 42with System.IO; use System.IO; 43with Ada.Characters.Handling; use Ada.Characters.Handling; 44with Ada.Unchecked_Conversion; 45 46package body System.Regpat is 47 48 Debug : constant Boolean := False; 49 -- Set to True to activate debug traces. This is normally set to constant 50 -- False to simply delete all the trace code. It is to be edited to True 51 -- for internal debugging of the package. 52 53 ---------------------------- 54 -- Implementation details -- 55 ---------------------------- 56 57 -- This is essentially a linear encoding of a nondeterministic 58 -- finite-state machine, also known as syntax charts or 59 -- "railroad normal form" in parsing technology. 60 61 -- Each node is an opcode plus a "next" pointer, possibly plus an 62 -- operand. "Next" pointers of all nodes except BRANCH implement 63 -- concatenation; a "next" pointer with a BRANCH on both ends of it 64 -- is connecting two alternatives. 65 66 -- The operand of some types of node is a literal string; for others, 67 -- it is a node leading into a sub-FSM. In particular, the operand of 68 -- a BRANCH node is the first node of the branch. 69 -- (NB this is *not* a tree structure: the tail of the branch connects 70 -- to the thing following the set of BRANCHes). 71 72 -- You can see the exact byte-compiled version by using the Dump 73 -- subprogram. However, here are a few examples: 74 75 -- (a|b): 1 : BRANCH (next at 9) 76 -- 4 : EXACT (next at 17) operand=a 77 -- 9 : BRANCH (next at 17) 78 -- 12 : EXACT (next at 17) operand=b 79 -- 17 : EOP (next at 0) 80 -- 81 -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} 82 -- 8 : OPEN 1 (next at 12) 83 -- 12 : EXACT (next at 18) operand=ab 84 -- 18 : CLOSE 1 (next at 22) 85 -- 22 : WHILEM (next at 0) 86 -- 25 : NOTHING (next at 28) 87 -- 28 : EOP (next at 0) 88 89 -- The opcodes are: 90 91 type Opcode is 92 93 -- Name Operand? Meaning 94 95 (EOP, -- no End of program 96 MINMOD, -- no Next operator is not greedy 97 98 -- Classes of characters 99 100 ANY, -- no Match any one character except newline 101 SANY, -- no Match any character, including new line 102 ANYOF, -- class Match any character in this class 103 EXACT, -- str Match this string exactly 104 EXACTF, -- str Match this string (case-folding is one) 105 NOTHING, -- no Match empty string 106 SPACE, -- no Match any whitespace character 107 NSPACE, -- no Match any non-whitespace character 108 DIGIT, -- no Match any numeric character 109 NDIGIT, -- no Match any non-numeric character 110 ALNUM, -- no Match any alphanumeric character 111 NALNUM, -- no Match any non-alphanumeric character 112 113 -- Branches 114 115 BRANCH, -- node Match this alternative, or the next 116 117 -- Simple loops (when the following node is one character in length) 118 119 STAR, -- node Match this simple thing 0 or more times 120 PLUS, -- node Match this simple thing 1 or more times 121 CURLY, -- 2num node Match this simple thing between n and m times. 122 123 -- Complex loops 124 125 CURLYX, -- 2num node Match this complex thing {n,m} times 126 -- The nums are coded on two characters each 127 128 WHILEM, -- no Do curly processing and see if rest matches 129 130 -- Matches after or before a word 131 132 BOL, -- no Match "" at beginning of line 133 MBOL, -- no Same, assuming multiline (match after \n) 134 SBOL, -- no Same, assuming single line (don't match at \n) 135 EOL, -- no Match "" at end of line 136 MEOL, -- no Same, assuming multiline (match before \n) 137 SEOL, -- no Same, assuming single line (don't match at \n) 138 139 BOUND, -- no Match "" at any word boundary 140 NBOUND, -- no Match "" at any word non-boundary 141 142 -- Parenthesis groups handling 143 144 REFF, -- num Match some already matched string, folded 145 OPEN, -- num Mark this point in input as start of #n 146 CLOSE); -- num Analogous to OPEN 147 148 for Opcode'Size use 8; 149 150 -- Opcode notes: 151 152 -- BRANCH 153 -- The set of branches constituting a single choice are hooked 154 -- together with their "next" pointers, since precedence prevents 155 -- anything being concatenated to any individual branch. The 156 -- "next" pointer of the last BRANCH in a choice points to the 157 -- thing following the whole choice. This is also where the 158 -- final "next" pointer of each individual branch points; each 159 -- branch starts with the operand node of a BRANCH node. 160 161 -- STAR,PLUS 162 -- '?', and complex '*' and '+', are implemented with CURLYX. 163 -- branches. Simple cases (one character per match) are implemented with 164 -- STAR and PLUS for speed and to minimize recursive plunges. 165 166 -- OPEN,CLOSE 167 -- ...are numbered at compile time. 168 169 -- EXACT, EXACTF 170 -- There are in fact two arguments, the first one is the length (minus 171 -- one of the string argument), coded on one character, the second 172 -- argument is the string itself, coded on length + 1 characters. 173 174 -- A node is one char of opcode followed by two chars of "next" pointer. 175 -- "Next" pointers are stored as two 8-bit pieces, high order first. The 176 -- value is a positive offset from the opcode of the node containing it. 177 -- An operand, if any, simply follows the node. (Note that much of the 178 -- code generation knows about this implicit relationship.) 179 180 -- Using two bytes for the "next" pointer is vast overkill for most 181 -- things, but allows patterns to get big without disasters. 182 183 Next_Pointer_Bytes : constant := 3; 184 -- Points after the "next pointer" data. An instruction is therefore: 185 -- 1 byte: instruction opcode 186 -- 2 bytes: pointer to next instruction 187 -- * bytes: optional data for the instruction 188 189 ----------------------- 190 -- Character classes -- 191 ----------------------- 192 -- This is the implementation for character classes ([...]) in the 193 -- syntax for regular expressions. Each character (0..256) has an 194 -- entry into the table. This makes for a very fast matching 195 -- algorithm. 196 197 type Class_Byte is mod 256; 198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; 199 200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; 201 Bit_Conversion : constant Bit_Conversion_Array := 202 (1, 2, 4, 8, 16, 32, 64, 128); 203 204 type Std_Class is (ANYOF_NONE, 205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] 206 ANYOF_NALNUM, 207 ANYOF_SPACE, -- Space class [ \t\n\r\f] 208 ANYOF_NSPACE, 209 ANYOF_DIGIT, -- Digit class [0-9] 210 ANYOF_NDIGIT, 211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] 212 ANYOF_NALNUMC, 213 ANYOF_ALPHA, -- Alpha class [a-zA-Z] 214 ANYOF_NALPHA, 215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127 216 ANYOF_NASCII, 217 ANYOF_CNTRL, -- Control class 218 ANYOF_NCNTRL, 219 ANYOF_GRAPH, -- Graphic class 220 ANYOF_NGRAPH, 221 ANYOF_LOWER, -- Lower case class [a-z] 222 ANYOF_NLOWER, 223 ANYOF_PRINT, -- printable class 224 ANYOF_NPRINT, 225 ANYOF_PUNCT, -- 226 ANYOF_NPUNCT, 227 ANYOF_UPPER, -- Upper case class [A-Z] 228 ANYOF_NUPPER, 229 ANYOF_XDIGIT, -- Hexadecimal digit 230 ANYOF_NXDIGIT 231 ); 232 233 procedure Set_In_Class 234 (Bitmap : in out Character_Class; 235 C : Character); 236 -- Set the entry to True for C in the class Bitmap 237 238 function Get_From_Class 239 (Bitmap : Character_Class; 240 C : Character) return Boolean; 241 -- Return True if the entry is set for C in the class Bitmap 242 243 procedure Reset_Class (Bitmap : out Character_Class); 244 -- Clear all the entries in the class Bitmap 245 246 pragma Inline (Set_In_Class); 247 pragma Inline (Get_From_Class); 248 pragma Inline (Reset_Class); 249 250 ----------------------- 251 -- Local Subprograms -- 252 ----------------------- 253 254 function "=" (Left : Character; Right : Opcode) return Boolean; 255 256 function Is_Alnum (C : Character) return Boolean; 257 -- Return True if C is an alphanum character or an underscore ('_') 258 259 function Is_White_Space (C : Character) return Boolean; 260 -- Return True if C is a whitespace character 261 262 function Is_Printable (C : Character) return Boolean; 263 -- Return True if C is a printable character 264 265 function Operand (P : Pointer) return Pointer; 266 -- Return a pointer to the first operand of the node at P 267 268 function String_Length 269 (Program : Program_Data; 270 P : Pointer) return Program_Size; 271 -- Return the length of the string argument of the node at P 272 273 function String_Operand (P : Pointer) return Pointer; 274 -- Return a pointer to the string argument of the node at P 275 276 procedure Bitmap_Operand 277 (Program : Program_Data; 278 P : Pointer; 279 Op : out Character_Class); 280 -- Return a pointer to the string argument of the node at P 281 282 function Get_Next 283 (Program : Program_Data; 284 IP : Pointer) return Pointer; 285 -- Dig the next instruction pointer out of a node 286 287 procedure Optimize (Self : in out Pattern_Matcher); 288 -- Optimize a Pattern_Matcher by noting certain special cases 289 290 function Read_Natural 291 (Program : Program_Data; 292 IP : Pointer) return Natural; 293 -- Return the 2-byte natural coded at position IP 294 295 -- All of the subprograms above are tiny and should be inlined 296 297 pragma Inline ("="); 298 pragma Inline (Is_Alnum); 299 pragma Inline (Is_White_Space); 300 pragma Inline (Get_Next); 301 pragma Inline (Operand); 302 pragma Inline (Read_Natural); 303 pragma Inline (String_Length); 304 pragma Inline (String_Operand); 305 306 type Expression_Flags is record 307 Has_Width, -- Known never to match null string 308 Simple, -- Simple enough to be STAR/PLUS operand 309 SP_Start : Boolean; -- Starts with * or + 310 end record; 311 312 Worst_Expression : constant Expression_Flags := (others => False); 313 -- Worst case 314 315 procedure Dump_Until 316 (Program : Program_Data; 317 Index : in out Pointer; 318 Till : Pointer; 319 Indent : Natural; 320 Do_Print : Boolean := True); 321 -- Dump the program until the node Till (not included) is met. Every line 322 -- is indented with Index spaces at the beginning Dumps till the end if 323 -- Till is 0. 324 325 procedure Dump_Operation 326 (Program : Program_Data; 327 Index : Pointer; 328 Indent : Natural); 329 -- Same as above, but only dumps a single operation, and compute its 330 -- indentation from the program. 331 332 --------- 333 -- "=" -- 334 --------- 335 336 function "=" (Left : Character; Right : Opcode) return Boolean is 337 begin 338 return Character'Pos (Left) = Opcode'Pos (Right); 339 end "="; 340 341 -------------------- 342 -- Bitmap_Operand -- 343 -------------------- 344 345 procedure Bitmap_Operand 346 (Program : Program_Data; 347 P : Pointer; 348 Op : out Character_Class) 349 is 350 function Convert is new Ada.Unchecked_Conversion 351 (Program_Data, Character_Class); 352 353 begin 354 Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); 355 end Bitmap_Operand; 356 357 ------------- 358 -- Compile -- 359 ------------- 360 361 procedure Compile 362 (Matcher : out Pattern_Matcher; 363 Expression : String; 364 Final_Code_Size : out Program_Size; 365 Flags : Regexp_Flags := No_Flags) 366 is 367 -- We can't allocate space until we know how big the compiled form 368 -- will be, but we can't compile it (and thus know how big it is) 369 -- until we've got a place to put the code. So we cheat: we compile 370 -- it twice, once with code generation turned off and size counting 371 -- turned on, and once "for real". 372 373 -- This also means that we don't allocate space until we are sure 374 -- that the thing really will compile successfully, and we never 375 -- have to move the code and thus invalidate pointers into it. 376 377 -- Beware that the optimization-preparation code in here knows 378 -- about some of the structure of the compiled regexp. 379 380 PM : Pattern_Matcher renames Matcher; 381 Program : Program_Data renames PM.Program; 382 383 Emit_Ptr : Pointer := Program_First; 384 385 Parse_Pos : Natural := Expression'First; -- Input-scan pointer 386 Parse_End : constant Natural := Expression'Last; 387 388 ---------------------------- 389 -- Subprograms for Create -- 390 ---------------------------- 391 392 procedure Emit (B : Character); 393 -- Output the Character B to the Program. If code-generation is 394 -- disabled, simply increments the program counter. 395 396 function Emit_Node (Op : Opcode) return Pointer; 397 -- If code-generation is enabled, Emit_Node outputs the 398 -- opcode Op and reserves space for a pointer to the next node. 399 -- Return value is the location of new opcode, i.e. old Emit_Ptr. 400 401 procedure Emit_Natural (IP : Pointer; N : Natural); 402 -- Split N on two characters at position IP 403 404 procedure Emit_Class (Bitmap : Character_Class); 405 -- Emits a character class 406 407 procedure Case_Emit (C : Character); 408 -- Emit C, after converting is to lower-case if the regular 409 -- expression is case insensitive. 410 411 procedure Parse 412 (Parenthesized : Boolean; 413 Capturing : Boolean; 414 Flags : out Expression_Flags; 415 IP : out Pointer); 416 -- Parse regular expression, i.e. main body or parenthesized thing. 417 -- Caller must absorb opening parenthesis. Capturing should be set to 418 -- True when we have an open parenthesis from which we want the user 419 -- to extra text. 420 421 procedure Parse_Branch 422 (Flags : out Expression_Flags; 423 First : Boolean; 424 IP : out Pointer); 425 -- Implements the concatenation operator and handles '|'. 426 -- First should be true if this is the first item of the alternative. 427 428 procedure Parse_Piece 429 (Expr_Flags : out Expression_Flags; 430 IP : out Pointer); 431 -- Parse something followed by possible [*+?] 432 433 procedure Parse_Atom 434 (Expr_Flags : out Expression_Flags; 435 IP : out Pointer); 436 -- Parse_Atom is the lowest level parse procedure. 437 -- 438 -- Optimization: Gobbles an entire sequence of ordinary characters so 439 -- that it can turn them into a single node, which is smaller to store 440 -- and faster to run. Backslashed characters are exceptions, each 441 -- becoming a separate node; the code is simpler that way and it's 442 -- not worth fixing. 443 444 procedure Insert_Operator 445 (Op : Opcode; 446 Operand : Pointer; 447 Greedy : Boolean := True); 448 -- Insert_Operator inserts an operator in front of an already-emitted 449 -- operand and relocates the operand. This applies to PLUS and STAR. 450 -- If Minmod is True, then the operator is non-greedy. 451 452 function Insert_Operator_Before 453 (Op : Opcode; 454 Operand : Pointer; 455 Greedy : Boolean; 456 Opsize : Pointer) return Pointer; 457 -- Insert an operator before Operand (and move the latter forward in the 458 -- program). Opsize is the size needed to represent the operator. This 459 -- returns the position at which the operator was inserted, and moves 460 -- Emit_Ptr after the new position of the operand. 461 462 procedure Insert_Curly_Operator 463 (Op : Opcode; 464 Min : Natural; 465 Max : Natural; 466 Operand : Pointer; 467 Greedy : Boolean := True); 468 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). 469 -- If Minmod is True, then the operator is non-greedy. 470 471 procedure Link_Tail (P, Val : Pointer); 472 -- Link_Tail sets the next-pointer at the end of a node chain 473 474 procedure Link_Operand_Tail (P, Val : Pointer); 475 -- Link_Tail on operand of first argument; noop if operand-less 476 477 procedure Fail (M : String); 478 pragma No_Return (Fail); 479 -- Fail with a diagnostic message, if possible 480 481 function Is_Curly_Operator (IP : Natural) return Boolean; 482 -- Return True if IP is looking at a '{' that is the beginning 483 -- of a curly operator, i.e. it matches {\d+,?\d*} 484 485 function Is_Mult (IP : Natural) return Boolean; 486 -- Return True if C is a regexp multiplier: '+', '*' or '?' 487 488 procedure Get_Curly_Arguments 489 (IP : Natural; 490 Min : out Natural; 491 Max : out Natural; 492 Greedy : out Boolean); 493 -- Parse the argument list for a curly operator. 494 -- It is assumed that IP is indeed pointing at a valid operator. 495 -- So what is IP and how come IP is not referenced in the body ??? 496 497 procedure Parse_Character_Class (IP : out Pointer); 498 -- Parse a character class. 499 -- The calling subprogram should consume the opening '[' before. 500 501 procedure Parse_Literal 502 (Expr_Flags : out Expression_Flags; 503 IP : out Pointer); 504 -- Parse_Literal encodes a string of characters to be matched exactly 505 506 function Parse_Posix_Character_Class return Std_Class; 507 -- Parse a posix character class, like [:alpha:] or [:^alpha:]. 508 -- The caller is supposed to absorb the opening [. 509 510 pragma Inline (Is_Mult); 511 pragma Inline (Emit_Natural); 512 pragma Inline (Parse_Character_Class); -- since used only once 513 514 --------------- 515 -- Case_Emit -- 516 --------------- 517 518 procedure Case_Emit (C : Character) is 519 begin 520 if (Flags and Case_Insensitive) /= 0 then 521 Emit (To_Lower (C)); 522 523 else 524 -- Dump current character 525 526 Emit (C); 527 end if; 528 end Case_Emit; 529 530 ---------- 531 -- Emit -- 532 ---------- 533 534 procedure Emit (B : Character) is 535 begin 536 if Emit_Ptr <= PM.Size then 537 Program (Emit_Ptr) := B; 538 end if; 539 540 Emit_Ptr := Emit_Ptr + 1; 541 end Emit; 542 543 ---------------- 544 -- Emit_Class -- 545 ---------------- 546 547 procedure Emit_Class (Bitmap : Character_Class) is 548 subtype Program31 is Program_Data (0 .. 31); 549 550 function Convert is new Ada.Unchecked_Conversion 551 (Character_Class, Program31); 552 553 begin 554 -- What is the mysterious constant 31 here??? Can't it be expressed 555 -- symbolically (size of integer - 1 or some such???). In any case 556 -- it should be declared as a constant (and referenced presumably 557 -- as this constant + 1 below. 558 559 if Emit_Ptr + 31 <= PM.Size then 560 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); 561 end if; 562 563 Emit_Ptr := Emit_Ptr + 32; 564 end Emit_Class; 565 566 ------------------ 567 -- Emit_Natural -- 568 ------------------ 569 570 procedure Emit_Natural (IP : Pointer; N : Natural) is 571 begin 572 if IP + 1 <= PM.Size then 573 Program (IP + 1) := Character'Val (N / 256); 574 Program (IP) := Character'Val (N mod 256); 575 end if; 576 end Emit_Natural; 577 578 --------------- 579 -- Emit_Node -- 580 --------------- 581 582 function Emit_Node (Op : Opcode) return Pointer is 583 Result : constant Pointer := Emit_Ptr; 584 585 begin 586 if Emit_Ptr + 2 <= PM.Size then 587 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); 588 Program (Emit_Ptr + 1) := ASCII.NUL; 589 Program (Emit_Ptr + 2) := ASCII.NUL; 590 end if; 591 592 Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; 593 return Result; 594 end Emit_Node; 595 596 ---------- 597 -- Fail -- 598 ---------- 599 600 procedure Fail (M : String) is 601 begin 602 raise Expression_Error with M; 603 end Fail; 604 605 ------------------------- 606 -- Get_Curly_Arguments -- 607 ------------------------- 608 609 procedure Get_Curly_Arguments 610 (IP : Natural; 611 Min : out Natural; 612 Max : out Natural; 613 Greedy : out Boolean) 614 is 615 pragma Unreferenced (IP); 616 617 Save_Pos : Natural := Parse_Pos + 1; 618 619 begin 620 Min := 0; 621 Max := Max_Curly_Repeat; 622 623 while Expression (Parse_Pos) /= '}' 624 and then Expression (Parse_Pos) /= ',' 625 loop 626 Parse_Pos := Parse_Pos + 1; 627 end loop; 628 629 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 630 631 if Expression (Parse_Pos) = ',' then 632 Save_Pos := Parse_Pos + 1; 633 while Expression (Parse_Pos) /= '}' loop 634 Parse_Pos := Parse_Pos + 1; 635 end loop; 636 637 if Save_Pos /= Parse_Pos then 638 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 639 end if; 640 641 else 642 Max := Min; 643 end if; 644 645 if Parse_Pos < Expression'Last 646 and then Expression (Parse_Pos + 1) = '?' 647 then 648 Greedy := False; 649 Parse_Pos := Parse_Pos + 1; 650 651 else 652 Greedy := True; 653 end if; 654 end Get_Curly_Arguments; 655 656 --------------------------- 657 -- Insert_Curly_Operator -- 658 --------------------------- 659 660 procedure Insert_Curly_Operator 661 (Op : Opcode; 662 Min : Natural; 663 Max : Natural; 664 Operand : Pointer; 665 Greedy : Boolean := True) 666 is 667 Old : Pointer; 668 begin 669 Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); 670 Emit_Natural (Old + Next_Pointer_Bytes, Min); 671 Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); 672 end Insert_Curly_Operator; 673 674 ---------------------------- 675 -- Insert_Operator_Before -- 676 ---------------------------- 677 678 function Insert_Operator_Before 679 (Op : Opcode; 680 Operand : Pointer; 681 Greedy : Boolean; 682 Opsize : Pointer) return Pointer 683 is 684 Dest : constant Pointer := Emit_Ptr; 685 Old : Pointer; 686 Size : Pointer := Opsize; 687 688 begin 689 -- If not greedy, we have to emit another opcode first 690 691 if not Greedy then 692 Size := Size + Next_Pointer_Bytes; 693 end if; 694 695 -- Move the operand in the byte-compilation, so that we can insert 696 -- the operator before it. 697 698 if Emit_Ptr + Size <= PM.Size then 699 Program (Operand + Size .. Emit_Ptr + Size) := 700 Program (Operand .. Emit_Ptr); 701 end if; 702 703 -- Insert the operator at the position previously occupied by the 704 -- operand. 705 706 Emit_Ptr := Operand; 707 708 if not Greedy then 709 Old := Emit_Node (MINMOD); 710 Link_Tail (Old, Old + Next_Pointer_Bytes); 711 end if; 712 713 Old := Emit_Node (Op); 714 Emit_Ptr := Dest + Size; 715 return Old; 716 end Insert_Operator_Before; 717 718 --------------------- 719 -- Insert_Operator -- 720 --------------------- 721 722 procedure Insert_Operator 723 (Op : Opcode; 724 Operand : Pointer; 725 Greedy : Boolean := True) 726 is 727 Discard : Pointer; 728 pragma Warnings (Off, Discard); 729 begin 730 Discard := Insert_Operator_Before 731 (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); 732 end Insert_Operator; 733 734 ----------------------- 735 -- Is_Curly_Operator -- 736 ----------------------- 737 738 function Is_Curly_Operator (IP : Natural) return Boolean is 739 Scan : Natural := IP; 740 741 begin 742 if Expression (Scan) /= '{' 743 or else Scan + 2 > Expression'Last 744 or else not Is_Digit (Expression (Scan + 1)) 745 then 746 return False; 747 end if; 748 749 Scan := Scan + 1; 750 751 -- The first digit 752 753 loop 754 Scan := Scan + 1; 755 756 if Scan > Expression'Last then 757 return False; 758 end if; 759 760 exit when not Is_Digit (Expression (Scan)); 761 end loop; 762 763 if Expression (Scan) = ',' then 764 loop 765 Scan := Scan + 1; 766 767 if Scan > Expression'Last then 768 return False; 769 end if; 770 771 exit when not Is_Digit (Expression (Scan)); 772 end loop; 773 end if; 774 775 return Expression (Scan) = '}'; 776 end Is_Curly_Operator; 777 778 ------------- 779 -- Is_Mult -- 780 ------------- 781 782 function Is_Mult (IP : Natural) return Boolean is 783 C : constant Character := Expression (IP); 784 785 begin 786 return C = '*' 787 or else C = '+' 788 or else C = '?' 789 or else (C = '{' and then Is_Curly_Operator (IP)); 790 end Is_Mult; 791 792 ----------------------- 793 -- Link_Operand_Tail -- 794 ----------------------- 795 796 procedure Link_Operand_Tail (P, Val : Pointer) is 797 begin 798 if P <= PM.Size and then Program (P) = BRANCH then 799 Link_Tail (Operand (P), Val); 800 end if; 801 end Link_Operand_Tail; 802 803 --------------- 804 -- Link_Tail -- 805 --------------- 806 807 procedure Link_Tail (P, Val : Pointer) is 808 Scan : Pointer; 809 Temp : Pointer; 810 Offset : Pointer; 811 812 begin 813 -- Find last node (the size of the pattern matcher might be too 814 -- small, so don't try to read past its end). 815 816 Scan := P; 817 while Scan + Next_Pointer_Bytes <= PM.Size loop 818 Temp := Get_Next (Program, Scan); 819 exit when Temp = Scan; 820 Scan := Temp; 821 end loop; 822 823 Offset := Val - Scan; 824 825 Emit_Natural (Scan + 1, Natural (Offset)); 826 end Link_Tail; 827 828 ----------- 829 -- Parse -- 830 ----------- 831 832 -- Combining parenthesis handling with the base level of regular 833 -- expression is a trifle forced, but the need to tie the tails of the 834 -- the branches to what follows makes it hard to avoid. 835 836 procedure Parse 837 (Parenthesized : Boolean; 838 Capturing : Boolean; 839 Flags : out Expression_Flags; 840 IP : out Pointer) 841 is 842 E : String renames Expression; 843 Br, Br2 : Pointer; 844 Ender : Pointer; 845 Par_No : Natural; 846 New_Flags : Expression_Flags; 847 Have_Branch : Boolean := False; 848 849 begin 850 Flags := (Has_Width => True, others => False); -- Tentatively 851 852 -- Make an OPEN node, if parenthesized 853 854 if Parenthesized and then Capturing then 855 if Matcher.Paren_Count > Max_Paren_Count then 856 Fail ("too many ()"); 857 end if; 858 859 Par_No := Matcher.Paren_Count + 1; 860 Matcher.Paren_Count := Matcher.Paren_Count + 1; 861 IP := Emit_Node (OPEN); 862 Emit (Character'Val (Par_No)); 863 else 864 IP := 0; 865 Par_No := 0; 866 end if; 867 868 -- Pick up the branches, linking them together 869 870 Parse_Branch (New_Flags, True, Br); 871 872 if Br = 0 then 873 IP := 0; 874 return; 875 end if; 876 877 if Parse_Pos <= Parse_End 878 and then E (Parse_Pos) = '|' 879 then 880 Insert_Operator (BRANCH, Br); 881 Have_Branch := True; 882 end if; 883 884 if IP /= 0 then 885 Link_Tail (IP, Br); -- OPEN -> first 886 else 887 IP := Br; 888 end if; 889 890 if not New_Flags.Has_Width then 891 Flags.Has_Width := False; 892 end if; 893 894 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 895 896 while Parse_Pos <= Parse_End 897 and then (E (Parse_Pos) = '|') 898 loop 899 Parse_Pos := Parse_Pos + 1; 900 Parse_Branch (New_Flags, False, Br); 901 902 if Br = 0 then 903 IP := 0; 904 return; 905 end if; 906 907 Link_Tail (IP, Br); -- BRANCH -> BRANCH 908 909 if not New_Flags.Has_Width then 910 Flags.Has_Width := False; 911 end if; 912 913 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 914 end loop; 915 916 -- Make a closing node, and hook it on the end 917 918 if Parenthesized then 919 if Capturing then 920 Ender := Emit_Node (CLOSE); 921 Emit (Character'Val (Par_No)); 922 Link_Tail (IP, Ender); 923 924 else 925 -- Need to keep looking after the closing parenthesis 926 Ender := Emit_Ptr; 927 end if; 928 929 else 930 Ender := Emit_Node (EOP); 931 Link_Tail (IP, Ender); 932 end if; 933 934 if Have_Branch and then Emit_Ptr <= PM.Size + 1 then 935 936 -- Hook the tails of the branches to the closing node 937 938 Br := IP; 939 loop 940 Link_Operand_Tail (Br, Ender); 941 Br2 := Get_Next (Program, Br); 942 exit when Br2 = Br; 943 Br := Br2; 944 end loop; 945 end if; 946 947 -- Check for proper termination 948 949 if Parenthesized then 950 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then 951 Fail ("unmatched ()"); 952 end if; 953 954 Parse_Pos := Parse_Pos + 1; 955 956 elsif Parse_Pos <= Parse_End then 957 if E (Parse_Pos) = ')' then 958 Fail ("unmatched ')'"); 959 else 960 Fail ("junk on end"); -- "Can't happen" 961 end if; 962 end if; 963 end Parse; 964 965 ---------------- 966 -- Parse_Atom -- 967 ---------------- 968 969 procedure Parse_Atom 970 (Expr_Flags : out Expression_Flags; 971 IP : out Pointer) 972 is 973 C : Character; 974 975 begin 976 -- Tentatively set worst expression case 977 978 Expr_Flags := Worst_Expression; 979 980 C := Expression (Parse_Pos); 981 Parse_Pos := Parse_Pos + 1; 982 983 case (C) is 984 when '^' => 985 IP := 986 Emit_Node 987 (if (Flags and Multiple_Lines) /= 0 then MBOL 988 elsif (Flags and Single_Line) /= 0 then SBOL 989 else BOL); 990 991 when '$' => 992 IP := 993 Emit_Node 994 (if (Flags and Multiple_Lines) /= 0 then MEOL 995 elsif (Flags and Single_Line) /= 0 then SEOL 996 else EOL); 997 998 when '.' => 999 IP := 1000 Emit_Node 1001 (if (Flags and Single_Line) /= 0 then SANY else ANY); 1002 1003 Expr_Flags.Has_Width := True; 1004 Expr_Flags.Simple := True; 1005 1006 when '[' => 1007 Parse_Character_Class (IP); 1008 Expr_Flags.Has_Width := True; 1009 Expr_Flags.Simple := True; 1010 1011 when '(' => 1012 declare 1013 New_Flags : Expression_Flags; 1014 1015 begin 1016 if Parse_Pos <= Parse_End - 1 1017 and then Expression (Parse_Pos) = '?' 1018 and then Expression (Parse_Pos + 1) = ':' 1019 then 1020 Parse_Pos := Parse_Pos + 2; 1021 1022 -- Non-capturing parenthesis 1023 1024 Parse (True, False, New_Flags, IP); 1025 1026 else 1027 -- Capturing parenthesis 1028 1029 Parse (True, True, New_Flags, IP); 1030 Expr_Flags.Has_Width := 1031 Expr_Flags.Has_Width or else New_Flags.Has_Width; 1032 Expr_Flags.SP_Start := 1033 Expr_Flags.SP_Start or else New_Flags.SP_Start; 1034 if IP = 0 then 1035 return; 1036 end if; 1037 end if; 1038 end; 1039 1040 when '|' | ASCII.LF | ')' => 1041 Fail ("internal urp"); -- Supposed to be caught earlier 1042 1043 when '?' | '+' | '*' => 1044 Fail (C & " follows nothing"); 1045 1046 when '{' => 1047 if Is_Curly_Operator (Parse_Pos - 1) then 1048 Fail (C & " follows nothing"); 1049 else 1050 Parse_Literal (Expr_Flags, IP); 1051 end if; 1052 1053 when '\' => 1054 if Parse_Pos > Parse_End then 1055 Fail ("trailing \"); 1056 end if; 1057 1058 Parse_Pos := Parse_Pos + 1; 1059 1060 case Expression (Parse_Pos - 1) is 1061 when 'b' => 1062 IP := Emit_Node (BOUND); 1063 1064 when 'B' => 1065 IP := Emit_Node (NBOUND); 1066 1067 when 's' => 1068 IP := Emit_Node (SPACE); 1069 Expr_Flags.Simple := True; 1070 Expr_Flags.Has_Width := True; 1071 1072 when 'S' => 1073 IP := Emit_Node (NSPACE); 1074 Expr_Flags.Simple := True; 1075 Expr_Flags.Has_Width := True; 1076 1077 when 'd' => 1078 IP := Emit_Node (DIGIT); 1079 Expr_Flags.Simple := True; 1080 Expr_Flags.Has_Width := True; 1081 1082 when 'D' => 1083 IP := Emit_Node (NDIGIT); 1084 Expr_Flags.Simple := True; 1085 Expr_Flags.Has_Width := True; 1086 1087 when 'w' => 1088 IP := Emit_Node (ALNUM); 1089 Expr_Flags.Simple := True; 1090 Expr_Flags.Has_Width := True; 1091 1092 when 'W' => 1093 IP := Emit_Node (NALNUM); 1094 Expr_Flags.Simple := True; 1095 Expr_Flags.Has_Width := True; 1096 1097 when 'A' => 1098 IP := Emit_Node (SBOL); 1099 1100 when 'G' => 1101 IP := Emit_Node (SEOL); 1102 1103 when '0' .. '9' => 1104 IP := Emit_Node (REFF); 1105 1106 declare 1107 Save : constant Natural := Parse_Pos - 1; 1108 1109 begin 1110 while Parse_Pos <= Expression'Last 1111 and then Is_Digit (Expression (Parse_Pos)) 1112 loop 1113 Parse_Pos := Parse_Pos + 1; 1114 end loop; 1115 1116 Emit (Character'Val (Natural'Value 1117 (Expression (Save .. Parse_Pos - 1)))); 1118 end; 1119 1120 when others => 1121 Parse_Pos := Parse_Pos - 1; 1122 Parse_Literal (Expr_Flags, IP); 1123 end case; 1124 1125 when others => 1126 Parse_Literal (Expr_Flags, IP); 1127 end case; 1128 end Parse_Atom; 1129 1130 ------------------ 1131 -- Parse_Branch -- 1132 ------------------ 1133 1134 procedure Parse_Branch 1135 (Flags : out Expression_Flags; 1136 First : Boolean; 1137 IP : out Pointer) 1138 is 1139 E : String renames Expression; 1140 Chain : Pointer; 1141 Last : Pointer; 1142 New_Flags : Expression_Flags; 1143 1144 Discard : Pointer; 1145 pragma Warnings (Off, Discard); 1146 1147 begin 1148 Flags := Worst_Expression; -- Tentatively 1149 IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); 1150 1151 Chain := 0; 1152 while Parse_Pos <= Parse_End 1153 and then E (Parse_Pos) /= ')' 1154 and then E (Parse_Pos) /= ASCII.LF 1155 and then E (Parse_Pos) /= '|' 1156 loop 1157 Parse_Piece (New_Flags, Last); 1158 1159 if Last = 0 then 1160 IP := 0; 1161 return; 1162 end if; 1163 1164 Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; 1165 1166 if Chain = 0 then -- First piece 1167 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 1168 else 1169 Link_Tail (Chain, Last); 1170 end if; 1171 1172 Chain := Last; 1173 end loop; 1174 1175 -- Case where loop ran zero CURLY 1176 1177 if Chain = 0 then 1178 Discard := Emit_Node (NOTHING); 1179 end if; 1180 end Parse_Branch; 1181 1182 --------------------------- 1183 -- Parse_Character_Class -- 1184 --------------------------- 1185 1186 procedure Parse_Character_Class (IP : out Pointer) is 1187 Bitmap : Character_Class; 1188 Invert : Boolean := False; 1189 In_Range : Boolean := False; 1190 Named_Class : Std_Class := ANYOF_NONE; 1191 Value : Character; 1192 Last_Value : Character := ASCII.NUL; 1193 1194 begin 1195 Reset_Class (Bitmap); 1196 1197 -- Do we have an invert character class ? 1198 1199 if Parse_Pos <= Parse_End 1200 and then Expression (Parse_Pos) = '^' 1201 then 1202 Invert := True; 1203 Parse_Pos := Parse_Pos + 1; 1204 end if; 1205 1206 -- First character can be ] or - without closing the class 1207 1208 if Parse_Pos <= Parse_End 1209 and then (Expression (Parse_Pos) = ']' 1210 or else Expression (Parse_Pos) = '-') 1211 then 1212 Set_In_Class (Bitmap, Expression (Parse_Pos)); 1213 Parse_Pos := Parse_Pos + 1; 1214 end if; 1215 1216 -- While we don't have the end of the class 1217 1218 while Parse_Pos <= Parse_End 1219 and then Expression (Parse_Pos) /= ']' 1220 loop 1221 Named_Class := ANYOF_NONE; 1222 Value := Expression (Parse_Pos); 1223 Parse_Pos := Parse_Pos + 1; 1224 1225 -- Do we have a Posix character class 1226 if Value = '[' then 1227 Named_Class := Parse_Posix_Character_Class; 1228 1229 elsif Value = '\' then 1230 if Parse_Pos = Parse_End then 1231 Fail ("Trailing \"); 1232 end if; 1233 Value := Expression (Parse_Pos); 1234 Parse_Pos := Parse_Pos + 1; 1235 1236 case Value is 1237 when 'w' => Named_Class := ANYOF_ALNUM; 1238 when 'W' => Named_Class := ANYOF_NALNUM; 1239 when 's' => Named_Class := ANYOF_SPACE; 1240 when 'S' => Named_Class := ANYOF_NSPACE; 1241 when 'd' => Named_Class := ANYOF_DIGIT; 1242 when 'D' => Named_Class := ANYOF_NDIGIT; 1243 when 'n' => Value := ASCII.LF; 1244 when 'r' => Value := ASCII.CR; 1245 when 't' => Value := ASCII.HT; 1246 when 'f' => Value := ASCII.FF; 1247 when 'e' => Value := ASCII.ESC; 1248 when 'a' => Value := ASCII.BEL; 1249 1250 -- when 'x' => ??? hexadecimal value 1251 -- when 'c' => ??? control character 1252 -- when '0'..'9' => ??? octal character 1253 1254 when others => null; 1255 end case; 1256 end if; 1257 1258 -- Do we have a character class? 1259 1260 if Named_Class /= ANYOF_NONE then 1261 1262 -- A range like 'a-\d' or 'a-[:digit:] is not a range 1263 1264 if In_Range then 1265 Set_In_Class (Bitmap, Last_Value); 1266 Set_In_Class (Bitmap, '-'); 1267 In_Range := False; 1268 end if; 1269 1270 -- Expand the range 1271 1272 case Named_Class is 1273 when ANYOF_NONE => null; 1274 1275 when ANYOF_ALNUM | ANYOF_ALNUMC => 1276 for Value in Class_Byte'Range loop 1277 if Is_Alnum (Character'Val (Value)) then 1278 Set_In_Class (Bitmap, Character'Val (Value)); 1279 end if; 1280 end loop; 1281 1282 when ANYOF_NALNUM | ANYOF_NALNUMC => 1283 for Value in Class_Byte'Range loop 1284 if not Is_Alnum (Character'Val (Value)) then 1285 Set_In_Class (Bitmap, Character'Val (Value)); 1286 end if; 1287 end loop; 1288 1289 when ANYOF_SPACE => 1290 for Value in Class_Byte'Range loop 1291 if Is_White_Space (Character'Val (Value)) then 1292 Set_In_Class (Bitmap, Character'Val (Value)); 1293 end if; 1294 end loop; 1295 1296 when ANYOF_NSPACE => 1297 for Value in Class_Byte'Range loop 1298 if not Is_White_Space (Character'Val (Value)) then 1299 Set_In_Class (Bitmap, Character'Val (Value)); 1300 end if; 1301 end loop; 1302 1303 when ANYOF_DIGIT => 1304 for Value in Class_Byte'Range loop 1305 if Is_Digit (Character'Val (Value)) then 1306 Set_In_Class (Bitmap, Character'Val (Value)); 1307 end if; 1308 end loop; 1309 1310 when ANYOF_NDIGIT => 1311 for Value in Class_Byte'Range loop 1312 if not Is_Digit (Character'Val (Value)) then 1313 Set_In_Class (Bitmap, Character'Val (Value)); 1314 end if; 1315 end loop; 1316 1317 when ANYOF_ALPHA => 1318 for Value in Class_Byte'Range loop 1319 if Is_Letter (Character'Val (Value)) then 1320 Set_In_Class (Bitmap, Character'Val (Value)); 1321 end if; 1322 end loop; 1323 1324 when ANYOF_NALPHA => 1325 for Value in Class_Byte'Range loop 1326 if not Is_Letter (Character'Val (Value)) then 1327 Set_In_Class (Bitmap, Character'Val (Value)); 1328 end if; 1329 end loop; 1330 1331 when ANYOF_ASCII => 1332 for Value in 0 .. 127 loop 1333 Set_In_Class (Bitmap, Character'Val (Value)); 1334 end loop; 1335 1336 when ANYOF_NASCII => 1337 for Value in 128 .. 255 loop 1338 Set_In_Class (Bitmap, Character'Val (Value)); 1339 end loop; 1340 1341 when ANYOF_CNTRL => 1342 for Value in Class_Byte'Range loop 1343 if Is_Control (Character'Val (Value)) then 1344 Set_In_Class (Bitmap, Character'Val (Value)); 1345 end if; 1346 end loop; 1347 1348 when ANYOF_NCNTRL => 1349 for Value in Class_Byte'Range loop 1350 if not Is_Control (Character'Val (Value)) then 1351 Set_In_Class (Bitmap, Character'Val (Value)); 1352 end if; 1353 end loop; 1354 1355 when ANYOF_GRAPH => 1356 for Value in Class_Byte'Range loop 1357 if Is_Graphic (Character'Val (Value)) then 1358 Set_In_Class (Bitmap, Character'Val (Value)); 1359 end if; 1360 end loop; 1361 1362 when ANYOF_NGRAPH => 1363 for Value in Class_Byte'Range loop 1364 if not Is_Graphic (Character'Val (Value)) then 1365 Set_In_Class (Bitmap, Character'Val (Value)); 1366 end if; 1367 end loop; 1368 1369 when ANYOF_LOWER => 1370 for Value in Class_Byte'Range loop 1371 if Is_Lower (Character'Val (Value)) then 1372 Set_In_Class (Bitmap, Character'Val (Value)); 1373 end if; 1374 end loop; 1375 1376 when ANYOF_NLOWER => 1377 for Value in Class_Byte'Range loop 1378 if not Is_Lower (Character'Val (Value)) then 1379 Set_In_Class (Bitmap, Character'Val (Value)); 1380 end if; 1381 end loop; 1382 1383 when ANYOF_PRINT => 1384 for Value in Class_Byte'Range loop 1385 if Is_Printable (Character'Val (Value)) then 1386 Set_In_Class (Bitmap, Character'Val (Value)); 1387 end if; 1388 end loop; 1389 1390 when ANYOF_NPRINT => 1391 for Value in Class_Byte'Range loop 1392 if not Is_Printable (Character'Val (Value)) then 1393 Set_In_Class (Bitmap, Character'Val (Value)); 1394 end if; 1395 end loop; 1396 1397 when ANYOF_PUNCT => 1398 for Value in Class_Byte'Range loop 1399 if Is_Printable (Character'Val (Value)) 1400 and then not Is_White_Space (Character'Val (Value)) 1401 and then not Is_Alnum (Character'Val (Value)) 1402 then 1403 Set_In_Class (Bitmap, Character'Val (Value)); 1404 end if; 1405 end loop; 1406 1407 when ANYOF_NPUNCT => 1408 for Value in Class_Byte'Range loop 1409 if not Is_Printable (Character'Val (Value)) 1410 or else Is_White_Space (Character'Val (Value)) 1411 or else Is_Alnum (Character'Val (Value)) 1412 then 1413 Set_In_Class (Bitmap, Character'Val (Value)); 1414 end if; 1415 end loop; 1416 1417 when ANYOF_UPPER => 1418 for Value in Class_Byte'Range loop 1419 if Is_Upper (Character'Val (Value)) then 1420 Set_In_Class (Bitmap, Character'Val (Value)); 1421 end if; 1422 end loop; 1423 1424 when ANYOF_NUPPER => 1425 for Value in Class_Byte'Range loop 1426 if not Is_Upper (Character'Val (Value)) then 1427 Set_In_Class (Bitmap, Character'Val (Value)); 1428 end if; 1429 end loop; 1430 1431 when ANYOF_XDIGIT => 1432 for Value in Class_Byte'Range loop 1433 if Is_Hexadecimal_Digit (Character'Val (Value)) then 1434 Set_In_Class (Bitmap, Character'Val (Value)); 1435 end if; 1436 end loop; 1437 1438 when ANYOF_NXDIGIT => 1439 for Value in Class_Byte'Range loop 1440 if not Is_Hexadecimal_Digit 1441 (Character'Val (Value)) 1442 then 1443 Set_In_Class (Bitmap, Character'Val (Value)); 1444 end if; 1445 end loop; 1446 1447 end case; 1448 1449 -- Not a character range 1450 1451 elsif not In_Range then 1452 Last_Value := Value; 1453 1454 if Parse_Pos > Expression'Last then 1455 Fail ("Empty character class []"); 1456 end if; 1457 1458 if Expression (Parse_Pos) = '-' 1459 and then Parse_Pos < Parse_End 1460 and then Expression (Parse_Pos + 1) /= ']' 1461 then 1462 Parse_Pos := Parse_Pos + 1; 1463 In_Range := True; 1464 else 1465 Set_In_Class (Bitmap, Value); 1466 end if; 1467 1468 -- Else in a character range 1469 1470 else 1471 if Last_Value > Value then 1472 Fail ("Invalid Range [" & Last_Value'Img 1473 & "-" & Value'Img & "]"); 1474 end if; 1475 1476 while Last_Value <= Value loop 1477 Set_In_Class (Bitmap, Last_Value); 1478 Last_Value := Character'Succ (Last_Value); 1479 end loop; 1480 1481 In_Range := False; 1482 1483 end if; 1484 1485 end loop; 1486 1487 -- Optimize case-insensitive ranges (put the upper case or lower 1488 -- case character into the bitmap) 1489 1490 if (Flags and Case_Insensitive) /= 0 then 1491 for C in Character'Range loop 1492 if Get_From_Class (Bitmap, C) then 1493 Set_In_Class (Bitmap, To_Lower (C)); 1494 Set_In_Class (Bitmap, To_Upper (C)); 1495 end if; 1496 end loop; 1497 end if; 1498 1499 -- Optimize inverted classes 1500 1501 if Invert then 1502 for J in Bitmap'Range loop 1503 Bitmap (J) := not Bitmap (J); 1504 end loop; 1505 end if; 1506 1507 Parse_Pos := Parse_Pos + 1; 1508 1509 -- Emit the class 1510 1511 IP := Emit_Node (ANYOF); 1512 Emit_Class (Bitmap); 1513 end Parse_Character_Class; 1514 1515 ------------------- 1516 -- Parse_Literal -- 1517 ------------------- 1518 1519 -- This is a bit tricky due to quoted chars and due to 1520 -- the multiplier characters '*', '+', and '?' that 1521 -- take the SINGLE char previous as their operand. 1522 1523 -- On entry, the character at Parse_Pos - 1 is going to go 1524 -- into the string, no matter what it is. It could be 1525 -- following a \ if Parse_Atom was entered from the '\' case. 1526 1527 -- Basic idea is to pick up a good char in C and examine 1528 -- the next char. If Is_Mult (C) then twiddle, if it's a \ 1529 -- then frozzle and if it's another magic char then push C and 1530 -- terminate the string. If none of the above, push C on the 1531 -- string and go around again. 1532 1533 -- Start_Pos is used to remember where "the current character" 1534 -- starts in the string, if due to an Is_Mult we need to back 1535 -- up and put the current char in a separate 1-character string. 1536 -- When Start_Pos is 0, C is the only char in the string; 1537 -- this is used in Is_Mult handling, and in setting the SIMPLE 1538 -- flag at the end. 1539 1540 procedure Parse_Literal 1541 (Expr_Flags : out Expression_Flags; 1542 IP : out Pointer) 1543 is 1544 Start_Pos : Natural := 0; 1545 C : Character; 1546 Length_Ptr : Pointer; 1547 1548 Has_Special_Operator : Boolean := False; 1549 1550 begin 1551 Expr_Flags := Worst_Expression; -- Ensure Expr_Flags is initialized 1552 Parse_Pos := Parse_Pos - 1; -- Look at current character 1553 1554 IP := 1555 Emit_Node 1556 (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); 1557 1558 Length_Ptr := Emit_Ptr; 1559 Emit_Ptr := String_Operand (IP); 1560 1561 Parse_Loop : 1562 loop 1563 C := Expression (Parse_Pos); -- Get current character 1564 1565 case C is 1566 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => 1567 1568 if Start_Pos = 0 then 1569 Start_Pos := Parse_Pos; 1570 Emit (C); -- First character is always emitted 1571 else 1572 exit Parse_Loop; -- Else we are done 1573 end if; 1574 1575 when '?' | '+' | '*' | '{' => 1576 1577 if Start_Pos = 0 then 1578 Start_Pos := Parse_Pos; 1579 Emit (C); -- First character is always emitted 1580 1581 -- Are we looking at an operator, or is this 1582 -- simply a normal character ? 1583 1584 elsif not Is_Mult (Parse_Pos) then 1585 Start_Pos := Parse_Pos; 1586 Case_Emit (C); 1587 1588 else 1589 -- We've got something like "abc?d". Mark this as a 1590 -- special case. What we want to emit is a first 1591 -- constant string for "ab", then one for "c" that will 1592 -- ultimately be transformed with a CURLY operator, A 1593 -- special case has to be handled for "a?", since there 1594 -- is no initial string to emit. 1595 1596 Has_Special_Operator := True; 1597 exit Parse_Loop; 1598 end if; 1599 1600 when '\' => 1601 Start_Pos := Parse_Pos; 1602 1603 if Parse_Pos = Parse_End then 1604 Fail ("Trailing \"); 1605 1606 else 1607 case Expression (Parse_Pos + 1) is 1608 when 'b' | 'B' | 's' | 'S' | 'd' | 'D' 1609 | 'w' | 'W' | '0' .. '9' | 'G' | 'A' 1610 => exit Parse_Loop; 1611 when 'n' => Emit (ASCII.LF); 1612 when 't' => Emit (ASCII.HT); 1613 when 'r' => Emit (ASCII.CR); 1614 when 'f' => Emit (ASCII.FF); 1615 when 'e' => Emit (ASCII.ESC); 1616 when 'a' => Emit (ASCII.BEL); 1617 when others => Emit (Expression (Parse_Pos + 1)); 1618 end case; 1619 1620 Parse_Pos := Parse_Pos + 1; 1621 end if; 1622 1623 when others => 1624 Start_Pos := Parse_Pos; 1625 Case_Emit (C); 1626 end case; 1627 1628 Parse_Pos := Parse_Pos + 1; 1629 exit Parse_Loop when Parse_Pos > Parse_End 1630 or else Emit_Ptr - Length_Ptr = 254; 1631 end loop Parse_Loop; 1632 1633 -- Is the string followed by a '*+?{' operator ? If yes, and if there 1634 -- is an initial string to emit, do it now. 1635 1636 if Has_Special_Operator 1637 and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes 1638 then 1639 Emit_Ptr := Emit_Ptr - 1; 1640 Parse_Pos := Start_Pos; 1641 end if; 1642 1643 if Length_Ptr <= PM.Size then 1644 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); 1645 end if; 1646 1647 Expr_Flags.Has_Width := True; 1648 1649 -- Slight optimization when there is a single character 1650 1651 if Emit_Ptr = Length_Ptr + 2 then 1652 Expr_Flags.Simple := True; 1653 end if; 1654 end Parse_Literal; 1655 1656 ----------------- 1657 -- Parse_Piece -- 1658 ----------------- 1659 1660 -- Note that the branching code sequences used for '?' and the 1661 -- general cases of '*' and + are somewhat optimized: they use 1662 -- the same NOTHING node as both the endmarker for their branch 1663 -- list and the body of the last branch. It might seem that 1664 -- this node could be dispensed with entirely, but the endmarker 1665 -- role is not redundant. 1666 1667 procedure Parse_Piece 1668 (Expr_Flags : out Expression_Flags; 1669 IP : out Pointer) 1670 is 1671 Op : Character; 1672 New_Flags : Expression_Flags; 1673 Greedy : Boolean := True; 1674 1675 begin 1676 Parse_Atom (New_Flags, IP); 1677 1678 if IP = 0 1679 or else Parse_Pos > Parse_End 1680 or else not Is_Mult (Parse_Pos) 1681 then 1682 Expr_Flags := New_Flags; 1683 return; 1684 end if; 1685 1686 Op := Expression (Parse_Pos); 1687 1688 Expr_Flags := 1689 (if Op /= '+' 1690 then (SP_Start => True, others => False) 1691 else (Has_Width => True, others => False)); 1692 1693 -- Detect non greedy operators in the easy cases 1694 1695 if Op /= '{' 1696 and then Parse_Pos + 1 <= Parse_End 1697 and then Expression (Parse_Pos + 1) = '?' 1698 then 1699 Greedy := False; 1700 Parse_Pos := Parse_Pos + 1; 1701 end if; 1702 1703 -- Generate the byte code 1704 1705 case Op is 1706 when '*' => 1707 1708 if New_Flags.Simple then 1709 Insert_Operator (STAR, IP, Greedy); 1710 else 1711 Link_Tail (IP, Emit_Node (WHILEM)); 1712 Insert_Curly_Operator 1713 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); 1714 Link_Tail (IP, Emit_Node (NOTHING)); 1715 end if; 1716 1717 when '+' => 1718 1719 if New_Flags.Simple then 1720 Insert_Operator (PLUS, IP, Greedy); 1721 else 1722 Link_Tail (IP, Emit_Node (WHILEM)); 1723 Insert_Curly_Operator 1724 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); 1725 Link_Tail (IP, Emit_Node (NOTHING)); 1726 end if; 1727 1728 when '?' => 1729 if New_Flags.Simple then 1730 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); 1731 else 1732 Link_Tail (IP, Emit_Node (WHILEM)); 1733 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); 1734 Link_Tail (IP, Emit_Node (NOTHING)); 1735 end if; 1736 1737 when '{' => 1738 declare 1739 Min, Max : Natural; 1740 1741 begin 1742 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); 1743 1744 if New_Flags.Simple then 1745 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); 1746 else 1747 Link_Tail (IP, Emit_Node (WHILEM)); 1748 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); 1749 Link_Tail (IP, Emit_Node (NOTHING)); 1750 end if; 1751 end; 1752 1753 when others => 1754 null; 1755 end case; 1756 1757 Parse_Pos := Parse_Pos + 1; 1758 1759 if Parse_Pos <= Parse_End 1760 and then Is_Mult (Parse_Pos) 1761 then 1762 Fail ("nested *+{"); 1763 end if; 1764 end Parse_Piece; 1765 1766 --------------------------------- 1767 -- Parse_Posix_Character_Class -- 1768 --------------------------------- 1769 1770 function Parse_Posix_Character_Class return Std_Class is 1771 Invert : Boolean := False; 1772 Class : Std_Class := ANYOF_NONE; 1773 E : String renames Expression; 1774 1775 -- Class names. Note that code assumes that the length of all 1776 -- classes starting with the same letter have the same length. 1777 1778 Alnum : constant String := "alnum:]"; 1779 Alpha : constant String := "alpha:]"; 1780 Ascii_C : constant String := "ascii:]"; 1781 Cntrl : constant String := "cntrl:]"; 1782 Digit : constant String := "digit:]"; 1783 Graph : constant String := "graph:]"; 1784 Lower : constant String := "lower:]"; 1785 Print : constant String := "print:]"; 1786 Punct : constant String := "punct:]"; 1787 Space : constant String := "space:]"; 1788 Upper : constant String := "upper:]"; 1789 Word : constant String := "word:]"; 1790 Xdigit : constant String := "xdigit:]"; 1791 1792 begin 1793 -- Case of character class specified 1794 1795 if Parse_Pos <= Parse_End 1796 and then Expression (Parse_Pos) = ':' 1797 then 1798 Parse_Pos := Parse_Pos + 1; 1799 1800 -- Do we have something like: [[:^alpha:]] 1801 1802 if Parse_Pos <= Parse_End 1803 and then Expression (Parse_Pos) = '^' 1804 then 1805 Invert := True; 1806 Parse_Pos := Parse_Pos + 1; 1807 end if; 1808 1809 -- Check for class names based on first letter 1810 1811 case Expression (Parse_Pos) is 1812 when 'a' => 1813 1814 -- All 'a' classes have the same length (Alnum'Length) 1815 1816 if Parse_Pos + Alnum'Length - 1 <= Parse_End then 1817 if 1818 E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum 1819 then 1820 Class := 1821 (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); 1822 Parse_Pos := Parse_Pos + Alnum'Length; 1823 1824 elsif 1825 E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha 1826 then 1827 Class := 1828 (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); 1829 Parse_Pos := Parse_Pos + Alpha'Length; 1830 1831 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = 1832 Ascii_C 1833 then 1834 Class := 1835 (if Invert then ANYOF_NASCII else ANYOF_ASCII); 1836 Parse_Pos := Parse_Pos + Ascii_C'Length; 1837 else 1838 Fail ("Invalid character class: " & E); 1839 end if; 1840 1841 else 1842 Fail ("Invalid character class: " & E); 1843 end if; 1844 1845 when 'c' => 1846 if Parse_Pos + Cntrl'Length - 1 <= Parse_End 1847 and then 1848 E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl 1849 then 1850 Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); 1851 Parse_Pos := Parse_Pos + Cntrl'Length; 1852 else 1853 Fail ("Invalid character class: " & E); 1854 end if; 1855 1856 when 'd' => 1857 if Parse_Pos + Digit'Length - 1 <= Parse_End 1858 and then 1859 E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit 1860 then 1861 Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); 1862 Parse_Pos := Parse_Pos + Digit'Length; 1863 end if; 1864 1865 when 'g' => 1866 if Parse_Pos + Graph'Length - 1 <= Parse_End 1867 and then 1868 E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph 1869 then 1870 Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); 1871 Parse_Pos := Parse_Pos + Graph'Length; 1872 else 1873 Fail ("Invalid character class: " & E); 1874 end if; 1875 1876 when 'l' => 1877 if Parse_Pos + Lower'Length - 1 <= Parse_End 1878 and then 1879 E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower 1880 then 1881 Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); 1882 Parse_Pos := Parse_Pos + Lower'Length; 1883 else 1884 Fail ("Invalid character class: " & E); 1885 end if; 1886 1887 when 'p' => 1888 1889 -- All 'p' classes have the same length 1890 1891 if Parse_Pos + Print'Length - 1 <= Parse_End then 1892 if 1893 E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print 1894 then 1895 Class := 1896 (if Invert then ANYOF_NPRINT else ANYOF_PRINT); 1897 Parse_Pos := Parse_Pos + Print'Length; 1898 1899 elsif 1900 E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct 1901 then 1902 Class := 1903 (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); 1904 Parse_Pos := Parse_Pos + Punct'Length; 1905 1906 else 1907 Fail ("Invalid character class: " & E); 1908 end if; 1909 1910 else 1911 Fail ("Invalid character class: " & E); 1912 end if; 1913 1914 when 's' => 1915 if Parse_Pos + Space'Length - 1 <= Parse_End 1916 and then 1917 E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space 1918 then 1919 Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); 1920 Parse_Pos := Parse_Pos + Space'Length; 1921 else 1922 Fail ("Invalid character class: " & E); 1923 end if; 1924 1925 when 'u' => 1926 if Parse_Pos + Upper'Length - 1 <= Parse_End 1927 and then 1928 E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper 1929 then 1930 Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); 1931 Parse_Pos := Parse_Pos + Upper'Length; 1932 else 1933 Fail ("Invalid character class: " & E); 1934 end if; 1935 1936 when 'w' => 1937 if Parse_Pos + Word'Length - 1 <= Parse_End 1938 and then 1939 E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word 1940 then 1941 Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); 1942 Parse_Pos := Parse_Pos + Word'Length; 1943 else 1944 Fail ("Invalid character class: " & E); 1945 end if; 1946 1947 when 'x' => 1948 if Parse_Pos + Xdigit'Length - 1 <= Parse_End 1949 and then 1950 E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit 1951 then 1952 Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); 1953 Parse_Pos := Parse_Pos + Xdigit'Length; 1954 1955 else 1956 Fail ("Invalid character class: " & E); 1957 end if; 1958 1959 when others => 1960 Fail ("Invalid character class: " & E); 1961 end case; 1962 1963 -- Character class not specified 1964 1965 else 1966 return ANYOF_NONE; 1967 end if; 1968 1969 return Class; 1970 end Parse_Posix_Character_Class; 1971 1972 -- Local Declarations 1973 1974 Result : Pointer; 1975 1976 Expr_Flags : Expression_Flags; 1977 pragma Unreferenced (Expr_Flags); 1978 1979 -- Start of processing for Compile 1980 1981 begin 1982 Parse (False, False, Expr_Flags, Result); 1983 1984 if Result = 0 then 1985 Fail ("Couldn't compile expression"); 1986 end if; 1987 1988 Final_Code_Size := Emit_Ptr - 1; 1989 1990 -- Do we want to actually compile the expression, or simply get the 1991 -- code size ??? 1992 1993 if Emit_Ptr <= PM.Size then 1994 Optimize (PM); 1995 end if; 1996 1997 PM.Flags := Flags; 1998 end Compile; 1999 2000 function Compile 2001 (Expression : String; 2002 Flags : Regexp_Flags := No_Flags) return Pattern_Matcher 2003 is 2004 -- Assume the compiled regexp will fit in 1000 chars. If it does not we 2005 -- will have to compile a second time once the correct size is known. If 2006 -- it fits, we save a significant amount of time by avoiding the second 2007 -- compilation. 2008 2009 Dummy : Pattern_Matcher (1000); 2010 Size : Program_Size; 2011 2012 begin 2013 Compile (Dummy, Expression, Size, Flags); 2014 2015 if Size <= Dummy.Size then 2016 return Pattern_Matcher' 2017 (Size => Size, 2018 First => Dummy.First, 2019 Anchored => Dummy.Anchored, 2020 Must_Have => Dummy.Must_Have, 2021 Must_Have_Length => Dummy.Must_Have_Length, 2022 Paren_Count => Dummy.Paren_Count, 2023 Flags => Dummy.Flags, 2024 Program => 2025 Dummy.Program 2026 (Dummy.Program'First .. Dummy.Program'First + Size - 1)); 2027 else 2028 -- We have to recompile now that we know the size 2029 -- ??? Can we use Ada 2005's return construct ? 2030 2031 declare 2032 Result : Pattern_Matcher (Size); 2033 begin 2034 Compile (Result, Expression, Size, Flags); 2035 return Result; 2036 end; 2037 end if; 2038 end Compile; 2039 2040 procedure Compile 2041 (Matcher : out Pattern_Matcher; 2042 Expression : String; 2043 Flags : Regexp_Flags := No_Flags) 2044 is 2045 Size : Program_Size; 2046 2047 begin 2048 Compile (Matcher, Expression, Size, Flags); 2049 2050 if Size > Matcher.Size then 2051 raise Expression_Error with "Pattern_Matcher is too small"; 2052 end if; 2053 end Compile; 2054 2055 -------------------- 2056 -- Dump_Operation -- 2057 -------------------- 2058 2059 procedure Dump_Operation 2060 (Program : Program_Data; 2061 Index : Pointer; 2062 Indent : Natural) 2063 is 2064 Current : Pointer := Index; 2065 begin 2066 Dump_Until (Program, Current, Current + 1, Indent); 2067 end Dump_Operation; 2068 2069 ---------------- 2070 -- Dump_Until -- 2071 ---------------- 2072 2073 procedure Dump_Until 2074 (Program : Program_Data; 2075 Index : in out Pointer; 2076 Till : Pointer; 2077 Indent : Natural; 2078 Do_Print : Boolean := True) 2079 is 2080 function Image (S : String) return String; 2081 -- Remove leading space 2082 2083 ----------- 2084 -- Image -- 2085 ----------- 2086 2087 function Image (S : String) return String is 2088 begin 2089 if S (S'First) = ' ' then 2090 return S (S'First + 1 .. S'Last); 2091 else 2092 return S; 2093 end if; 2094 end Image; 2095 2096 -- Local variables 2097 2098 Op : Opcode; 2099 Next : Pointer; 2100 Length : Pointer; 2101 Local_Indent : Natural := Indent; 2102 2103 -- Start of processing for Dump_Until 2104 2105 begin 2106 while Index < Till loop 2107 Op := Opcode'Val (Character'Pos ((Program (Index)))); 2108 Next := Get_Next (Program, Index); 2109 2110 if Do_Print then 2111 declare 2112 Point : constant String := Pointer'Image (Index); 2113 begin 2114 Put ((1 .. 4 - Point'Length => ' ') 2115 & Point & ":" 2116 & (1 .. Local_Indent * 2 => ' ') & Opcode'Image (Op)); 2117 end; 2118 2119 -- Print the parenthesis number 2120 2121 if Op = OPEN or else Op = CLOSE or else Op = REFF then 2122 Put (Image (Natural'Image 2123 (Character'Pos 2124 (Program (Index + Next_Pointer_Bytes))))); 2125 end if; 2126 2127 if Next = Index then 2128 Put (" (-)"); 2129 else 2130 Put (" (" & Image (Pointer'Image (Next)) & ")"); 2131 end if; 2132 end if; 2133 2134 case Op is 2135 when ANYOF => 2136 declare 2137 Bitmap : Character_Class; 2138 Last : Character := ASCII.NUL; 2139 Current : Natural := 0; 2140 Current_Char : Character; 2141 2142 begin 2143 Bitmap_Operand (Program, Index, Bitmap); 2144 2145 if Do_Print then 2146 Put ("["); 2147 2148 while Current <= 255 loop 2149 Current_Char := Character'Val (Current); 2150 2151 -- First item in a range 2152 2153 if Get_From_Class (Bitmap, Current_Char) then 2154 Last := Current_Char; 2155 2156 -- Search for the last item in the range 2157 2158 loop 2159 Current := Current + 1; 2160 exit when Current > 255; 2161 Current_Char := Character'Val (Current); 2162 exit when 2163 not Get_From_Class (Bitmap, Current_Char); 2164 end loop; 2165 2166 if not Is_Graphic (Last) then 2167 Put (Last'Img); 2168 else 2169 Put (Last); 2170 end if; 2171 2172 if Character'Succ (Last) /= Current_Char then 2173 Put ("\-" & Character'Pred (Current_Char)); 2174 end if; 2175 2176 else 2177 Current := Current + 1; 2178 end if; 2179 end loop; 2180 2181 Put_Line ("]"); 2182 end if; 2183 2184 Index := Index + Next_Pointer_Bytes + Bitmap'Length; 2185 end; 2186 2187 when EXACT | EXACTF => 2188 Length := String_Length (Program, Index); 2189 if Do_Print then 2190 Put (" (" & Image (Program_Size'Image (Length + 1)) 2191 & " chars) <" 2192 & String (Program (String_Operand (Index) 2193 .. String_Operand (Index) 2194 + Length))); 2195 Put_Line (">"); 2196 end if; 2197 2198 Index := String_Operand (Index) + Length + 1; 2199 2200 -- Node operand 2201 2202 when BRANCH | STAR | PLUS => 2203 if Do_Print then 2204 New_Line; 2205 end if; 2206 2207 Index := Index + Next_Pointer_Bytes; 2208 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2209 Local_Indent + 1, Do_Print); 2210 2211 when CURLY | CURLYX => 2212 if Do_Print then 2213 Put_Line 2214 (" {" 2215 & Image (Natural'Image 2216 (Read_Natural (Program, Index + Next_Pointer_Bytes))) 2217 & "," 2218 & Image (Natural'Image (Read_Natural (Program, Index + 5))) 2219 & "}"); 2220 end if; 2221 2222 Index := Index + 7; 2223 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2224 Local_Indent + 1, Do_Print); 2225 2226 when OPEN => 2227 if Do_Print then 2228 New_Line; 2229 end if; 2230 2231 Index := Index + 4; 2232 Local_Indent := Local_Indent + 1; 2233 2234 when CLOSE | REFF => 2235 if Do_Print then 2236 New_Line; 2237 end if; 2238 2239 Index := Index + 4; 2240 2241 if Op = CLOSE then 2242 Local_Indent := Local_Indent - 1; 2243 end if; 2244 2245 when others => 2246 Index := Index + Next_Pointer_Bytes; 2247 2248 if Do_Print then 2249 New_Line; 2250 end if; 2251 2252 exit when Op = EOP; 2253 end case; 2254 end loop; 2255 end Dump_Until; 2256 2257 ---------- 2258 -- Dump -- 2259 ---------- 2260 2261 procedure Dump (Self : Pattern_Matcher) is 2262 Program : Program_Data renames Self.Program; 2263 Index : Pointer := Program'First; 2264 2265 -- Start of processing for Dump 2266 2267 begin 2268 Put_Line ("Must start with (Self.First) = " 2269 & Character'Image (Self.First)); 2270 2271 if (Self.Flags and Case_Insensitive) /= 0 then 2272 Put_Line (" Case_Insensitive mode"); 2273 end if; 2274 2275 if (Self.Flags and Single_Line) /= 0 then 2276 Put_Line (" Single_Line mode"); 2277 end if; 2278 2279 if (Self.Flags and Multiple_Lines) /= 0 then 2280 Put_Line (" Multiple_Lines mode"); 2281 end if; 2282 2283 Dump_Until (Program, Index, Self.Program'Last + 1, 0); 2284 end Dump; 2285 2286 -------------------- 2287 -- Get_From_Class -- 2288 -------------------- 2289 2290 function Get_From_Class 2291 (Bitmap : Character_Class; 2292 C : Character) return Boolean 2293 is 2294 Value : constant Class_Byte := Character'Pos (C); 2295 begin 2296 return 2297 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; 2298 end Get_From_Class; 2299 2300 -------------- 2301 -- Get_Next -- 2302 -------------- 2303 2304 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is 2305 begin 2306 return IP + Pointer (Read_Natural (Program, IP + 1)); 2307 end Get_Next; 2308 2309 -------------- 2310 -- Is_Alnum -- 2311 -------------- 2312 2313 function Is_Alnum (C : Character) return Boolean is 2314 begin 2315 return Is_Alphanumeric (C) or else C = '_'; 2316 end Is_Alnum; 2317 2318 ------------------ 2319 -- Is_Printable -- 2320 ------------------ 2321 2322 function Is_Printable (C : Character) return Boolean is 2323 begin 2324 -- Printable if space or graphic character or other whitespace 2325 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) 2326 2327 return C in Character'Val (32) .. Character'Val (126) 2328 or else C in ASCII.HT .. ASCII.CR; 2329 end Is_Printable; 2330 2331 -------------------- 2332 -- Is_White_Space -- 2333 -------------------- 2334 2335 function Is_White_Space (C : Character) return Boolean is 2336 begin 2337 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 2338 2339 return C = ' ' or else C in ASCII.HT .. ASCII.CR; 2340 end Is_White_Space; 2341 2342 ----------- 2343 -- Match -- 2344 ----------- 2345 2346 procedure Match 2347 (Self : Pattern_Matcher; 2348 Data : String; 2349 Matches : out Match_Array; 2350 Data_First : Integer := -1; 2351 Data_Last : Positive := Positive'Last) 2352 is 2353 Program : Program_Data renames Self.Program; -- Shorter notation 2354 2355 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); 2356 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); 2357 2358 -- Global work variables 2359 2360 Input_Pos : Natural; -- String-input pointer 2361 BOL_Pos : Natural; -- Beginning of input, for ^ check 2362 Matched : Boolean := False; -- Until proven True 2363 2364 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, 2365 Matches'Last)); 2366 -- Stores the value of all the parenthesis pairs. 2367 -- We do not use directly Matches, so that we can also use back 2368 -- references (REFF) even if Matches is too small. 2369 2370 type Natural_Array is array (Match_Count range <>) of Natural; 2371 Matches_Tmp : Natural_Array (Matches_Full'Range); 2372 -- Save the opening position of parenthesis 2373 2374 Last_Paren : Natural := 0; 2375 -- Last parenthesis seen 2376 2377 Greedy : Boolean := True; 2378 -- True if the next operator should be greedy 2379 2380 type Current_Curly_Record; 2381 type Current_Curly_Access is access all Current_Curly_Record; 2382 type Current_Curly_Record is record 2383 Paren_Floor : Natural; -- How far back to strip parenthesis data 2384 Cur : Integer; -- How many instances of scan we've matched 2385 Min : Natural; -- Minimal number of scans to match 2386 Max : Natural; -- Maximal number of scans to match 2387 Greedy : Boolean; -- Whether to work our way up or down 2388 Scan : Pointer; -- The thing to match 2389 Next : Pointer; -- What has to match after it 2390 Lastloc : Natural; -- Where we started matching this scan 2391 Old_Cc : Current_Curly_Access; -- Before we started this one 2392 end record; 2393 -- Data used to handle the curly operator and the plus and star 2394 -- operators for complex expressions. 2395 2396 Current_Curly : Current_Curly_Access := null; 2397 -- The curly currently being processed 2398 2399 ----------------------- 2400 -- Local Subprograms -- 2401 ----------------------- 2402 2403 function Index (Start : Positive; C : Character) return Natural; 2404 -- Find character C in Data starting at Start and return position 2405 2406 function Repeat 2407 (IP : Pointer; 2408 Max : Natural := Natural'Last) return Natural; 2409 -- Repeatedly match something simple, report how many 2410 -- It only matches on things of length 1. 2411 -- Starting from Input_Pos, it matches at most Max CURLY. 2412 2413 function Try (Pos : Positive) return Boolean; 2414 -- Try to match at specific point 2415 2416 function Match (IP : Pointer) return Boolean; 2417 -- This is the main matching routine. Conceptually the strategy 2418 -- is simple: check to see whether the current node matches, 2419 -- call self recursively to see whether the rest matches, 2420 -- and then act accordingly. 2421 -- 2422 -- In practice Match makes some effort to avoid recursion, in 2423 -- particular by going through "ordinary" nodes (that don't 2424 -- need to know whether the rest of the match failed) by 2425 -- using a loop instead of recursion. 2426 -- Why is the above comment part of the spec rather than body ??? 2427 2428 function Match_Whilem return Boolean; 2429 -- Return True if a WHILEM matches the Current_Curly 2430 2431 function Recurse_Match (IP : Pointer; From : Natural) return Boolean; 2432 pragma Inline (Recurse_Match); 2433 -- Calls Match recursively. It saves and restores the parenthesis 2434 -- status and location in the input stream correctly, so that 2435 -- backtracking is possible 2436 2437 function Match_Simple_Operator 2438 (Op : Opcode; 2439 Scan : Pointer; 2440 Next : Pointer; 2441 Greedy : Boolean) return Boolean; 2442 -- Return True it the simple operator (possibly non-greedy) matches 2443 2444 Dump_Indent : Integer := -1; 2445 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); 2446 procedure Dump_Error (Msg : String); 2447 -- Debug: print the current context 2448 2449 pragma Inline (Index); 2450 pragma Inline (Repeat); 2451 2452 -- These are two complex functions, but used only once 2453 2454 pragma Inline (Match_Whilem); 2455 pragma Inline (Match_Simple_Operator); 2456 2457 ----------- 2458 -- Index -- 2459 ----------- 2460 2461 function Index (Start : Positive; C : Character) return Natural is 2462 begin 2463 for J in Start .. Last_In_Data loop 2464 if Data (J) = C then 2465 return J; 2466 end if; 2467 end loop; 2468 2469 return 0; 2470 end Index; 2471 2472 ------------------- 2473 -- Recurse_Match -- 2474 ------------------- 2475 2476 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is 2477 L : constant Natural := Last_Paren; 2478 Tmp_F : constant Match_Array := 2479 Matches_Full (From + 1 .. Matches_Full'Last); 2480 Start : constant Natural_Array := 2481 Matches_Tmp (From + 1 .. Matches_Tmp'Last); 2482 Input : constant Natural := Input_Pos; 2483 2484 Dump_Indent_Save : constant Integer := Dump_Indent; 2485 2486 begin 2487 if Match (IP) then 2488 return True; 2489 end if; 2490 2491 Last_Paren := L; 2492 Matches_Full (Tmp_F'Range) := Tmp_F; 2493 Matches_Tmp (Start'Range) := Start; 2494 Input_Pos := Input; 2495 Dump_Indent := Dump_Indent_Save; 2496 return False; 2497 end Recurse_Match; 2498 2499 ------------------ 2500 -- Dump_Current -- 2501 ------------------ 2502 2503 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is 2504 Length : constant := 10; 2505 Pos : constant String := Integer'Image (Input_Pos); 2506 2507 begin 2508 if Prefix then 2509 Put ((1 .. 5 - Pos'Length => ' ')); 2510 Put (Pos & " <" 2511 & Data (Input_Pos 2512 .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); 2513 Put ((1 .. Length - 1 - Last_In_Data + Input_Pos => ' ')); 2514 Put ("> |"); 2515 2516 else 2517 Put (" "); 2518 end if; 2519 2520 Dump_Operation (Program, Scan, Indent => Dump_Indent); 2521 end Dump_Current; 2522 2523 ---------------- 2524 -- Dump_Error -- 2525 ---------------- 2526 2527 procedure Dump_Error (Msg : String) is 2528 begin 2529 Put (" | "); 2530 Put ((1 .. Dump_Indent * 2 => ' ')); 2531 Put_Line (Msg); 2532 end Dump_Error; 2533 2534 ----------- 2535 -- Match -- 2536 ----------- 2537 2538 function Match (IP : Pointer) return Boolean is 2539 Scan : Pointer := IP; 2540 Next : Pointer; 2541 Op : Opcode; 2542 Result : Boolean; 2543 2544 begin 2545 Dump_Indent := Dump_Indent + 1; 2546 2547 State_Machine : 2548 loop 2549 pragma Assert (Scan /= 0); 2550 2551 -- Determine current opcode and count its usage in debug mode 2552 2553 Op := Opcode'Val (Character'Pos (Program (Scan))); 2554 2555 -- Calculate offset of next instruction. Second character is most 2556 -- significant in Program_Data. 2557 2558 Next := Get_Next (Program, Scan); 2559 2560 if Debug then 2561 Dump_Current (Scan); 2562 end if; 2563 2564 case Op is 2565 when EOP => 2566 Dump_Indent := Dump_Indent - 1; 2567 return True; -- Success 2568 2569 when BRANCH => 2570 if Program (Next) /= BRANCH then 2571 Next := Operand (Scan); -- No choice, avoid recursion 2572 2573 else 2574 loop 2575 if Recurse_Match (Operand (Scan), 0) then 2576 Dump_Indent := Dump_Indent - 1; 2577 return True; 2578 end if; 2579 2580 Scan := Get_Next (Program, Scan); 2581 exit when Scan = 0 or else Program (Scan) /= BRANCH; 2582 end loop; 2583 2584 exit State_Machine; 2585 end if; 2586 2587 when NOTHING => 2588 null; 2589 2590 when BOL => 2591 exit State_Machine when Input_Pos /= BOL_Pos 2592 and then ((Self.Flags and Multiple_Lines) = 0 2593 or else Data (Input_Pos - 1) /= ASCII.LF); 2594 2595 when MBOL => 2596 exit State_Machine when Input_Pos /= BOL_Pos 2597 and then Data (Input_Pos - 1) /= ASCII.LF; 2598 2599 when SBOL => 2600 exit State_Machine when Input_Pos /= BOL_Pos; 2601 2602 when EOL => 2603 2604 -- A combination of MEOL and SEOL 2605 2606 if (Self.Flags and Multiple_Lines) = 0 then 2607 2608 -- Single line mode 2609 2610 exit State_Machine when Input_Pos <= Data'Last; 2611 2612 elsif Input_Pos <= Last_In_Data then 2613 exit State_Machine when Data (Input_Pos) /= ASCII.LF; 2614 else 2615 exit State_Machine when Last_In_Data /= Data'Last; 2616 end if; 2617 2618 when MEOL => 2619 if Input_Pos <= Last_In_Data then 2620 exit State_Machine when Data (Input_Pos) /= ASCII.LF; 2621 else 2622 exit State_Machine when Last_In_Data /= Data'Last; 2623 end if; 2624 2625 when SEOL => 2626 2627 -- If there is a character before Data'Last (even if 2628 -- Last_In_Data stops before then), we can't have the 2629 -- end of the line. 2630 2631 exit State_Machine when Input_Pos <= Data'Last; 2632 2633 when BOUND | NBOUND => 2634 2635 -- Was last char in word ? 2636 2637 declare 2638 N : Boolean := False; 2639 Ln : Boolean := False; 2640 2641 begin 2642 if Input_Pos /= First_In_Data then 2643 N := Is_Alnum (Data (Input_Pos - 1)); 2644 end if; 2645 2646 Ln := 2647 (if Input_Pos > Last_In_Data 2648 then False 2649 else Is_Alnum (Data (Input_Pos))); 2650 2651 if Op = BOUND then 2652 if N = Ln then 2653 exit State_Machine; 2654 end if; 2655 else 2656 if N /= Ln then 2657 exit State_Machine; 2658 end if; 2659 end if; 2660 end; 2661 2662 when SPACE => 2663 exit State_Machine when Input_Pos > Last_In_Data 2664 or else not Is_White_Space (Data (Input_Pos)); 2665 Input_Pos := Input_Pos + 1; 2666 2667 when NSPACE => 2668 exit State_Machine when Input_Pos > Last_In_Data 2669 or else Is_White_Space (Data (Input_Pos)); 2670 Input_Pos := Input_Pos + 1; 2671 2672 when DIGIT => 2673 exit State_Machine when Input_Pos > Last_In_Data 2674 or else not Is_Digit (Data (Input_Pos)); 2675 Input_Pos := Input_Pos + 1; 2676 2677 when NDIGIT => 2678 exit State_Machine when Input_Pos > Last_In_Data 2679 or else Is_Digit (Data (Input_Pos)); 2680 Input_Pos := Input_Pos + 1; 2681 2682 when ALNUM => 2683 exit State_Machine when Input_Pos > Last_In_Data 2684 or else not Is_Alnum (Data (Input_Pos)); 2685 Input_Pos := Input_Pos + 1; 2686 2687 when NALNUM => 2688 exit State_Machine when Input_Pos > Last_In_Data 2689 or else Is_Alnum (Data (Input_Pos)); 2690 Input_Pos := Input_Pos + 1; 2691 2692 when ANY => 2693 exit State_Machine when Input_Pos > Last_In_Data 2694 or else Data (Input_Pos) = ASCII.LF; 2695 Input_Pos := Input_Pos + 1; 2696 2697 when SANY => 2698 exit State_Machine when Input_Pos > Last_In_Data; 2699 Input_Pos := Input_Pos + 1; 2700 2701 when EXACT => 2702 declare 2703 Opnd : Pointer := String_Operand (Scan); 2704 Current : Positive := Input_Pos; 2705 Last : constant Pointer := 2706 Opnd + String_Length (Program, Scan); 2707 2708 begin 2709 while Opnd <= Last loop 2710 exit State_Machine when Current > Last_In_Data 2711 or else Program (Opnd) /= Data (Current); 2712 Current := Current + 1; 2713 Opnd := Opnd + 1; 2714 end loop; 2715 2716 Input_Pos := Current; 2717 end; 2718 2719 when EXACTF => 2720 declare 2721 Opnd : Pointer := String_Operand (Scan); 2722 Current : Positive := Input_Pos; 2723 2724 Last : constant Pointer := 2725 Opnd + String_Length (Program, Scan); 2726 2727 begin 2728 while Opnd <= Last loop 2729 exit State_Machine when Current > Last_In_Data 2730 or else Program (Opnd) /= To_Lower (Data (Current)); 2731 Current := Current + 1; 2732 Opnd := Opnd + 1; 2733 end loop; 2734 2735 Input_Pos := Current; 2736 end; 2737 2738 when ANYOF => 2739 declare 2740 Bitmap : Character_Class; 2741 begin 2742 Bitmap_Operand (Program, Scan, Bitmap); 2743 exit State_Machine when Input_Pos > Last_In_Data 2744 or else not Get_From_Class (Bitmap, Data (Input_Pos)); 2745 Input_Pos := Input_Pos + 1; 2746 end; 2747 2748 when OPEN => 2749 declare 2750 No : constant Natural := 2751 Character'Pos (Program (Operand (Scan))); 2752 begin 2753 Matches_Tmp (No) := Input_Pos; 2754 end; 2755 2756 when CLOSE => 2757 declare 2758 No : constant Natural := 2759 Character'Pos (Program (Operand (Scan))); 2760 2761 begin 2762 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); 2763 2764 if Last_Paren < No then 2765 Last_Paren := No; 2766 end if; 2767 end; 2768 2769 when REFF => 2770 declare 2771 No : constant Natural := 2772 Character'Pos (Program (Operand (Scan))); 2773 2774 Data_Pos : Natural; 2775 2776 begin 2777 -- If we haven't seen that parenthesis yet 2778 2779 if Last_Paren < No then 2780 Dump_Indent := Dump_Indent - 1; 2781 2782 if Debug then 2783 Dump_Error ("REFF: No match, backtracking"); 2784 end if; 2785 2786 return False; 2787 end if; 2788 2789 Data_Pos := Matches_Full (No).First; 2790 2791 while Data_Pos <= Matches_Full (No).Last loop 2792 if Input_Pos > Last_In_Data 2793 or else Data (Input_Pos) /= Data (Data_Pos) 2794 then 2795 Dump_Indent := Dump_Indent - 1; 2796 2797 if Debug then 2798 Dump_Error ("REFF: No match, backtracking"); 2799 end if; 2800 2801 return False; 2802 end if; 2803 2804 Input_Pos := Input_Pos + 1; 2805 Data_Pos := Data_Pos + 1; 2806 end loop; 2807 end; 2808 2809 when MINMOD => 2810 Greedy := False; 2811 2812 when STAR | PLUS | CURLY => 2813 declare 2814 Greed : constant Boolean := Greedy; 2815 begin 2816 Greedy := True; 2817 Result := Match_Simple_Operator (Op, Scan, Next, Greed); 2818 Dump_Indent := Dump_Indent - 1; 2819 return Result; 2820 end; 2821 2822 when CURLYX => 2823 2824 -- Looking at something like: 2825 2826 -- 1: CURLYX {n,m} (->4) 2827 -- 2: code for complex thing (->3) 2828 -- 3: WHILEM (->0) 2829 -- 4: NOTHING 2830 2831 declare 2832 Min : constant Natural := 2833 Read_Natural (Program, Scan + Next_Pointer_Bytes); 2834 Max : constant Natural := 2835 Read_Natural 2836 (Program, Scan + Next_Pointer_Bytes + 2); 2837 Cc : aliased Current_Curly_Record; 2838 2839 Has_Match : Boolean; 2840 2841 begin 2842 Cc := (Paren_Floor => Last_Paren, 2843 Cur => -1, 2844 Min => Min, 2845 Max => Max, 2846 Greedy => Greedy, 2847 Scan => Scan + 7, 2848 Next => Next, 2849 Lastloc => 0, 2850 Old_Cc => Current_Curly); 2851 Greedy := True; 2852 Current_Curly := Cc'Unchecked_Access; 2853 2854 Has_Match := Match (Next - Next_Pointer_Bytes); 2855 2856 -- Start on the WHILEM 2857 2858 Current_Curly := Cc.Old_Cc; 2859 Dump_Indent := Dump_Indent - 1; 2860 2861 if not Has_Match then 2862 if Debug then 2863 Dump_Error ("CURLYX failed..."); 2864 end if; 2865 end if; 2866 2867 return Has_Match; 2868 end; 2869 2870 when WHILEM => 2871 Result := Match_Whilem; 2872 Dump_Indent := Dump_Indent - 1; 2873 2874 if Debug and then not Result then 2875 Dump_Error ("WHILEM: no match, backtracking"); 2876 end if; 2877 2878 return Result; 2879 end case; 2880 2881 Scan := Next; 2882 end loop State_Machine; 2883 2884 if Debug then 2885 Dump_Error ("failed..."); 2886 Dump_Indent := Dump_Indent - 1; 2887 end if; 2888 2889 -- If we get here, there is no match. For successful matches when EOP 2890 -- is the terminating point. 2891 2892 return False; 2893 end Match; 2894 2895 --------------------------- 2896 -- Match_Simple_Operator -- 2897 --------------------------- 2898 2899 function Match_Simple_Operator 2900 (Op : Opcode; 2901 Scan : Pointer; 2902 Next : Pointer; 2903 Greedy : Boolean) return Boolean 2904 is 2905 Next_Char : Character := ASCII.NUL; 2906 Next_Char_Known : Boolean := False; 2907 No : Integer; -- Can be negative 2908 Min : Natural; 2909 Max : Natural := Natural'Last; 2910 Operand_Code : Pointer; 2911 Old : Natural; 2912 Last_Pos : Natural; 2913 Save : constant Natural := Input_Pos; 2914 2915 begin 2916 -- Lookahead to avoid useless match attempts when we know what 2917 -- character comes next. 2918 2919 if Program (Next) = EXACT then 2920 Next_Char := Program (String_Operand (Next)); 2921 Next_Char_Known := True; 2922 end if; 2923 2924 -- Find the minimal and maximal values for the operator 2925 2926 case Op is 2927 when STAR => 2928 Min := 0; 2929 Operand_Code := Operand (Scan); 2930 2931 when PLUS => 2932 Min := 1; 2933 Operand_Code := Operand (Scan); 2934 2935 when others => 2936 Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); 2937 Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); 2938 Operand_Code := Scan + 7; 2939 end case; 2940 2941 if Debug then 2942 Dump_Current (Operand_Code, Prefix => False); 2943 end if; 2944 2945 -- Non greedy operators 2946 2947 if not Greedy then 2948 2949 -- Test we can repeat at least Min times 2950 2951 if Min /= 0 then 2952 No := Repeat (Operand_Code, Min); 2953 2954 if No < Min then 2955 if Debug then 2956 Dump_Error ("failed... matched" & No'Img & " times"); 2957 end if; 2958 2959 return False; 2960 end if; 2961 end if; 2962 2963 Old := Input_Pos; 2964 2965 -- Find the place where 'next' could work 2966 2967 if Next_Char_Known then 2968 2969 -- Last position to check 2970 2971 if Max = Natural'Last then 2972 Last_Pos := Last_In_Data; 2973 else 2974 Last_Pos := Input_Pos + Max; 2975 2976 if Last_Pos > Last_In_Data then 2977 Last_Pos := Last_In_Data; 2978 end if; 2979 end if; 2980 2981 -- Look for the first possible opportunity 2982 2983 if Debug then 2984 Dump_Error ("Next_Char must be " & Next_Char); 2985 end if; 2986 2987 loop 2988 -- Find the next possible position 2989 2990 while Input_Pos <= Last_Pos 2991 and then Data (Input_Pos) /= Next_Char 2992 loop 2993 Input_Pos := Input_Pos + 1; 2994 end loop; 2995 2996 if Input_Pos > Last_Pos then 2997 return False; 2998 end if; 2999 3000 -- Check that we still match if we stop at the position we 3001 -- just found. 3002 3003 declare 3004 Num : constant Natural := Input_Pos - Old; 3005 3006 begin 3007 Input_Pos := Old; 3008 3009 if Debug then 3010 Dump_Error ("Would we still match at that position?"); 3011 end if; 3012 3013 if Repeat (Operand_Code, Num) < Num then 3014 return False; 3015 end if; 3016 end; 3017 3018 -- Input_Pos now points to the new position 3019 3020 if Match (Get_Next (Program, Scan)) then 3021 return True; 3022 end if; 3023 3024 Old := Input_Pos; 3025 Input_Pos := Input_Pos + 1; 3026 end loop; 3027 3028 -- We do not know what the next character is 3029 3030 else 3031 while Max >= Min loop 3032 if Debug then 3033 Dump_Error ("Non-greedy repeat, N=" & Min'Img); 3034 Dump_Error ("Do we still match Next if we stop here?"); 3035 end if; 3036 3037 -- If the next character matches 3038 3039 if Recurse_Match (Next, 1) then 3040 return True; 3041 end if; 3042 3043 Input_Pos := Save + Min; 3044 3045 -- Could not or did not match -- move forward 3046 3047 if Repeat (Operand_Code, 1) /= 0 then 3048 Min := Min + 1; 3049 else 3050 if Debug then 3051 Dump_Error ("Non-greedy repeat failed..."); 3052 end if; 3053 3054 return False; 3055 end if; 3056 end loop; 3057 end if; 3058 3059 return False; 3060 3061 -- Greedy operators 3062 3063 else 3064 No := Repeat (Operand_Code, Max); 3065 3066 if Debug and then No < Min then 3067 Dump_Error ("failed... matched" & No'Img & " times"); 3068 end if; 3069 3070 -- ??? Perl has some special code here in case the next 3071 -- instruction is of type EOL, since $ and \Z can match before 3072 -- *and* after newline at the end. 3073 3074 -- ??? Perl has some special code here in case (paren) is True 3075 3076 -- Else, if we don't have any parenthesis 3077 3078 while No >= Min loop 3079 if not Next_Char_Known 3080 or else (Input_Pos <= Last_In_Data 3081 and then Data (Input_Pos) = Next_Char) 3082 then 3083 if Match (Next) then 3084 return True; 3085 end if; 3086 end if; 3087 3088 -- Could not or did not work, we back up 3089 3090 No := No - 1; 3091 Input_Pos := Save + No; 3092 end loop; 3093 3094 return False; 3095 end if; 3096 end Match_Simple_Operator; 3097 3098 ------------------ 3099 -- Match_Whilem -- 3100 ------------------ 3101 3102 -- This is really hard to understand, because after we match what we 3103 -- are trying to match, we must make sure the rest of the REx is going 3104 -- to match for sure, and to do that we have to go back UP the parse 3105 -- tree by recursing ever deeper. And if it fails, we have to reset 3106 -- our parent's current state that we can try again after backing off. 3107 3108 function Match_Whilem return Boolean is 3109 Cc : constant Current_Curly_Access := Current_Curly; 3110 3111 N : constant Natural := Cc.Cur + 1; 3112 Ln : Natural := 0; 3113 3114 Lastloc : constant Natural := Cc.Lastloc; 3115 -- Detection of 0-len 3116 3117 begin 3118 -- If degenerate scan matches "", assume scan done 3119 3120 if Input_Pos = Cc.Lastloc 3121 and then N >= Cc.Min 3122 then 3123 -- Temporarily restore the old context, and check that we 3124 -- match was comes after CURLYX. 3125 3126 Current_Curly := Cc.Old_Cc; 3127 3128 if Current_Curly /= null then 3129 Ln := Current_Curly.Cur; 3130 end if; 3131 3132 if Match (Cc.Next) then 3133 return True; 3134 end if; 3135 3136 if Current_Curly /= null then 3137 Current_Curly.Cur := Ln; 3138 end if; 3139 3140 Current_Curly := Cc; 3141 return False; 3142 end if; 3143 3144 -- First, just match a string of min scans 3145 3146 if N < Cc.Min then 3147 Cc.Cur := N; 3148 Cc.Lastloc := Input_Pos; 3149 3150 if Debug then 3151 Dump_Error 3152 ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); 3153 end if; 3154 3155 if Match (Cc.Scan) then 3156 return True; 3157 end if; 3158 3159 Cc.Cur := N - 1; 3160 Cc.Lastloc := Lastloc; 3161 3162 if Debug then 3163 Dump_Error ("failed..."); 3164 end if; 3165 3166 return False; 3167 end if; 3168 3169 -- Prefer next over scan for minimal matching 3170 3171 if not Cc.Greedy then 3172 Current_Curly := Cc.Old_Cc; 3173 3174 if Current_Curly /= null then 3175 Ln := Current_Curly.Cur; 3176 end if; 3177 3178 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then 3179 return True; 3180 end if; 3181 3182 if Current_Curly /= null then 3183 Current_Curly.Cur := Ln; 3184 end if; 3185 3186 Current_Curly := Cc; 3187 3188 -- Maximum greed exceeded ? 3189 3190 if N >= Cc.Max then 3191 if Debug then 3192 Dump_Error ("failed..."); 3193 end if; 3194 return False; 3195 end if; 3196 3197 -- Try scanning more and see if it helps 3198 Cc.Cur := N; 3199 Cc.Lastloc := Input_Pos; 3200 3201 if Debug then 3202 Dump_Error ("Next failed, what about Current?"); 3203 end if; 3204 3205 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3206 return True; 3207 end if; 3208 3209 Cc.Cur := N - 1; 3210 Cc.Lastloc := Lastloc; 3211 return False; 3212 end if; 3213 3214 -- Prefer scan over next for maximal matching 3215 3216 if N < Cc.Max then -- more greed allowed ? 3217 Cc.Cur := N; 3218 Cc.Lastloc := Input_Pos; 3219 3220 if Debug then 3221 Dump_Error ("Recurse at current position"); 3222 end if; 3223 3224 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3225 return True; 3226 end if; 3227 end if; 3228 3229 -- Failed deeper matches of scan, so see if this one works 3230 3231 Current_Curly := Cc.Old_Cc; 3232 3233 if Current_Curly /= null then 3234 Ln := Current_Curly.Cur; 3235 end if; 3236 3237 if Debug then 3238 Dump_Error ("Failed matching for later positions"); 3239 end if; 3240 3241 if Match (Cc.Next) then 3242 return True; 3243 end if; 3244 3245 if Current_Curly /= null then 3246 Current_Curly.Cur := Ln; 3247 end if; 3248 3249 Current_Curly := Cc; 3250 Cc.Cur := N - 1; 3251 Cc.Lastloc := Lastloc; 3252 3253 if Debug then 3254 Dump_Error ("failed..."); 3255 end if; 3256 3257 return False; 3258 end Match_Whilem; 3259 3260 ------------ 3261 -- Repeat -- 3262 ------------ 3263 3264 function Repeat 3265 (IP : Pointer; 3266 Max : Natural := Natural'Last) return Natural 3267 is 3268 Scan : Natural := Input_Pos; 3269 Last : Natural; 3270 Op : constant Opcode := 3271 Opcode'Val (Character'Pos (Program (IP))); 3272 Count : Natural; 3273 C : Character; 3274 Bitmap : Character_Class; 3275 3276 begin 3277 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then 3278 Last := Last_In_Data; 3279 else 3280 Last := Scan + Max - 1; 3281 end if; 3282 3283 case Op is 3284 when ANY => 3285 while Scan <= Last 3286 and then Data (Scan) /= ASCII.LF 3287 loop 3288 Scan := Scan + 1; 3289 end loop; 3290 3291 when SANY => 3292 Scan := Last + 1; 3293 3294 when EXACT => 3295 3296 -- The string has only one character if Repeat was called 3297 3298 C := Program (String_Operand (IP)); 3299 while Scan <= Last 3300 and then C = Data (Scan) 3301 loop 3302 Scan := Scan + 1; 3303 end loop; 3304 3305 when EXACTF => 3306 3307 -- The string has only one character if Repeat was called 3308 3309 C := Program (String_Operand (IP)); 3310 while Scan <= Last 3311 and then To_Lower (C) = Data (Scan) 3312 loop 3313 Scan := Scan + 1; 3314 end loop; 3315 3316 when ANYOF => 3317 Bitmap_Operand (Program, IP, Bitmap); 3318 3319 while Scan <= Last 3320 and then Get_From_Class (Bitmap, Data (Scan)) 3321 loop 3322 Scan := Scan + 1; 3323 end loop; 3324 3325 when ALNUM => 3326 while Scan <= Last 3327 and then Is_Alnum (Data (Scan)) 3328 loop 3329 Scan := Scan + 1; 3330 end loop; 3331 3332 when NALNUM => 3333 while Scan <= Last 3334 and then not Is_Alnum (Data (Scan)) 3335 loop 3336 Scan := Scan + 1; 3337 end loop; 3338 3339 when SPACE => 3340 while Scan <= Last 3341 and then Is_White_Space (Data (Scan)) 3342 loop 3343 Scan := Scan + 1; 3344 end loop; 3345 3346 when NSPACE => 3347 while Scan <= Last 3348 and then not Is_White_Space (Data (Scan)) 3349 loop 3350 Scan := Scan + 1; 3351 end loop; 3352 3353 when DIGIT => 3354 while Scan <= Last 3355 and then Is_Digit (Data (Scan)) 3356 loop 3357 Scan := Scan + 1; 3358 end loop; 3359 3360 when NDIGIT => 3361 while Scan <= Last 3362 and then not Is_Digit (Data (Scan)) 3363 loop 3364 Scan := Scan + 1; 3365 end loop; 3366 3367 when others => 3368 raise Program_Error; 3369 end case; 3370 3371 Count := Scan - Input_Pos; 3372 Input_Pos := Scan; 3373 return Count; 3374 end Repeat; 3375 3376 --------- 3377 -- Try -- 3378 --------- 3379 3380 function Try (Pos : Positive) return Boolean is 3381 begin 3382 Input_Pos := Pos; 3383 Last_Paren := 0; 3384 Matches_Full := (others => No_Match); 3385 3386 if Match (Program_First) then 3387 Matches_Full (0) := (Pos, Input_Pos - 1); 3388 return True; 3389 end if; 3390 3391 return False; 3392 end Try; 3393 3394 -- Start of processing for Match 3395 3396 begin 3397 -- Do we have the regexp Never_Match? 3398 3399 if Self.Size = 0 then 3400 Matches := (others => No_Match); 3401 return; 3402 end if; 3403 3404 -- If there is a "must appear" string, look for it 3405 3406 if Self.Must_Have_Length > 0 then 3407 declare 3408 First : constant Character := Program (Self.Must_Have); 3409 Must_First : constant Pointer := Self.Must_Have; 3410 Must_Last : constant Pointer := 3411 Must_First + Pointer (Self.Must_Have_Length - 1); 3412 Next_Try : Natural := Index (First_In_Data, First); 3413 3414 begin 3415 while Next_Try /= 0 3416 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) 3417 = String (Program (Must_First .. Must_Last)) 3418 loop 3419 Next_Try := Index (Next_Try + 1, First); 3420 end loop; 3421 3422 if Next_Try = 0 then 3423 Matches := (others => No_Match); 3424 return; -- Not present 3425 end if; 3426 end; 3427 end if; 3428 3429 -- Mark beginning of line for ^ 3430 3431 BOL_Pos := Data'First; 3432 3433 -- Simplest case first: an anchored match need be tried only once 3434 3435 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then 3436 Matched := Try (First_In_Data); 3437 3438 elsif Self.Anchored then 3439 declare 3440 Next_Try : Natural := First_In_Data; 3441 begin 3442 -- Test the first position in the buffer 3443 Matched := Try (Next_Try); 3444 3445 -- Else only test after newlines 3446 3447 if not Matched then 3448 while Next_Try <= Last_In_Data loop 3449 while Next_Try <= Last_In_Data 3450 and then Data (Next_Try) /= ASCII.LF 3451 loop 3452 Next_Try := Next_Try + 1; 3453 end loop; 3454 3455 Next_Try := Next_Try + 1; 3456 3457 if Next_Try <= Last_In_Data then 3458 Matched := Try (Next_Try); 3459 exit when Matched; 3460 end if; 3461 end loop; 3462 end if; 3463 end; 3464 3465 elsif Self.First /= ASCII.NUL then 3466 -- We know what char it must start with 3467 3468 declare 3469 Next_Try : Natural := Index (First_In_Data, Self.First); 3470 3471 begin 3472 while Next_Try /= 0 loop 3473 Matched := Try (Next_Try); 3474 exit when Matched; 3475 Next_Try := Index (Next_Try + 1, Self.First); 3476 end loop; 3477 end; 3478 3479 else 3480 -- Messy cases: try all locations (including for the empty string) 3481 3482 Matched := Try (First_In_Data); 3483 3484 if not Matched then 3485 for S in First_In_Data + 1 .. Last_In_Data loop 3486 Matched := Try (S); 3487 exit when Matched; 3488 end loop; 3489 end if; 3490 end if; 3491 3492 -- Matched has its value 3493 3494 for J in Last_Paren + 1 .. Matches'Last loop 3495 Matches_Full (J) := No_Match; 3496 end loop; 3497 3498 Matches := Matches_Full (Matches'Range); 3499 end Match; 3500 3501 ----------- 3502 -- Match -- 3503 ----------- 3504 3505 function Match 3506 (Self : Pattern_Matcher; 3507 Data : String; 3508 Data_First : Integer := -1; 3509 Data_Last : Positive := Positive'Last) return Natural 3510 is 3511 Matches : Match_Array (0 .. 0); 3512 3513 begin 3514 Match (Self, Data, Matches, Data_First, Data_Last); 3515 if Matches (0) = No_Match then 3516 return Data'First - 1; 3517 else 3518 return Matches (0).First; 3519 end if; 3520 end Match; 3521 3522 function Match 3523 (Self : Pattern_Matcher; 3524 Data : String; 3525 Data_First : Integer := -1; 3526 Data_Last : Positive := Positive'Last) return Boolean 3527 is 3528 Matches : Match_Array (0 .. 0); 3529 3530 begin 3531 Match (Self, Data, Matches, Data_First, Data_Last); 3532 return Matches (0).First >= Data'First; 3533 end Match; 3534 3535 procedure Match 3536 (Expression : String; 3537 Data : String; 3538 Matches : out Match_Array; 3539 Size : Program_Size := Auto_Size; 3540 Data_First : Integer := -1; 3541 Data_Last : Positive := Positive'Last) 3542 is 3543 PM : Pattern_Matcher (Size); 3544 Finalize_Size : Program_Size; 3545 pragma Unreferenced (Finalize_Size); 3546 begin 3547 if Size = 0 then 3548 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3549 else 3550 Compile (PM, Expression, Finalize_Size); 3551 Match (PM, Data, Matches, Data_First, Data_Last); 3552 end if; 3553 end Match; 3554 3555 ----------- 3556 -- Match -- 3557 ----------- 3558 3559 function Match 3560 (Expression : String; 3561 Data : String; 3562 Size : Program_Size := Auto_Size; 3563 Data_First : Integer := -1; 3564 Data_Last : Positive := Positive'Last) return Natural 3565 is 3566 PM : Pattern_Matcher (Size); 3567 Final_Size : Program_Size; 3568 pragma Unreferenced (Final_Size); 3569 begin 3570 if Size = 0 then 3571 return Match (Compile (Expression), Data, Data_First, Data_Last); 3572 else 3573 Compile (PM, Expression, Final_Size); 3574 return Match (PM, Data, Data_First, Data_Last); 3575 end if; 3576 end Match; 3577 3578 ----------- 3579 -- Match -- 3580 ----------- 3581 3582 function Match 3583 (Expression : String; 3584 Data : String; 3585 Size : Program_Size := Auto_Size; 3586 Data_First : Integer := -1; 3587 Data_Last : Positive := Positive'Last) return Boolean 3588 is 3589 Matches : Match_Array (0 .. 0); 3590 PM : Pattern_Matcher (Size); 3591 Final_Size : Program_Size; 3592 pragma Unreferenced (Final_Size); 3593 begin 3594 if Size = 0 then 3595 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3596 else 3597 Compile (PM, Expression, Final_Size); 3598 Match (PM, Data, Matches, Data_First, Data_Last); 3599 end if; 3600 3601 return Matches (0).First >= Data'First; 3602 end Match; 3603 3604 ------------- 3605 -- Operand -- 3606 ------------- 3607 3608 function Operand (P : Pointer) return Pointer is 3609 begin 3610 return P + Next_Pointer_Bytes; 3611 end Operand; 3612 3613 -------------- 3614 -- Optimize -- 3615 -------------- 3616 3617 procedure Optimize (Self : in out Pattern_Matcher) is 3618 Scan : Pointer; 3619 Program : Program_Data renames Self.Program; 3620 3621 begin 3622 -- Start with safe defaults (no optimization): 3623 -- * No known first character of match 3624 -- * Does not necessarily start at beginning of line 3625 -- * No string known that has to appear in data 3626 3627 Self.First := ASCII.NUL; 3628 Self.Anchored := False; 3629 Self.Must_Have := Program'Last + 1; 3630 Self.Must_Have_Length := 0; 3631 3632 Scan := Program_First; -- First instruction (can be anything) 3633 3634 if Program (Scan) = EXACT then 3635 Self.First := Program (String_Operand (Scan)); 3636 3637 elsif Program (Scan) = BOL 3638 or else Program (Scan) = SBOL 3639 or else Program (Scan) = MBOL 3640 then 3641 Self.Anchored := True; 3642 end if; 3643 end Optimize; 3644 3645 ----------------- 3646 -- Paren_Count -- 3647 ----------------- 3648 3649 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is 3650 begin 3651 return Regexp.Paren_Count; 3652 end Paren_Count; 3653 3654 ----------- 3655 -- Quote -- 3656 ----------- 3657 3658 function Quote (Str : String) return String is 3659 S : String (1 .. Str'Length * 2); 3660 Last : Natural := 0; 3661 3662 begin 3663 for J in Str'Range loop 3664 case Str (J) is 3665 when '^' | '$' | '|' | '*' | '+' | '?' | '{' | 3666 '}' | '[' | ']' | '(' | ')' | '\' | '.' => 3667 3668 S (Last + 1) := '\'; 3669 S (Last + 2) := Str (J); 3670 Last := Last + 2; 3671 3672 when others => 3673 S (Last + 1) := Str (J); 3674 Last := Last + 1; 3675 end case; 3676 end loop; 3677 3678 return S (1 .. Last); 3679 end Quote; 3680 3681 ------------------ 3682 -- Read_Natural -- 3683 ------------------ 3684 3685 function Read_Natural 3686 (Program : Program_Data; 3687 IP : Pointer) return Natural 3688 is 3689 begin 3690 return Character'Pos (Program (IP)) + 3691 256 * Character'Pos (Program (IP + 1)); 3692 end Read_Natural; 3693 3694 ----------------- 3695 -- Reset_Class -- 3696 ----------------- 3697 3698 procedure Reset_Class (Bitmap : out Character_Class) is 3699 begin 3700 Bitmap := (others => 0); 3701 end Reset_Class; 3702 3703 ------------------ 3704 -- Set_In_Class -- 3705 ------------------ 3706 3707 procedure Set_In_Class 3708 (Bitmap : in out Character_Class; 3709 C : Character) 3710 is 3711 Value : constant Class_Byte := Character'Pos (C); 3712 begin 3713 Bitmap (Value / 8) := Bitmap (Value / 8) 3714 or Bit_Conversion (Value mod 8); 3715 end Set_In_Class; 3716 3717 ------------------- 3718 -- String_Length -- 3719 ------------------- 3720 3721 function String_Length 3722 (Program : Program_Data; 3723 P : Pointer) return Program_Size 3724 is 3725 begin 3726 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); 3727 return Character'Pos (Program (P + Next_Pointer_Bytes)); 3728 end String_Length; 3729 3730 -------------------- 3731 -- String_Operand -- 3732 -------------------- 3733 3734 function String_Operand (P : Pointer) return Pointer is 3735 begin 3736 return P + 4; 3737 end String_Operand; 3738 3739end System.Regpat; 3740