1------------------------------------------------------------------------------ 2-- -- 3-- GNAT LIBRARY COMPONENTS -- 4-- -- 5-- G N A T . R E G P A T -- 6-- -- 7-- B o d y -- 8-- -- 9-- Copyright (C) 1986 by University of Toronto. -- 10-- Copyright (C) 1999-2014, AdaCore -- 11-- -- 12-- GNAT is free software; you can redistribute it and/or modify it under -- 13-- terms of the GNU General Public License as published by the Free Soft- -- 14-- ware Foundation; either version 3, or (at your option) any later ver- -- 15-- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- 16-- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- 17-- or FITNESS FOR A PARTICULAR PURPOSE. -- 18-- -- 19-- As a special exception under Section 7 of GPL version 3, you are granted -- 20-- additional permissions described in the GCC Runtime Library Exception, -- 21-- version 3.1, as published by the Free Software Foundation. -- 22-- -- 23-- You should have received a copy of the GNU General Public License and -- 24-- a copy of the GCC Runtime Library Exception along with this program; -- 25-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- 26-- <http://www.gnu.org/licenses/>. -- 27-- -- 28-- GNAT was originally developed by the GNAT team at New York University. -- 29-- Extensive contributions were provided by Ada Core Technologies Inc. -- 30-- -- 31------------------------------------------------------------------------------ 32 33-- This is an altered Ada 95 version of the original V8 style regular 34-- expression library written in C by Henry Spencer. Apart from the 35-- translation to Ada, the interface has been considerably changed to 36-- use the Ada String type instead of C-style nul-terminated strings. 37 38-- Beware that some of this code is subtly aware of the way operator 39-- precedence is structured in regular expressions. Serious changes in 40-- regular-expression syntax might require a total rethink. 41 42with System.IO; use System.IO; 43with Ada.Characters.Handling; use Ada.Characters.Handling; 44with Ada.Unchecked_Conversion; 45 46package body System.Regpat is 47 48 Debug : constant Boolean := False; 49 -- Set to True to activate debug traces. This is normally set to constant 50 -- False to simply delete all the trace code. It is to be edited to True 51 -- for internal debugging of the package. 52 53 ---------------------------- 54 -- Implementation details -- 55 ---------------------------- 56 57 -- This is essentially a linear encoding of a nondeterministic 58 -- finite-state machine, also known as syntax charts or 59 -- "railroad normal form" in parsing technology. 60 61 -- Each node is an opcode plus a "next" pointer, possibly plus an 62 -- operand. "Next" pointers of all nodes except BRANCH implement 63 -- concatenation; a "next" pointer with a BRANCH on both ends of it 64 -- is connecting two alternatives. 65 66 -- The operand of some types of node is a literal string; for others, 67 -- it is a node leading into a sub-FSM. In particular, the operand of 68 -- a BRANCH node is the first node of the branch. 69 -- (NB this is *not* a tree structure: the tail of the branch connects 70 -- to the thing following the set of BRANCHes). 71 72 -- You can see the exact byte-compiled version by using the Dump 73 -- subprogram. However, here are a few examples: 74 75 -- (a|b): 1 : BRANCH (next at 9) 76 -- 4 : EXACT (next at 17) operand=a 77 -- 9 : BRANCH (next at 17) 78 -- 12 : EXACT (next at 17) operand=b 79 -- 17 : EOP (next at 0) 80 -- 81 -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} 82 -- 8 : OPEN 1 (next at 12) 83 -- 12 : EXACT (next at 18) operand=ab 84 -- 18 : CLOSE 1 (next at 22) 85 -- 22 : WHILEM (next at 0) 86 -- 25 : NOTHING (next at 28) 87 -- 28 : EOP (next at 0) 88 89 -- The opcodes are: 90 91 type Opcode is 92 93 -- Name Operand? Meaning 94 95 (EOP, -- no End of program 96 MINMOD, -- no Next operator is not greedy 97 98 -- Classes of characters 99 100 ANY, -- no Match any one character except newline 101 SANY, -- no Match any character, including new line 102 ANYOF, -- class Match any character in this class 103 EXACT, -- str Match this string exactly 104 EXACTF, -- str Match this string (case-folding is one) 105 NOTHING, -- no Match empty string 106 SPACE, -- no Match any whitespace character 107 NSPACE, -- no Match any non-whitespace character 108 DIGIT, -- no Match any numeric character 109 NDIGIT, -- no Match any non-numeric character 110 ALNUM, -- no Match any alphanumeric character 111 NALNUM, -- no Match any non-alphanumeric character 112 113 -- Branches 114 115 BRANCH, -- node Match this alternative, or the next 116 117 -- Simple loops (when the following node is one character in length) 118 119 STAR, -- node Match this simple thing 0 or more times 120 PLUS, -- node Match this simple thing 1 or more times 121 CURLY, -- 2num node Match this simple thing between n and m times. 122 123 -- Complex loops 124 125 CURLYX, -- 2num node Match this complex thing {n,m} times 126 -- The nums are coded on two characters each 127 128 WHILEM, -- no Do curly processing and see if rest matches 129 130 -- Matches after or before a word 131 132 BOL, -- no Match "" at beginning of line 133 MBOL, -- no Same, assuming multiline (match after \n) 134 SBOL, -- no Same, assuming single line (don't match at \n) 135 EOL, -- no Match "" at end of line 136 MEOL, -- no Same, assuming multiline (match before \n) 137 SEOL, -- no Same, assuming single line (don't match at \n) 138 139 BOUND, -- no Match "" at any word boundary 140 NBOUND, -- no Match "" at any word non-boundary 141 142 -- Parenthesis groups handling 143 144 REFF, -- num Match some already matched string, folded 145 OPEN, -- num Mark this point in input as start of #n 146 CLOSE); -- num Analogous to OPEN 147 148 for Opcode'Size use 8; 149 150 -- Opcode notes: 151 152 -- BRANCH 153 -- The set of branches constituting a single choice are hooked 154 -- together with their "next" pointers, since precedence prevents 155 -- anything being concatenated to any individual branch. The 156 -- "next" pointer of the last BRANCH in a choice points to the 157 -- thing following the whole choice. This is also where the 158 -- final "next" pointer of each individual branch points; each 159 -- branch starts with the operand node of a BRANCH node. 160 161 -- STAR,PLUS 162 -- '?', and complex '*' and '+', are implemented with CURLYX. 163 -- branches. Simple cases (one character per match) are implemented with 164 -- STAR and PLUS for speed and to minimize recursive plunges. 165 166 -- OPEN,CLOSE 167 -- ...are numbered at compile time. 168 169 -- EXACT, EXACTF 170 -- There are in fact two arguments, the first one is the length (minus 171 -- one of the string argument), coded on one character, the second 172 -- argument is the string itself, coded on length + 1 characters. 173 174 -- A node is one char of opcode followed by two chars of "next" pointer. 175 -- "Next" pointers are stored as two 8-bit pieces, high order first. The 176 -- value is a positive offset from the opcode of the node containing it. 177 -- An operand, if any, simply follows the node. (Note that much of the 178 -- code generation knows about this implicit relationship.) 179 180 -- Using two bytes for the "next" pointer is vast overkill for most 181 -- things, but allows patterns to get big without disasters. 182 183 Next_Pointer_Bytes : constant := 3; 184 -- Points after the "next pointer" data. An instruction is therefore: 185 -- 1 byte: instruction opcode 186 -- 2 bytes: pointer to next instruction 187 -- * bytes: optional data for the instruction 188 189 ----------------------- 190 -- Character classes -- 191 ----------------------- 192 -- This is the implementation for character classes ([...]) in the 193 -- syntax for regular expressions. Each character (0..256) has an 194 -- entry into the table. This makes for a very fast matching 195 -- algorithm. 196 197 type Class_Byte is mod 256; 198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; 199 200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; 201 Bit_Conversion : constant Bit_Conversion_Array := 202 (1, 2, 4, 8, 16, 32, 64, 128); 203 204 type Std_Class is (ANYOF_NONE, 205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] 206 ANYOF_NALNUM, 207 ANYOF_SPACE, -- Space class [ \t\n\r\f] 208 ANYOF_NSPACE, 209 ANYOF_DIGIT, -- Digit class [0-9] 210 ANYOF_NDIGIT, 211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] 212 ANYOF_NALNUMC, 213 ANYOF_ALPHA, -- Alpha class [a-zA-Z] 214 ANYOF_NALPHA, 215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127 216 ANYOF_NASCII, 217 ANYOF_CNTRL, -- Control class 218 ANYOF_NCNTRL, 219 ANYOF_GRAPH, -- Graphic class 220 ANYOF_NGRAPH, 221 ANYOF_LOWER, -- Lower case class [a-z] 222 ANYOF_NLOWER, 223 ANYOF_PRINT, -- printable class 224 ANYOF_NPRINT, 225 ANYOF_PUNCT, -- 226 ANYOF_NPUNCT, 227 ANYOF_UPPER, -- Upper case class [A-Z] 228 ANYOF_NUPPER, 229 ANYOF_XDIGIT, -- Hexadecimal digit 230 ANYOF_NXDIGIT 231 ); 232 233 procedure Set_In_Class 234 (Bitmap : in out Character_Class; 235 C : Character); 236 -- Set the entry to True for C in the class Bitmap 237 238 function Get_From_Class 239 (Bitmap : Character_Class; 240 C : Character) return Boolean; 241 -- Return True if the entry is set for C in the class Bitmap 242 243 procedure Reset_Class (Bitmap : out Character_Class); 244 -- Clear all the entries in the class Bitmap 245 246 pragma Inline (Set_In_Class); 247 pragma Inline (Get_From_Class); 248 pragma Inline (Reset_Class); 249 250 ----------------------- 251 -- Local Subprograms -- 252 ----------------------- 253 254 function "=" (Left : Character; Right : Opcode) return Boolean; 255 256 function Is_Alnum (C : Character) return Boolean; 257 -- Return True if C is an alphanum character or an underscore ('_') 258 259 function Is_White_Space (C : Character) return Boolean; 260 -- Return True if C is a whitespace character 261 262 function Is_Printable (C : Character) return Boolean; 263 -- Return True if C is a printable character 264 265 function Operand (P : Pointer) return Pointer; 266 -- Return a pointer to the first operand of the node at P 267 268 function String_Length 269 (Program : Program_Data; 270 P : Pointer) return Program_Size; 271 -- Return the length of the string argument of the node at P 272 273 function String_Operand (P : Pointer) return Pointer; 274 -- Return a pointer to the string argument of the node at P 275 276 procedure Bitmap_Operand 277 (Program : Program_Data; 278 P : Pointer; 279 Op : out Character_Class); 280 -- Return a pointer to the string argument of the node at P 281 282 function Get_Next 283 (Program : Program_Data; 284 IP : Pointer) return Pointer; 285 -- Dig the next instruction pointer out of a node 286 287 procedure Optimize (Self : in out Pattern_Matcher); 288 -- Optimize a Pattern_Matcher by noting certain special cases 289 290 function Read_Natural 291 (Program : Program_Data; 292 IP : Pointer) return Natural; 293 -- Return the 2-byte natural coded at position IP 294 295 -- All of the subprograms above are tiny and should be inlined 296 297 pragma Inline ("="); 298 pragma Inline (Is_Alnum); 299 pragma Inline (Is_White_Space); 300 pragma Inline (Get_Next); 301 pragma Inline (Operand); 302 pragma Inline (Read_Natural); 303 pragma Inline (String_Length); 304 pragma Inline (String_Operand); 305 306 type Expression_Flags is record 307 Has_Width, -- Known never to match null string 308 Simple, -- Simple enough to be STAR/PLUS operand 309 SP_Start : Boolean; -- Starts with * or + 310 end record; 311 312 Worst_Expression : constant Expression_Flags := (others => False); 313 -- Worst case 314 315 procedure Dump_Until 316 (Program : Program_Data; 317 Index : in out Pointer; 318 Till : Pointer; 319 Indent : Natural; 320 Do_Print : Boolean := True); 321 -- Dump the program until the node Till (not included) is met. Every line 322 -- is indented with Index spaces at the beginning Dumps till the end if 323 -- Till is 0. 324 325 procedure Dump_Operation 326 (Program : Program_Data; 327 Index : Pointer; 328 Indent : Natural); 329 -- Same as above, but only dumps a single operation, and compute its 330 -- indentation from the program. 331 332 --------- 333 -- "=" -- 334 --------- 335 336 function "=" (Left : Character; Right : Opcode) return Boolean is 337 begin 338 return Character'Pos (Left) = Opcode'Pos (Right); 339 end "="; 340 341 -------------------- 342 -- Bitmap_Operand -- 343 -------------------- 344 345 procedure Bitmap_Operand 346 (Program : Program_Data; 347 P : Pointer; 348 Op : out Character_Class) 349 is 350 function Convert is new Ada.Unchecked_Conversion 351 (Program_Data, Character_Class); 352 353 begin 354 Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); 355 end Bitmap_Operand; 356 357 ------------- 358 -- Compile -- 359 ------------- 360 361 procedure Compile 362 (Matcher : out Pattern_Matcher; 363 Expression : String; 364 Final_Code_Size : out Program_Size; 365 Flags : Regexp_Flags := No_Flags) 366 is 367 -- We can't allocate space until we know how big the compiled form 368 -- will be, but we can't compile it (and thus know how big it is) 369 -- until we've got a place to put the code. So we cheat: we compile 370 -- it twice, once with code generation turned off and size counting 371 -- turned on, and once "for real". 372 373 -- This also means that we don't allocate space until we are sure 374 -- that the thing really will compile successfully, and we never 375 -- have to move the code and thus invalidate pointers into it. 376 377 -- Beware that the optimization-preparation code in here knows 378 -- about some of the structure of the compiled regexp. 379 380 PM : Pattern_Matcher renames Matcher; 381 Program : Program_Data renames PM.Program; 382 383 Emit_Ptr : Pointer := Program_First; 384 385 Parse_Pos : Natural := Expression'First; -- Input-scan pointer 386 Parse_End : constant Natural := Expression'Last; 387 388 ---------------------------- 389 -- Subprograms for Create -- 390 ---------------------------- 391 392 procedure Emit (B : Character); 393 -- Output the Character B to the Program. If code-generation is 394 -- disabled, simply increments the program counter. 395 396 function Emit_Node (Op : Opcode) return Pointer; 397 -- If code-generation is enabled, Emit_Node outputs the 398 -- opcode Op and reserves space for a pointer to the next node. 399 -- Return value is the location of new opcode, i.e. old Emit_Ptr. 400 401 procedure Emit_Natural (IP : Pointer; N : Natural); 402 -- Split N on two characters at position IP 403 404 procedure Emit_Class (Bitmap : Character_Class); 405 -- Emits a character class 406 407 procedure Case_Emit (C : Character); 408 -- Emit C, after converting is to lower-case if the regular 409 -- expression is case insensitive. 410 411 procedure Parse 412 (Parenthesized : Boolean; 413 Capturing : Boolean; 414 Flags : out Expression_Flags; 415 IP : out Pointer); 416 -- Parse regular expression, i.e. main body or parenthesized thing 417 -- Caller must absorb opening parenthesis. Capturing should be set to 418 -- True when we have an open parenthesis from which we want the user 419 -- to extra text. 420 421 procedure Parse_Branch 422 (Flags : out Expression_Flags; 423 First : Boolean; 424 IP : out Pointer); 425 -- Implements the concatenation operator and handles '|' 426 -- First should be true if this is the first item of the alternative. 427 428 procedure Parse_Piece 429 (Expr_Flags : out Expression_Flags; 430 IP : out Pointer); 431 -- Parse something followed by possible [*+?] 432 433 procedure Parse_Atom 434 (Expr_Flags : out Expression_Flags; 435 IP : out Pointer); 436 -- Parse_Atom is the lowest level parse procedure. 437 -- 438 -- Optimization: Gobbles an entire sequence of ordinary characters so 439 -- that it can turn them into a single node, which is smaller to store 440 -- and faster to run. Backslashed characters are exceptions, each 441 -- becoming a separate node; the code is simpler that way and it's 442 -- not worth fixing. 443 444 procedure Insert_Operator 445 (Op : Opcode; 446 Operand : Pointer; 447 Greedy : Boolean := True); 448 -- Insert_Operator inserts an operator in front of an already-emitted 449 -- operand and relocates the operand. This applies to PLUS and STAR. 450 -- If Minmod is True, then the operator is non-greedy. 451 452 function Insert_Operator_Before 453 (Op : Opcode; 454 Operand : Pointer; 455 Greedy : Boolean; 456 Opsize : Pointer) return Pointer; 457 -- Insert an operator before Operand (and move the latter forward in the 458 -- program). Opsize is the size needed to represent the operator. This 459 -- returns the position at which the operator was inserted, and moves 460 -- Emit_Ptr after the new position of the operand. 461 462 procedure Insert_Curly_Operator 463 (Op : Opcode; 464 Min : Natural; 465 Max : Natural; 466 Operand : Pointer; 467 Greedy : Boolean := True); 468 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). 469 -- If Minmod is True, then the operator is non-greedy. 470 471 procedure Link_Tail (P, Val : Pointer); 472 -- Link_Tail sets the next-pointer at the end of a node chain 473 474 procedure Link_Operand_Tail (P, Val : Pointer); 475 -- Link_Tail on operand of first argument; noop if operand-less 476 477 procedure Fail (M : String); 478 pragma No_Return (Fail); 479 -- Fail with a diagnostic message, if possible 480 481 function Is_Curly_Operator (IP : Natural) return Boolean; 482 -- Return True if IP is looking at a '{' that is the beginning 483 -- of a curly operator, i.e. it matches {\d+,?\d*} 484 485 function Is_Mult (IP : Natural) return Boolean; 486 -- Return True if C is a regexp multiplier: '+', '*' or '?' 487 488 procedure Get_Curly_Arguments 489 (IP : Natural; 490 Min : out Natural; 491 Max : out Natural; 492 Greedy : out Boolean); 493 -- Parse the argument list for a curly operator. 494 -- It is assumed that IP is indeed pointing at a valid operator. 495 -- So what is IP and how come IP is not referenced in the body ??? 496 497 procedure Parse_Character_Class (IP : out Pointer); 498 -- Parse a character class. 499 -- The calling subprogram should consume the opening '[' before. 500 501 procedure Parse_Literal 502 (Expr_Flags : out Expression_Flags; 503 IP : out Pointer); 504 -- Parse_Literal encodes a string of characters to be matched exactly 505 506 function Parse_Posix_Character_Class return Std_Class; 507 -- Parse a posix character class, like [:alpha:] or [:^alpha:]. 508 -- The caller is supposed to absorb the opening [. 509 510 pragma Inline (Is_Mult); 511 pragma Inline (Emit_Natural); 512 pragma Inline (Parse_Character_Class); -- since used only once 513 514 --------------- 515 -- Case_Emit -- 516 --------------- 517 518 procedure Case_Emit (C : Character) is 519 begin 520 if (Flags and Case_Insensitive) /= 0 then 521 Emit (To_Lower (C)); 522 523 else 524 -- Dump current character 525 526 Emit (C); 527 end if; 528 end Case_Emit; 529 530 ---------- 531 -- Emit -- 532 ---------- 533 534 procedure Emit (B : Character) is 535 begin 536 if Emit_Ptr <= PM.Size then 537 Program (Emit_Ptr) := B; 538 end if; 539 540 Emit_Ptr := Emit_Ptr + 1; 541 end Emit; 542 543 ---------------- 544 -- Emit_Class -- 545 ---------------- 546 547 procedure Emit_Class (Bitmap : Character_Class) is 548 subtype Program31 is Program_Data (0 .. 31); 549 550 function Convert is new Ada.Unchecked_Conversion 551 (Character_Class, Program31); 552 553 begin 554 -- What is the mysterious constant 31 here??? Can't it be expressed 555 -- symbolically (size of integer - 1 or some such???). In any case 556 -- it should be declared as a constant (and referenced presumably 557 -- as this constant + 1 below. 558 559 if Emit_Ptr + 31 <= PM.Size then 560 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); 561 end if; 562 563 Emit_Ptr := Emit_Ptr + 32; 564 end Emit_Class; 565 566 ------------------ 567 -- Emit_Natural -- 568 ------------------ 569 570 procedure Emit_Natural (IP : Pointer; N : Natural) is 571 begin 572 if IP + 1 <= PM.Size then 573 Program (IP + 1) := Character'Val (N / 256); 574 Program (IP) := Character'Val (N mod 256); 575 end if; 576 end Emit_Natural; 577 578 --------------- 579 -- Emit_Node -- 580 --------------- 581 582 function Emit_Node (Op : Opcode) return Pointer is 583 Result : constant Pointer := Emit_Ptr; 584 585 begin 586 if Emit_Ptr + 2 <= PM.Size then 587 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); 588 Program (Emit_Ptr + 1) := ASCII.NUL; 589 Program (Emit_Ptr + 2) := ASCII.NUL; 590 end if; 591 592 Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; 593 return Result; 594 end Emit_Node; 595 596 ---------- 597 -- Fail -- 598 ---------- 599 600 procedure Fail (M : String) is 601 begin 602 raise Expression_Error with M; 603 end Fail; 604 605 ------------------------- 606 -- Get_Curly_Arguments -- 607 ------------------------- 608 609 procedure Get_Curly_Arguments 610 (IP : Natural; 611 Min : out Natural; 612 Max : out Natural; 613 Greedy : out Boolean) 614 is 615 pragma Unreferenced (IP); 616 617 Save_Pos : Natural := Parse_Pos + 1; 618 619 begin 620 Min := 0; 621 Max := Max_Curly_Repeat; 622 623 while Expression (Parse_Pos) /= '}' 624 and then Expression (Parse_Pos) /= ',' 625 loop 626 Parse_Pos := Parse_Pos + 1; 627 end loop; 628 629 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 630 631 if Expression (Parse_Pos) = ',' then 632 Save_Pos := Parse_Pos + 1; 633 while Expression (Parse_Pos) /= '}' loop 634 Parse_Pos := Parse_Pos + 1; 635 end loop; 636 637 if Save_Pos /= Parse_Pos then 638 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); 639 end if; 640 641 else 642 Max := Min; 643 end if; 644 645 if Parse_Pos < Expression'Last 646 and then Expression (Parse_Pos + 1) = '?' 647 then 648 Greedy := False; 649 Parse_Pos := Parse_Pos + 1; 650 651 else 652 Greedy := True; 653 end if; 654 end Get_Curly_Arguments; 655 656 --------------------------- 657 -- Insert_Curly_Operator -- 658 --------------------------- 659 660 procedure Insert_Curly_Operator 661 (Op : Opcode; 662 Min : Natural; 663 Max : Natural; 664 Operand : Pointer; 665 Greedy : Boolean := True) 666 is 667 Old : Pointer; 668 begin 669 Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); 670 Emit_Natural (Old + Next_Pointer_Bytes, Min); 671 Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); 672 end Insert_Curly_Operator; 673 674 ---------------------------- 675 -- Insert_Operator_Before -- 676 ---------------------------- 677 678 function Insert_Operator_Before 679 (Op : Opcode; 680 Operand : Pointer; 681 Greedy : Boolean; 682 Opsize : Pointer) return Pointer 683 is 684 Dest : constant Pointer := Emit_Ptr; 685 Old : Pointer; 686 Size : Pointer := Opsize; 687 688 begin 689 -- If not greedy, we have to emit another opcode first 690 691 if not Greedy then 692 Size := Size + Next_Pointer_Bytes; 693 end if; 694 695 -- Move the operand in the byte-compilation, so that we can insert 696 -- the operator before it. 697 698 if Emit_Ptr + Size <= PM.Size then 699 Program (Operand + Size .. Emit_Ptr + Size) := 700 Program (Operand .. Emit_Ptr); 701 end if; 702 703 -- Insert the operator at the position previously occupied by the 704 -- operand. 705 706 Emit_Ptr := Operand; 707 708 if not Greedy then 709 Old := Emit_Node (MINMOD); 710 Link_Tail (Old, Old + Next_Pointer_Bytes); 711 end if; 712 713 Old := Emit_Node (Op); 714 Emit_Ptr := Dest + Size; 715 return Old; 716 end Insert_Operator_Before; 717 718 --------------------- 719 -- Insert_Operator -- 720 --------------------- 721 722 procedure Insert_Operator 723 (Op : Opcode; 724 Operand : Pointer; 725 Greedy : Boolean := True) 726 is 727 Discard : Pointer; 728 pragma Warnings (Off, Discard); 729 begin 730 Discard := Insert_Operator_Before 731 (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); 732 end Insert_Operator; 733 734 ----------------------- 735 -- Is_Curly_Operator -- 736 ----------------------- 737 738 function Is_Curly_Operator (IP : Natural) return Boolean is 739 Scan : Natural := IP; 740 741 begin 742 if Expression (Scan) /= '{' 743 or else Scan + 2 > Expression'Last 744 or else not Is_Digit (Expression (Scan + 1)) 745 then 746 return False; 747 end if; 748 749 Scan := Scan + 1; 750 751 -- The first digit 752 753 loop 754 Scan := Scan + 1; 755 756 if Scan > Expression'Last then 757 return False; 758 end if; 759 760 exit when not Is_Digit (Expression (Scan)); 761 end loop; 762 763 if Expression (Scan) = ',' then 764 loop 765 Scan := Scan + 1; 766 767 if Scan > Expression'Last then 768 return False; 769 end if; 770 771 exit when not Is_Digit (Expression (Scan)); 772 end loop; 773 end if; 774 775 return Expression (Scan) = '}'; 776 end Is_Curly_Operator; 777 778 ------------- 779 -- Is_Mult -- 780 ------------- 781 782 function Is_Mult (IP : Natural) return Boolean is 783 C : constant Character := Expression (IP); 784 785 begin 786 return C = '*' 787 or else C = '+' 788 or else C = '?' 789 or else (C = '{' and then Is_Curly_Operator (IP)); 790 end Is_Mult; 791 792 ----------------------- 793 -- Link_Operand_Tail -- 794 ----------------------- 795 796 procedure Link_Operand_Tail (P, Val : Pointer) is 797 begin 798 if P <= PM.Size and then Program (P) = BRANCH then 799 Link_Tail (Operand (P), Val); 800 end if; 801 end Link_Operand_Tail; 802 803 --------------- 804 -- Link_Tail -- 805 --------------- 806 807 procedure Link_Tail (P, Val : Pointer) is 808 Scan : Pointer; 809 Temp : Pointer; 810 Offset : Pointer; 811 812 begin 813 -- Find last node (the size of the pattern matcher might be too 814 -- small, so don't try to read past its end). 815 816 Scan := P; 817 while Scan + Next_Pointer_Bytes <= PM.Size loop 818 Temp := Get_Next (Program, Scan); 819 exit when Temp = Scan; 820 Scan := Temp; 821 end loop; 822 823 Offset := Val - Scan; 824 825 Emit_Natural (Scan + 1, Natural (Offset)); 826 end Link_Tail; 827 828 ----------- 829 -- Parse -- 830 ----------- 831 832 -- Combining parenthesis handling with the base level of regular 833 -- expression is a trifle forced, but the need to tie the tails of the 834 -- the branches to what follows makes it hard to avoid. 835 836 procedure Parse 837 (Parenthesized : Boolean; 838 Capturing : Boolean; 839 Flags : out Expression_Flags; 840 IP : out Pointer) 841 is 842 E : String renames Expression; 843 Br, Br2 : Pointer; 844 Ender : Pointer; 845 Par_No : Natural; 846 New_Flags : Expression_Flags; 847 Have_Branch : Boolean := False; 848 849 begin 850 Flags := (Has_Width => True, others => False); -- Tentatively 851 852 -- Make an OPEN node, if parenthesized 853 854 if Parenthesized and then Capturing then 855 if Matcher.Paren_Count > Max_Paren_Count then 856 Fail ("too many ()"); 857 end if; 858 859 Par_No := Matcher.Paren_Count + 1; 860 Matcher.Paren_Count := Matcher.Paren_Count + 1; 861 IP := Emit_Node (OPEN); 862 Emit (Character'Val (Par_No)); 863 else 864 IP := 0; 865 Par_No := 0; 866 end if; 867 868 -- Pick up the branches, linking them together 869 870 Parse_Branch (New_Flags, True, Br); 871 872 if Br = 0 then 873 IP := 0; 874 return; 875 end if; 876 877 if Parse_Pos <= Parse_End 878 and then E (Parse_Pos) = '|' 879 then 880 Insert_Operator (BRANCH, Br); 881 Have_Branch := True; 882 end if; 883 884 if IP /= 0 then 885 Link_Tail (IP, Br); -- OPEN -> first 886 else 887 IP := Br; 888 end if; 889 890 if not New_Flags.Has_Width then 891 Flags.Has_Width := False; 892 end if; 893 894 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 895 896 while Parse_Pos <= Parse_End 897 and then (E (Parse_Pos) = '|') 898 loop 899 Parse_Pos := Parse_Pos + 1; 900 Parse_Branch (New_Flags, False, Br); 901 902 if Br = 0 then 903 IP := 0; 904 return; 905 end if; 906 907 Link_Tail (IP, Br); -- BRANCH -> BRANCH 908 909 if not New_Flags.Has_Width then 910 Flags.Has_Width := False; 911 end if; 912 913 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 914 end loop; 915 916 -- Make a closing node, and hook it on the end 917 918 if Parenthesized then 919 if Capturing then 920 Ender := Emit_Node (CLOSE); 921 Emit (Character'Val (Par_No)); 922 Link_Tail (IP, Ender); 923 924 else 925 -- Need to keep looking after the closing parenthesis 926 Ender := Emit_Ptr; 927 end if; 928 929 else 930 Ender := Emit_Node (EOP); 931 Link_Tail (IP, Ender); 932 end if; 933 934 if Have_Branch and then Emit_Ptr <= PM.Size + 1 then 935 936 -- Hook the tails of the branches to the closing node 937 938 Br := IP; 939 loop 940 Link_Operand_Tail (Br, Ender); 941 Br2 := Get_Next (Program, Br); 942 exit when Br2 = Br; 943 Br := Br2; 944 end loop; 945 end if; 946 947 -- Check for proper termination 948 949 if Parenthesized then 950 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then 951 Fail ("unmatched ()"); 952 end if; 953 954 Parse_Pos := Parse_Pos + 1; 955 956 elsif Parse_Pos <= Parse_End then 957 if E (Parse_Pos) = ')' then 958 Fail ("unmatched ')'"); 959 else 960 Fail ("junk on end"); -- "Can't happen" 961 end if; 962 end if; 963 end Parse; 964 965 ---------------- 966 -- Parse_Atom -- 967 ---------------- 968 969 procedure Parse_Atom 970 (Expr_Flags : out Expression_Flags; 971 IP : out Pointer) 972 is 973 C : Character; 974 975 begin 976 -- Tentatively set worst expression case 977 978 Expr_Flags := Worst_Expression; 979 980 C := Expression (Parse_Pos); 981 Parse_Pos := Parse_Pos + 1; 982 983 case (C) is 984 when '^' => 985 IP := 986 Emit_Node 987 (if (Flags and Multiple_Lines) /= 0 then MBOL 988 elsif (Flags and Single_Line) /= 0 then SBOL 989 else BOL); 990 991 when '$' => 992 IP := 993 Emit_Node 994 (if (Flags and Multiple_Lines) /= 0 then MEOL 995 elsif (Flags and Single_Line) /= 0 then SEOL 996 else EOL); 997 998 when '.' => 999 IP := 1000 Emit_Node 1001 (if (Flags and Single_Line) /= 0 then SANY else ANY); 1002 1003 Expr_Flags.Has_Width := True; 1004 Expr_Flags.Simple := True; 1005 1006 when '[' => 1007 Parse_Character_Class (IP); 1008 Expr_Flags.Has_Width := True; 1009 Expr_Flags.Simple := True; 1010 1011 when '(' => 1012 declare 1013 New_Flags : Expression_Flags; 1014 1015 begin 1016 if Parse_Pos <= Parse_End - 1 1017 and then Expression (Parse_Pos) = '?' 1018 and then Expression (Parse_Pos + 1) = ':' 1019 then 1020 Parse_Pos := Parse_Pos + 2; 1021 1022 -- Non-capturing parenthesis 1023 1024 Parse (True, False, New_Flags, IP); 1025 1026 else 1027 -- Capturing parenthesis 1028 1029 Parse (True, True, New_Flags, IP); 1030 Expr_Flags.Has_Width := 1031 Expr_Flags.Has_Width or else New_Flags.Has_Width; 1032 Expr_Flags.SP_Start := 1033 Expr_Flags.SP_Start or else New_Flags.SP_Start; 1034 if IP = 0 then 1035 return; 1036 end if; 1037 end if; 1038 end; 1039 1040 when '|' | ASCII.LF | ')' => 1041 Fail ("internal urp"); -- Supposed to be caught earlier 1042 1043 when '?' | '+' | '*' => 1044 Fail (C & " follows nothing"); 1045 1046 when '{' => 1047 if Is_Curly_Operator (Parse_Pos - 1) then 1048 Fail (C & " follows nothing"); 1049 else 1050 Parse_Literal (Expr_Flags, IP); 1051 end if; 1052 1053 when '\' => 1054 if Parse_Pos > Parse_End then 1055 Fail ("trailing \"); 1056 end if; 1057 1058 Parse_Pos := Parse_Pos + 1; 1059 1060 case Expression (Parse_Pos - 1) is 1061 when 'b' => 1062 IP := Emit_Node (BOUND); 1063 1064 when 'B' => 1065 IP := Emit_Node (NBOUND); 1066 1067 when 's' => 1068 IP := Emit_Node (SPACE); 1069 Expr_Flags.Simple := True; 1070 Expr_Flags.Has_Width := True; 1071 1072 when 'S' => 1073 IP := Emit_Node (NSPACE); 1074 Expr_Flags.Simple := True; 1075 Expr_Flags.Has_Width := True; 1076 1077 when 'd' => 1078 IP := Emit_Node (DIGIT); 1079 Expr_Flags.Simple := True; 1080 Expr_Flags.Has_Width := True; 1081 1082 when 'D' => 1083 IP := Emit_Node (NDIGIT); 1084 Expr_Flags.Simple := True; 1085 Expr_Flags.Has_Width := True; 1086 1087 when 'w' => 1088 IP := Emit_Node (ALNUM); 1089 Expr_Flags.Simple := True; 1090 Expr_Flags.Has_Width := True; 1091 1092 when 'W' => 1093 IP := Emit_Node (NALNUM); 1094 Expr_Flags.Simple := True; 1095 Expr_Flags.Has_Width := True; 1096 1097 when 'A' => 1098 IP := Emit_Node (SBOL); 1099 1100 when 'G' => 1101 IP := Emit_Node (SEOL); 1102 1103 when '0' .. '9' => 1104 IP := Emit_Node (REFF); 1105 1106 declare 1107 Save : constant Natural := Parse_Pos - 1; 1108 1109 begin 1110 while Parse_Pos <= Expression'Last 1111 and then Is_Digit (Expression (Parse_Pos)) 1112 loop 1113 Parse_Pos := Parse_Pos + 1; 1114 end loop; 1115 1116 Emit (Character'Val (Natural'Value 1117 (Expression (Save .. Parse_Pos - 1)))); 1118 end; 1119 1120 when others => 1121 Parse_Pos := Parse_Pos - 1; 1122 Parse_Literal (Expr_Flags, IP); 1123 end case; 1124 1125 when others => 1126 Parse_Literal (Expr_Flags, IP); 1127 end case; 1128 end Parse_Atom; 1129 1130 ------------------ 1131 -- Parse_Branch -- 1132 ------------------ 1133 1134 procedure Parse_Branch 1135 (Flags : out Expression_Flags; 1136 First : Boolean; 1137 IP : out Pointer) 1138 is 1139 E : String renames Expression; 1140 Chain : Pointer; 1141 Last : Pointer; 1142 New_Flags : Expression_Flags; 1143 1144 Discard : Pointer; 1145 pragma Warnings (Off, Discard); 1146 1147 begin 1148 Flags := Worst_Expression; -- Tentatively 1149 IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); 1150 1151 Chain := 0; 1152 while Parse_Pos <= Parse_End 1153 and then E (Parse_Pos) /= ')' 1154 and then E (Parse_Pos) /= ASCII.LF 1155 and then E (Parse_Pos) /= '|' 1156 loop 1157 Parse_Piece (New_Flags, Last); 1158 1159 if Last = 0 then 1160 IP := 0; 1161 return; 1162 end if; 1163 1164 Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; 1165 1166 if Chain = 0 then -- First piece 1167 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; 1168 else 1169 Link_Tail (Chain, Last); 1170 end if; 1171 1172 Chain := Last; 1173 end loop; 1174 1175 -- Case where loop ran zero CURLY 1176 1177 if Chain = 0 then 1178 Discard := Emit_Node (NOTHING); 1179 end if; 1180 end Parse_Branch; 1181 1182 --------------------------- 1183 -- Parse_Character_Class -- 1184 --------------------------- 1185 1186 procedure Parse_Character_Class (IP : out Pointer) is 1187 Bitmap : Character_Class; 1188 Invert : Boolean := False; 1189 In_Range : Boolean := False; 1190 Named_Class : Std_Class := ANYOF_NONE; 1191 Value : Character; 1192 Last_Value : Character := ASCII.NUL; 1193 1194 begin 1195 Reset_Class (Bitmap); 1196 1197 -- Do we have an invert character class ? 1198 1199 if Parse_Pos <= Parse_End 1200 and then Expression (Parse_Pos) = '^' 1201 then 1202 Invert := True; 1203 Parse_Pos := Parse_Pos + 1; 1204 end if; 1205 1206 -- First character can be ] or - without closing the class 1207 1208 if Parse_Pos <= Parse_End 1209 and then (Expression (Parse_Pos) = ']' 1210 or else Expression (Parse_Pos) = '-') 1211 then 1212 Set_In_Class (Bitmap, Expression (Parse_Pos)); 1213 Parse_Pos := Parse_Pos + 1; 1214 end if; 1215 1216 -- While we don't have the end of the class 1217 1218 while Parse_Pos <= Parse_End 1219 and then Expression (Parse_Pos) /= ']' 1220 loop 1221 Named_Class := ANYOF_NONE; 1222 Value := Expression (Parse_Pos); 1223 Parse_Pos := Parse_Pos + 1; 1224 1225 -- Do we have a Posix character class 1226 if Value = '[' then 1227 Named_Class := Parse_Posix_Character_Class; 1228 1229 elsif Value = '\' then 1230 if Parse_Pos = Parse_End then 1231 Fail ("Trailing \"); 1232 end if; 1233 Value := Expression (Parse_Pos); 1234 Parse_Pos := Parse_Pos + 1; 1235 1236 case Value is 1237 when 'w' => Named_Class := ANYOF_ALNUM; 1238 when 'W' => Named_Class := ANYOF_NALNUM; 1239 when 's' => Named_Class := ANYOF_SPACE; 1240 when 'S' => Named_Class := ANYOF_NSPACE; 1241 when 'd' => Named_Class := ANYOF_DIGIT; 1242 when 'D' => Named_Class := ANYOF_NDIGIT; 1243 when 'n' => Value := ASCII.LF; 1244 when 'r' => Value := ASCII.CR; 1245 when 't' => Value := ASCII.HT; 1246 when 'f' => Value := ASCII.FF; 1247 when 'e' => Value := ASCII.ESC; 1248 when 'a' => Value := ASCII.BEL; 1249 1250 -- when 'x' => ??? hexadecimal value 1251 -- when 'c' => ??? control character 1252 -- when '0'..'9' => ??? octal character 1253 1254 when others => null; 1255 end case; 1256 end if; 1257 1258 -- Do we have a character class? 1259 1260 if Named_Class /= ANYOF_NONE then 1261 1262 -- A range like 'a-\d' or 'a-[:digit:] is not a range 1263 1264 if In_Range then 1265 Set_In_Class (Bitmap, Last_Value); 1266 Set_In_Class (Bitmap, '-'); 1267 In_Range := False; 1268 end if; 1269 1270 -- Expand the range 1271 1272 case Named_Class is 1273 when ANYOF_NONE => null; 1274 1275 when ANYOF_ALNUM | ANYOF_ALNUMC => 1276 for Value in Class_Byte'Range loop 1277 if Is_Alnum (Character'Val (Value)) then 1278 Set_In_Class (Bitmap, Character'Val (Value)); 1279 end if; 1280 end loop; 1281 1282 when ANYOF_NALNUM | ANYOF_NALNUMC => 1283 for Value in Class_Byte'Range loop 1284 if not Is_Alnum (Character'Val (Value)) then 1285 Set_In_Class (Bitmap, Character'Val (Value)); 1286 end if; 1287 end loop; 1288 1289 when ANYOF_SPACE => 1290 for Value in Class_Byte'Range loop 1291 if Is_White_Space (Character'Val (Value)) then 1292 Set_In_Class (Bitmap, Character'Val (Value)); 1293 end if; 1294 end loop; 1295 1296 when ANYOF_NSPACE => 1297 for Value in Class_Byte'Range loop 1298 if not Is_White_Space (Character'Val (Value)) then 1299 Set_In_Class (Bitmap, Character'Val (Value)); 1300 end if; 1301 end loop; 1302 1303 when ANYOF_DIGIT => 1304 for Value in Class_Byte'Range loop 1305 if Is_Digit (Character'Val (Value)) then 1306 Set_In_Class (Bitmap, Character'Val (Value)); 1307 end if; 1308 end loop; 1309 1310 when ANYOF_NDIGIT => 1311 for Value in Class_Byte'Range loop 1312 if not Is_Digit (Character'Val (Value)) then 1313 Set_In_Class (Bitmap, Character'Val (Value)); 1314 end if; 1315 end loop; 1316 1317 when ANYOF_ALPHA => 1318 for Value in Class_Byte'Range loop 1319 if Is_Letter (Character'Val (Value)) then 1320 Set_In_Class (Bitmap, Character'Val (Value)); 1321 end if; 1322 end loop; 1323 1324 when ANYOF_NALPHA => 1325 for Value in Class_Byte'Range loop 1326 if not Is_Letter (Character'Val (Value)) then 1327 Set_In_Class (Bitmap, Character'Val (Value)); 1328 end if; 1329 end loop; 1330 1331 when ANYOF_ASCII => 1332 for Value in 0 .. 127 loop 1333 Set_In_Class (Bitmap, Character'Val (Value)); 1334 end loop; 1335 1336 when ANYOF_NASCII => 1337 for Value in 128 .. 255 loop 1338 Set_In_Class (Bitmap, Character'Val (Value)); 1339 end loop; 1340 1341 when ANYOF_CNTRL => 1342 for Value in Class_Byte'Range loop 1343 if Is_Control (Character'Val (Value)) then 1344 Set_In_Class (Bitmap, Character'Val (Value)); 1345 end if; 1346 end loop; 1347 1348 when ANYOF_NCNTRL => 1349 for Value in Class_Byte'Range loop 1350 if not Is_Control (Character'Val (Value)) then 1351 Set_In_Class (Bitmap, Character'Val (Value)); 1352 end if; 1353 end loop; 1354 1355 when ANYOF_GRAPH => 1356 for Value in Class_Byte'Range loop 1357 if Is_Graphic (Character'Val (Value)) then 1358 Set_In_Class (Bitmap, Character'Val (Value)); 1359 end if; 1360 end loop; 1361 1362 when ANYOF_NGRAPH => 1363 for Value in Class_Byte'Range loop 1364 if not Is_Graphic (Character'Val (Value)) then 1365 Set_In_Class (Bitmap, Character'Val (Value)); 1366 end if; 1367 end loop; 1368 1369 when ANYOF_LOWER => 1370 for Value in Class_Byte'Range loop 1371 if Is_Lower (Character'Val (Value)) then 1372 Set_In_Class (Bitmap, Character'Val (Value)); 1373 end if; 1374 end loop; 1375 1376 when ANYOF_NLOWER => 1377 for Value in Class_Byte'Range loop 1378 if not Is_Lower (Character'Val (Value)) then 1379 Set_In_Class (Bitmap, Character'Val (Value)); 1380 end if; 1381 end loop; 1382 1383 when ANYOF_PRINT => 1384 for Value in Class_Byte'Range loop 1385 if Is_Printable (Character'Val (Value)) then 1386 Set_In_Class (Bitmap, Character'Val (Value)); 1387 end if; 1388 end loop; 1389 1390 when ANYOF_NPRINT => 1391 for Value in Class_Byte'Range loop 1392 if not Is_Printable (Character'Val (Value)) then 1393 Set_In_Class (Bitmap, Character'Val (Value)); 1394 end if; 1395 end loop; 1396 1397 when ANYOF_PUNCT => 1398 for Value in Class_Byte'Range loop 1399 if Is_Printable (Character'Val (Value)) 1400 and then not Is_White_Space (Character'Val (Value)) 1401 and then not Is_Alnum (Character'Val (Value)) 1402 then 1403 Set_In_Class (Bitmap, Character'Val (Value)); 1404 end if; 1405 end loop; 1406 1407 when ANYOF_NPUNCT => 1408 for Value in Class_Byte'Range loop 1409 if not Is_Printable (Character'Val (Value)) 1410 or else Is_White_Space (Character'Val (Value)) 1411 or else Is_Alnum (Character'Val (Value)) 1412 then 1413 Set_In_Class (Bitmap, Character'Val (Value)); 1414 end if; 1415 end loop; 1416 1417 when ANYOF_UPPER => 1418 for Value in Class_Byte'Range loop 1419 if Is_Upper (Character'Val (Value)) then 1420 Set_In_Class (Bitmap, Character'Val (Value)); 1421 end if; 1422 end loop; 1423 1424 when ANYOF_NUPPER => 1425 for Value in Class_Byte'Range loop 1426 if not Is_Upper (Character'Val (Value)) then 1427 Set_In_Class (Bitmap, Character'Val (Value)); 1428 end if; 1429 end loop; 1430 1431 when ANYOF_XDIGIT => 1432 for Value in Class_Byte'Range loop 1433 if Is_Hexadecimal_Digit (Character'Val (Value)) then 1434 Set_In_Class (Bitmap, Character'Val (Value)); 1435 end if; 1436 end loop; 1437 1438 when ANYOF_NXDIGIT => 1439 for Value in Class_Byte'Range loop 1440 if not Is_Hexadecimal_Digit 1441 (Character'Val (Value)) 1442 then 1443 Set_In_Class (Bitmap, Character'Val (Value)); 1444 end if; 1445 end loop; 1446 1447 end case; 1448 1449 -- Not a character range 1450 1451 elsif not In_Range then 1452 Last_Value := Value; 1453 1454 if Parse_Pos > Expression'Last then 1455 Fail ("Empty character class []"); 1456 end if; 1457 1458 if Expression (Parse_Pos) = '-' 1459 and then Parse_Pos < Parse_End 1460 and then Expression (Parse_Pos + 1) /= ']' 1461 then 1462 Parse_Pos := Parse_Pos + 1; 1463 1464 -- Do we have a range like '\d-a' and '[:space:]-a' 1465 -- which is not a real range 1466 1467 if Named_Class /= ANYOF_NONE then 1468 Set_In_Class (Bitmap, '-'); 1469 else 1470 In_Range := True; 1471 end if; 1472 1473 else 1474 Set_In_Class (Bitmap, Value); 1475 1476 end if; 1477 1478 -- Else in a character range 1479 1480 else 1481 if Last_Value > Value then 1482 Fail ("Invalid Range [" & Last_Value'Img 1483 & "-" & Value'Img & "]"); 1484 end if; 1485 1486 while Last_Value <= Value loop 1487 Set_In_Class (Bitmap, Last_Value); 1488 Last_Value := Character'Succ (Last_Value); 1489 end loop; 1490 1491 In_Range := False; 1492 1493 end if; 1494 1495 end loop; 1496 1497 -- Optimize case-insensitive ranges (put the upper case or lower 1498 -- case character into the bitmap) 1499 1500 if (Flags and Case_Insensitive) /= 0 then 1501 for C in Character'Range loop 1502 if Get_From_Class (Bitmap, C) then 1503 Set_In_Class (Bitmap, To_Lower (C)); 1504 Set_In_Class (Bitmap, To_Upper (C)); 1505 end if; 1506 end loop; 1507 end if; 1508 1509 -- Optimize inverted classes 1510 1511 if Invert then 1512 for J in Bitmap'Range loop 1513 Bitmap (J) := not Bitmap (J); 1514 end loop; 1515 end if; 1516 1517 Parse_Pos := Parse_Pos + 1; 1518 1519 -- Emit the class 1520 1521 IP := Emit_Node (ANYOF); 1522 Emit_Class (Bitmap); 1523 end Parse_Character_Class; 1524 1525 ------------------- 1526 -- Parse_Literal -- 1527 ------------------- 1528 1529 -- This is a bit tricky due to quoted chars and due to 1530 -- the multiplier characters '*', '+', and '?' that 1531 -- take the SINGLE char previous as their operand. 1532 1533 -- On entry, the character at Parse_Pos - 1 is going to go 1534 -- into the string, no matter what it is. It could be 1535 -- following a \ if Parse_Atom was entered from the '\' case. 1536 1537 -- Basic idea is to pick up a good char in C and examine 1538 -- the next char. If Is_Mult (C) then twiddle, if it's a \ 1539 -- then frozzle and if it's another magic char then push C and 1540 -- terminate the string. If none of the above, push C on the 1541 -- string and go around again. 1542 1543 -- Start_Pos is used to remember where "the current character" 1544 -- starts in the string, if due to an Is_Mult we need to back 1545 -- up and put the current char in a separate 1-character string. 1546 -- When Start_Pos is 0, C is the only char in the string; 1547 -- this is used in Is_Mult handling, and in setting the SIMPLE 1548 -- flag at the end. 1549 1550 procedure Parse_Literal 1551 (Expr_Flags : out Expression_Flags; 1552 IP : out Pointer) 1553 is 1554 Start_Pos : Natural := 0; 1555 C : Character; 1556 Length_Ptr : Pointer; 1557 1558 Has_Special_Operator : Boolean := False; 1559 1560 begin 1561 Parse_Pos := Parse_Pos - 1; -- Look at current character 1562 1563 IP := 1564 Emit_Node 1565 (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); 1566 1567 Length_Ptr := Emit_Ptr; 1568 Emit_Ptr := String_Operand (IP); 1569 1570 Parse_Loop : 1571 loop 1572 C := Expression (Parse_Pos); -- Get current character 1573 1574 case C is 1575 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => 1576 1577 if Start_Pos = 0 then 1578 Start_Pos := Parse_Pos; 1579 Emit (C); -- First character is always emitted 1580 else 1581 exit Parse_Loop; -- Else we are done 1582 end if; 1583 1584 when '?' | '+' | '*' | '{' => 1585 1586 if Start_Pos = 0 then 1587 Start_Pos := Parse_Pos; 1588 Emit (C); -- First character is always emitted 1589 1590 -- Are we looking at an operator, or is this 1591 -- simply a normal character ? 1592 1593 elsif not Is_Mult (Parse_Pos) then 1594 Start_Pos := Parse_Pos; 1595 Case_Emit (C); 1596 1597 else 1598 -- We've got something like "abc?d". Mark this as a 1599 -- special case. What we want to emit is a first 1600 -- constant string for "ab", then one for "c" that will 1601 -- ultimately be transformed with a CURLY operator, A 1602 -- special case has to be handled for "a?", since there 1603 -- is no initial string to emit. 1604 1605 Has_Special_Operator := True; 1606 exit Parse_Loop; 1607 end if; 1608 1609 when '\' => 1610 Start_Pos := Parse_Pos; 1611 1612 if Parse_Pos = Parse_End then 1613 Fail ("Trailing \"); 1614 1615 else 1616 case Expression (Parse_Pos + 1) is 1617 when 'b' | 'B' | 's' | 'S' | 'd' | 'D' 1618 | 'w' | 'W' | '0' .. '9' | 'G' | 'A' 1619 => exit Parse_Loop; 1620 when 'n' => Emit (ASCII.LF); 1621 when 't' => Emit (ASCII.HT); 1622 when 'r' => Emit (ASCII.CR); 1623 when 'f' => Emit (ASCII.FF); 1624 when 'e' => Emit (ASCII.ESC); 1625 when 'a' => Emit (ASCII.BEL); 1626 when others => Emit (Expression (Parse_Pos + 1)); 1627 end case; 1628 1629 Parse_Pos := Parse_Pos + 1; 1630 end if; 1631 1632 when others => 1633 Start_Pos := Parse_Pos; 1634 Case_Emit (C); 1635 end case; 1636 1637 exit Parse_Loop when Emit_Ptr - Length_Ptr = 254; 1638 1639 Parse_Pos := Parse_Pos + 1; 1640 1641 exit Parse_Loop when Parse_Pos > Parse_End; 1642 end loop Parse_Loop; 1643 1644 -- Is the string followed by a '*+?{' operator ? If yes, and if there 1645 -- is an initial string to emit, do it now. 1646 1647 if Has_Special_Operator 1648 and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes 1649 then 1650 Emit_Ptr := Emit_Ptr - 1; 1651 Parse_Pos := Start_Pos; 1652 end if; 1653 1654 if Length_Ptr <= PM.Size then 1655 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); 1656 end if; 1657 1658 Expr_Flags.Has_Width := True; 1659 1660 -- Slight optimization when there is a single character 1661 1662 if Emit_Ptr = Length_Ptr + 2 then 1663 Expr_Flags.Simple := True; 1664 end if; 1665 end Parse_Literal; 1666 1667 ----------------- 1668 -- Parse_Piece -- 1669 ----------------- 1670 1671 -- Note that the branching code sequences used for '?' and the 1672 -- general cases of '*' and + are somewhat optimized: they use 1673 -- the same NOTHING node as both the endmarker for their branch 1674 -- list and the body of the last branch. It might seem that 1675 -- this node could be dispensed with entirely, but the endmarker 1676 -- role is not redundant. 1677 1678 procedure Parse_Piece 1679 (Expr_Flags : out Expression_Flags; 1680 IP : out Pointer) 1681 is 1682 Op : Character; 1683 New_Flags : Expression_Flags; 1684 Greedy : Boolean := True; 1685 1686 begin 1687 Parse_Atom (New_Flags, IP); 1688 1689 if IP = 0 then 1690 return; 1691 end if; 1692 1693 if Parse_Pos > Parse_End 1694 or else not Is_Mult (Parse_Pos) 1695 then 1696 Expr_Flags := New_Flags; 1697 return; 1698 end if; 1699 1700 Op := Expression (Parse_Pos); 1701 1702 Expr_Flags := 1703 (if Op /= '+' 1704 then (SP_Start => True, others => False) 1705 else (Has_Width => True, others => False)); 1706 1707 -- Detect non greedy operators in the easy cases 1708 1709 if Op /= '{' 1710 and then Parse_Pos + 1 <= Parse_End 1711 and then Expression (Parse_Pos + 1) = '?' 1712 then 1713 Greedy := False; 1714 Parse_Pos := Parse_Pos + 1; 1715 end if; 1716 1717 -- Generate the byte code 1718 1719 case Op is 1720 when '*' => 1721 1722 if New_Flags.Simple then 1723 Insert_Operator (STAR, IP, Greedy); 1724 else 1725 Link_Tail (IP, Emit_Node (WHILEM)); 1726 Insert_Curly_Operator 1727 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); 1728 Link_Tail (IP, Emit_Node (NOTHING)); 1729 end if; 1730 1731 when '+' => 1732 1733 if New_Flags.Simple then 1734 Insert_Operator (PLUS, IP, Greedy); 1735 else 1736 Link_Tail (IP, Emit_Node (WHILEM)); 1737 Insert_Curly_Operator 1738 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); 1739 Link_Tail (IP, Emit_Node (NOTHING)); 1740 end if; 1741 1742 when '?' => 1743 if New_Flags.Simple then 1744 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); 1745 else 1746 Link_Tail (IP, Emit_Node (WHILEM)); 1747 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); 1748 Link_Tail (IP, Emit_Node (NOTHING)); 1749 end if; 1750 1751 when '{' => 1752 declare 1753 Min, Max : Natural; 1754 1755 begin 1756 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); 1757 1758 if New_Flags.Simple then 1759 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); 1760 else 1761 Link_Tail (IP, Emit_Node (WHILEM)); 1762 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); 1763 Link_Tail (IP, Emit_Node (NOTHING)); 1764 end if; 1765 end; 1766 1767 when others => 1768 null; 1769 end case; 1770 1771 Parse_Pos := Parse_Pos + 1; 1772 1773 if Parse_Pos <= Parse_End 1774 and then Is_Mult (Parse_Pos) 1775 then 1776 Fail ("nested *+{"); 1777 end if; 1778 end Parse_Piece; 1779 1780 --------------------------------- 1781 -- Parse_Posix_Character_Class -- 1782 --------------------------------- 1783 1784 function Parse_Posix_Character_Class return Std_Class is 1785 Invert : Boolean := False; 1786 Class : Std_Class := ANYOF_NONE; 1787 E : String renames Expression; 1788 1789 -- Class names. Note that code assumes that the length of all 1790 -- classes starting with the same letter have the same length. 1791 1792 Alnum : constant String := "alnum:]"; 1793 Alpha : constant String := "alpha:]"; 1794 Ascii_C : constant String := "ascii:]"; 1795 Cntrl : constant String := "cntrl:]"; 1796 Digit : constant String := "digit:]"; 1797 Graph : constant String := "graph:]"; 1798 Lower : constant String := "lower:]"; 1799 Print : constant String := "print:]"; 1800 Punct : constant String := "punct:]"; 1801 Space : constant String := "space:]"; 1802 Upper : constant String := "upper:]"; 1803 Word : constant String := "word:]"; 1804 Xdigit : constant String := "xdigit:]"; 1805 1806 begin 1807 -- Case of character class specified 1808 1809 if Parse_Pos <= Parse_End 1810 and then Expression (Parse_Pos) = ':' 1811 then 1812 Parse_Pos := Parse_Pos + 1; 1813 1814 -- Do we have something like: [[:^alpha:]] 1815 1816 if Parse_Pos <= Parse_End 1817 and then Expression (Parse_Pos) = '^' 1818 then 1819 Invert := True; 1820 Parse_Pos := Parse_Pos + 1; 1821 end if; 1822 1823 -- Check for class names based on first letter 1824 1825 case Expression (Parse_Pos) is 1826 when 'a' => 1827 1828 -- All 'a' classes have the same length (Alnum'Length) 1829 1830 if Parse_Pos + Alnum'Length - 1 <= Parse_End then 1831 if 1832 E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum 1833 then 1834 Class := 1835 (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); 1836 Parse_Pos := Parse_Pos + Alnum'Length; 1837 1838 elsif 1839 E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha 1840 then 1841 Class := 1842 (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); 1843 Parse_Pos := Parse_Pos + Alpha'Length; 1844 1845 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = 1846 Ascii_C 1847 then 1848 Class := 1849 (if Invert then ANYOF_NASCII else ANYOF_ASCII); 1850 Parse_Pos := Parse_Pos + Ascii_C'Length; 1851 else 1852 Fail ("Invalid character class: " & E); 1853 end if; 1854 1855 else 1856 Fail ("Invalid character class: " & E); 1857 end if; 1858 1859 when 'c' => 1860 if Parse_Pos + Cntrl'Length - 1 <= Parse_End 1861 and then 1862 E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl 1863 then 1864 Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); 1865 Parse_Pos := Parse_Pos + Cntrl'Length; 1866 else 1867 Fail ("Invalid character class: " & E); 1868 end if; 1869 1870 when 'd' => 1871 if Parse_Pos + Digit'Length - 1 <= Parse_End 1872 and then 1873 E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit 1874 then 1875 Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); 1876 Parse_Pos := Parse_Pos + Digit'Length; 1877 end if; 1878 1879 when 'g' => 1880 if Parse_Pos + Graph'Length - 1 <= Parse_End 1881 and then 1882 E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph 1883 then 1884 Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); 1885 Parse_Pos := Parse_Pos + Graph'Length; 1886 else 1887 Fail ("Invalid character class: " & E); 1888 end if; 1889 1890 when 'l' => 1891 if Parse_Pos + Lower'Length - 1 <= Parse_End 1892 and then 1893 E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower 1894 then 1895 Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); 1896 Parse_Pos := Parse_Pos + Lower'Length; 1897 else 1898 Fail ("Invalid character class: " & E); 1899 end if; 1900 1901 when 'p' => 1902 1903 -- All 'p' classes have the same length 1904 1905 if Parse_Pos + Print'Length - 1 <= Parse_End then 1906 if 1907 E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print 1908 then 1909 Class := 1910 (if Invert then ANYOF_NPRINT else ANYOF_PRINT); 1911 Parse_Pos := Parse_Pos + Print'Length; 1912 1913 elsif 1914 E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct 1915 then 1916 Class := 1917 (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); 1918 Parse_Pos := Parse_Pos + Punct'Length; 1919 1920 else 1921 Fail ("Invalid character class: " & E); 1922 end if; 1923 1924 else 1925 Fail ("Invalid character class: " & E); 1926 end if; 1927 1928 when 's' => 1929 if Parse_Pos + Space'Length - 1 <= Parse_End 1930 and then 1931 E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space 1932 then 1933 Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); 1934 Parse_Pos := Parse_Pos + Space'Length; 1935 else 1936 Fail ("Invalid character class: " & E); 1937 end if; 1938 1939 when 'u' => 1940 if Parse_Pos + Upper'Length - 1 <= Parse_End 1941 and then 1942 E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper 1943 then 1944 Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); 1945 Parse_Pos := Parse_Pos + Upper'Length; 1946 else 1947 Fail ("Invalid character class: " & E); 1948 end if; 1949 1950 when 'w' => 1951 if Parse_Pos + Word'Length - 1 <= Parse_End 1952 and then 1953 E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word 1954 then 1955 Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); 1956 Parse_Pos := Parse_Pos + Word'Length; 1957 else 1958 Fail ("Invalid character class: " & E); 1959 end if; 1960 1961 when 'x' => 1962 if Parse_Pos + Xdigit'Length - 1 <= Parse_End 1963 and then 1964 E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit 1965 then 1966 Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); 1967 Parse_Pos := Parse_Pos + Xdigit'Length; 1968 1969 else 1970 Fail ("Invalid character class: " & E); 1971 end if; 1972 1973 when others => 1974 Fail ("Invalid character class: " & E); 1975 end case; 1976 1977 -- Character class not specified 1978 1979 else 1980 return ANYOF_NONE; 1981 end if; 1982 1983 return Class; 1984 end Parse_Posix_Character_Class; 1985 1986 -- Local Declarations 1987 1988 Result : Pointer; 1989 1990 Expr_Flags : Expression_Flags; 1991 pragma Unreferenced (Expr_Flags); 1992 1993 -- Start of processing for Compile 1994 1995 begin 1996 Parse (False, False, Expr_Flags, Result); 1997 1998 if Result = 0 then 1999 Fail ("Couldn't compile expression"); 2000 end if; 2001 2002 Final_Code_Size := Emit_Ptr - 1; 2003 2004 -- Do we want to actually compile the expression, or simply get the 2005 -- code size ??? 2006 2007 if Emit_Ptr <= PM.Size then 2008 Optimize (PM); 2009 end if; 2010 2011 PM.Flags := Flags; 2012 end Compile; 2013 2014 function Compile 2015 (Expression : String; 2016 Flags : Regexp_Flags := No_Flags) return Pattern_Matcher 2017 is 2018 -- Assume the compiled regexp will fit in 1000 chars. If it does not we 2019 -- will have to compile a second time once the correct size is known. If 2020 -- it fits, we save a significant amount of time by avoiding the second 2021 -- compilation. 2022 2023 Dummy : Pattern_Matcher (1000); 2024 Size : Program_Size; 2025 2026 begin 2027 Compile (Dummy, Expression, Size, Flags); 2028 2029 if Size <= Dummy.Size then 2030 return Pattern_Matcher' 2031 (Size => Size, 2032 First => Dummy.First, 2033 Anchored => Dummy.Anchored, 2034 Must_Have => Dummy.Must_Have, 2035 Must_Have_Length => Dummy.Must_Have_Length, 2036 Paren_Count => Dummy.Paren_Count, 2037 Flags => Dummy.Flags, 2038 Program => 2039 Dummy.Program 2040 (Dummy.Program'First .. Dummy.Program'First + Size - 1)); 2041 else 2042 -- We have to recompile now that we know the size 2043 -- ??? Can we use Ada 2005's return construct ? 2044 2045 declare 2046 Result : Pattern_Matcher (Size); 2047 begin 2048 Compile (Result, Expression, Size, Flags); 2049 return Result; 2050 end; 2051 end if; 2052 end Compile; 2053 2054 procedure Compile 2055 (Matcher : out Pattern_Matcher; 2056 Expression : String; 2057 Flags : Regexp_Flags := No_Flags) 2058 is 2059 Size : Program_Size; 2060 2061 begin 2062 Compile (Matcher, Expression, Size, Flags); 2063 2064 if Size > Matcher.Size then 2065 raise Expression_Error with "Pattern_Matcher is too small"; 2066 end if; 2067 end Compile; 2068 2069 -------------------- 2070 -- Dump_Operation -- 2071 -------------------- 2072 2073 procedure Dump_Operation 2074 (Program : Program_Data; 2075 Index : Pointer; 2076 Indent : Natural) 2077 is 2078 Current : Pointer := Index; 2079 begin 2080 Dump_Until (Program, Current, Current + 1, Indent); 2081 end Dump_Operation; 2082 2083 ---------------- 2084 -- Dump_Until -- 2085 ---------------- 2086 2087 procedure Dump_Until 2088 (Program : Program_Data; 2089 Index : in out Pointer; 2090 Till : Pointer; 2091 Indent : Natural; 2092 Do_Print : Boolean := True) 2093 is 2094 function Image (S : String) return String; 2095 -- Remove leading space 2096 2097 ----------- 2098 -- Image -- 2099 ----------- 2100 2101 function Image (S : String) return String is 2102 begin 2103 if S (S'First) = ' ' then 2104 return S (S'First + 1 .. S'Last); 2105 else 2106 return S; 2107 end if; 2108 end Image; 2109 2110 -- Local variables 2111 2112 Op : Opcode; 2113 Next : Pointer; 2114 Length : Pointer; 2115 Local_Indent : Natural := Indent; 2116 2117 -- Start of processing for Dump_Until 2118 2119 begin 2120 while Index < Till loop 2121 Op := Opcode'Val (Character'Pos ((Program (Index)))); 2122 Next := Get_Next (Program, Index); 2123 2124 if Do_Print then 2125 declare 2126 Point : constant String := Pointer'Image (Index); 2127 begin 2128 Put ((1 .. 4 - Point'Length => ' ') 2129 & Point & ":" 2130 & (1 .. Local_Indent * 2 => ' ') & Opcode'Image (Op)); 2131 end; 2132 2133 -- Print the parenthesis number 2134 2135 if Op = OPEN or else Op = CLOSE or else Op = REFF then 2136 Put (Image (Natural'Image 2137 (Character'Pos 2138 (Program (Index + Next_Pointer_Bytes))))); 2139 end if; 2140 2141 if Next = Index then 2142 Put (" (-)"); 2143 else 2144 Put (" (" & Image (Pointer'Image (Next)) & ")"); 2145 end if; 2146 end if; 2147 2148 case Op is 2149 when ANYOF => 2150 declare 2151 Bitmap : Character_Class; 2152 Last : Character := ASCII.NUL; 2153 Current : Natural := 0; 2154 Current_Char : Character; 2155 2156 begin 2157 Bitmap_Operand (Program, Index, Bitmap); 2158 2159 if Do_Print then 2160 Put ("["); 2161 2162 while Current <= 255 loop 2163 Current_Char := Character'Val (Current); 2164 2165 -- First item in a range 2166 2167 if Get_From_Class (Bitmap, Current_Char) then 2168 Last := Current_Char; 2169 2170 -- Search for the last item in the range 2171 2172 loop 2173 Current := Current + 1; 2174 exit when Current > 255; 2175 Current_Char := Character'Val (Current); 2176 exit when 2177 not Get_From_Class (Bitmap, Current_Char); 2178 end loop; 2179 2180 if not Is_Graphic (Last) then 2181 Put (Last'Img); 2182 else 2183 Put (Last); 2184 end if; 2185 2186 if Character'Succ (Last) /= Current_Char then 2187 Put ("\-" & Character'Pred (Current_Char)); 2188 end if; 2189 2190 else 2191 Current := Current + 1; 2192 end if; 2193 end loop; 2194 2195 Put_Line ("]"); 2196 end if; 2197 2198 Index := Index + Next_Pointer_Bytes + Bitmap'Length; 2199 end; 2200 2201 when EXACT | EXACTF => 2202 Length := String_Length (Program, Index); 2203 if Do_Print then 2204 Put (" (" & Image (Program_Size'Image (Length + 1)) 2205 & " chars) <" 2206 & String (Program (String_Operand (Index) 2207 .. String_Operand (Index) 2208 + Length))); 2209 Put_Line (">"); 2210 end if; 2211 2212 Index := String_Operand (Index) + Length + 1; 2213 2214 -- Node operand 2215 2216 when BRANCH | STAR | PLUS => 2217 if Do_Print then 2218 New_Line; 2219 end if; 2220 2221 Index := Index + Next_Pointer_Bytes; 2222 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2223 Local_Indent + 1, Do_Print); 2224 2225 when CURLY | CURLYX => 2226 if Do_Print then 2227 Put_Line 2228 (" {" 2229 & Image (Natural'Image 2230 (Read_Natural (Program, Index + Next_Pointer_Bytes))) 2231 & "," 2232 & Image (Natural'Image (Read_Natural (Program, Index + 5))) 2233 & "}"); 2234 end if; 2235 2236 Index := Index + 7; 2237 Dump_Until (Program, Index, Pointer'Min (Next, Till), 2238 Local_Indent + 1, Do_Print); 2239 2240 when OPEN => 2241 if Do_Print then 2242 New_Line; 2243 end if; 2244 2245 Index := Index + 4; 2246 Local_Indent := Local_Indent + 1; 2247 2248 when CLOSE | REFF => 2249 if Do_Print then 2250 New_Line; 2251 end if; 2252 2253 Index := Index + 4; 2254 2255 if Op = CLOSE then 2256 Local_Indent := Local_Indent - 1; 2257 end if; 2258 2259 when others => 2260 Index := Index + Next_Pointer_Bytes; 2261 2262 if Do_Print then 2263 New_Line; 2264 end if; 2265 2266 exit when Op = EOP; 2267 end case; 2268 end loop; 2269 end Dump_Until; 2270 2271 ---------- 2272 -- Dump -- 2273 ---------- 2274 2275 procedure Dump (Self : Pattern_Matcher) is 2276 Program : Program_Data renames Self.Program; 2277 Index : Pointer := Program'First; 2278 2279 -- Start of processing for Dump 2280 2281 begin 2282 Put_Line ("Must start with (Self.First) = " 2283 & Character'Image (Self.First)); 2284 2285 if (Self.Flags and Case_Insensitive) /= 0 then 2286 Put_Line (" Case_Insensitive mode"); 2287 end if; 2288 2289 if (Self.Flags and Single_Line) /= 0 then 2290 Put_Line (" Single_Line mode"); 2291 end if; 2292 2293 if (Self.Flags and Multiple_Lines) /= 0 then 2294 Put_Line (" Multiple_Lines mode"); 2295 end if; 2296 2297 Dump_Until (Program, Index, Self.Program'Last + 1, 0); 2298 end Dump; 2299 2300 -------------------- 2301 -- Get_From_Class -- 2302 -------------------- 2303 2304 function Get_From_Class 2305 (Bitmap : Character_Class; 2306 C : Character) return Boolean 2307 is 2308 Value : constant Class_Byte := Character'Pos (C); 2309 begin 2310 return 2311 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; 2312 end Get_From_Class; 2313 2314 -------------- 2315 -- Get_Next -- 2316 -------------- 2317 2318 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is 2319 begin 2320 return IP + Pointer (Read_Natural (Program, IP + 1)); 2321 end Get_Next; 2322 2323 -------------- 2324 -- Is_Alnum -- 2325 -------------- 2326 2327 function Is_Alnum (C : Character) return Boolean is 2328 begin 2329 return Is_Alphanumeric (C) or else C = '_'; 2330 end Is_Alnum; 2331 2332 ------------------ 2333 -- Is_Printable -- 2334 ------------------ 2335 2336 function Is_Printable (C : Character) return Boolean is 2337 begin 2338 -- Printable if space or graphic character or other whitespace 2339 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) 2340 2341 return C in Character'Val (32) .. Character'Val (126) 2342 or else C in ASCII.HT .. ASCII.CR; 2343 end Is_Printable; 2344 2345 -------------------- 2346 -- Is_White_Space -- 2347 -------------------- 2348 2349 function Is_White_Space (C : Character) return Boolean is 2350 begin 2351 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 2352 2353 return C = ' ' or else C in ASCII.HT .. ASCII.CR; 2354 end Is_White_Space; 2355 2356 ----------- 2357 -- Match -- 2358 ----------- 2359 2360 procedure Match 2361 (Self : Pattern_Matcher; 2362 Data : String; 2363 Matches : out Match_Array; 2364 Data_First : Integer := -1; 2365 Data_Last : Positive := Positive'Last) 2366 is 2367 Program : Program_Data renames Self.Program; -- Shorter notation 2368 2369 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); 2370 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); 2371 2372 -- Global work variables 2373 2374 Input_Pos : Natural; -- String-input pointer 2375 BOL_Pos : Natural; -- Beginning of input, for ^ check 2376 Matched : Boolean := False; -- Until proven True 2377 2378 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, 2379 Matches'Last)); 2380 -- Stores the value of all the parenthesis pairs. 2381 -- We do not use directly Matches, so that we can also use back 2382 -- references (REFF) even if Matches is too small. 2383 2384 type Natural_Array is array (Match_Count range <>) of Natural; 2385 Matches_Tmp : Natural_Array (Matches_Full'Range); 2386 -- Save the opening position of parenthesis 2387 2388 Last_Paren : Natural := 0; 2389 -- Last parenthesis seen 2390 2391 Greedy : Boolean := True; 2392 -- True if the next operator should be greedy 2393 2394 type Current_Curly_Record; 2395 type Current_Curly_Access is access all Current_Curly_Record; 2396 type Current_Curly_Record is record 2397 Paren_Floor : Natural; -- How far back to strip parenthesis data 2398 Cur : Integer; -- How many instances of scan we've matched 2399 Min : Natural; -- Minimal number of scans to match 2400 Max : Natural; -- Maximal number of scans to match 2401 Greedy : Boolean; -- Whether to work our way up or down 2402 Scan : Pointer; -- The thing to match 2403 Next : Pointer; -- What has to match after it 2404 Lastloc : Natural; -- Where we started matching this scan 2405 Old_Cc : Current_Curly_Access; -- Before we started this one 2406 end record; 2407 -- Data used to handle the curly operator and the plus and star 2408 -- operators for complex expressions. 2409 2410 Current_Curly : Current_Curly_Access := null; 2411 -- The curly currently being processed 2412 2413 ----------------------- 2414 -- Local Subprograms -- 2415 ----------------------- 2416 2417 function Index (Start : Positive; C : Character) return Natural; 2418 -- Find character C in Data starting at Start and return position 2419 2420 function Repeat 2421 (IP : Pointer; 2422 Max : Natural := Natural'Last) return Natural; 2423 -- Repeatedly match something simple, report how many 2424 -- It only matches on things of length 1. 2425 -- Starting from Input_Pos, it matches at most Max CURLY. 2426 2427 function Try (Pos : Positive) return Boolean; 2428 -- Try to match at specific point 2429 2430 function Match (IP : Pointer) return Boolean; 2431 -- This is the main matching routine. Conceptually the strategy 2432 -- is simple: check to see whether the current node matches, 2433 -- call self recursively to see whether the rest matches, 2434 -- and then act accordingly. 2435 -- 2436 -- In practice Match makes some effort to avoid recursion, in 2437 -- particular by going through "ordinary" nodes (that don't 2438 -- need to know whether the rest of the match failed) by 2439 -- using a loop instead of recursion. 2440 -- Why is the above comment part of the spec rather than body ??? 2441 2442 function Match_Whilem return Boolean; 2443 -- Return True if a WHILEM matches the Current_Curly 2444 2445 function Recurse_Match (IP : Pointer; From : Natural) return Boolean; 2446 pragma Inline (Recurse_Match); 2447 -- Calls Match recursively. It saves and restores the parenthesis 2448 -- status and location in the input stream correctly, so that 2449 -- backtracking is possible 2450 2451 function Match_Simple_Operator 2452 (Op : Opcode; 2453 Scan : Pointer; 2454 Next : Pointer; 2455 Greedy : Boolean) return Boolean; 2456 -- Return True it the simple operator (possibly non-greedy) matches 2457 2458 Dump_Indent : Integer := -1; 2459 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); 2460 procedure Dump_Error (Msg : String); 2461 -- Debug: print the current context 2462 2463 pragma Inline (Index); 2464 pragma Inline (Repeat); 2465 2466 -- These are two complex functions, but used only once 2467 2468 pragma Inline (Match_Whilem); 2469 pragma Inline (Match_Simple_Operator); 2470 2471 ----------- 2472 -- Index -- 2473 ----------- 2474 2475 function Index (Start : Positive; C : Character) return Natural is 2476 begin 2477 for J in Start .. Last_In_Data loop 2478 if Data (J) = C then 2479 return J; 2480 end if; 2481 end loop; 2482 2483 return 0; 2484 end Index; 2485 2486 ------------------- 2487 -- Recurse_Match -- 2488 ------------------- 2489 2490 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is 2491 L : constant Natural := Last_Paren; 2492 Tmp_F : constant Match_Array := 2493 Matches_Full (From + 1 .. Matches_Full'Last); 2494 Start : constant Natural_Array := 2495 Matches_Tmp (From + 1 .. Matches_Tmp'Last); 2496 Input : constant Natural := Input_Pos; 2497 2498 Dump_Indent_Save : constant Integer := Dump_Indent; 2499 2500 begin 2501 if Match (IP) then 2502 return True; 2503 end if; 2504 2505 Last_Paren := L; 2506 Matches_Full (Tmp_F'Range) := Tmp_F; 2507 Matches_Tmp (Start'Range) := Start; 2508 Input_Pos := Input; 2509 Dump_Indent := Dump_Indent_Save; 2510 return False; 2511 end Recurse_Match; 2512 2513 ------------------ 2514 -- Dump_Current -- 2515 ------------------ 2516 2517 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is 2518 Length : constant := 10; 2519 Pos : constant String := Integer'Image (Input_Pos); 2520 2521 begin 2522 if Prefix then 2523 Put ((1 .. 5 - Pos'Length => ' ')); 2524 Put (Pos & " <" 2525 & Data (Input_Pos 2526 .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); 2527 Put ((1 .. Length - 1 - Last_In_Data + Input_Pos => ' ')); 2528 Put ("> |"); 2529 2530 else 2531 Put (" "); 2532 end if; 2533 2534 Dump_Operation (Program, Scan, Indent => Dump_Indent); 2535 end Dump_Current; 2536 2537 ---------------- 2538 -- Dump_Error -- 2539 ---------------- 2540 2541 procedure Dump_Error (Msg : String) is 2542 begin 2543 Put (" | "); 2544 Put ((1 .. Dump_Indent * 2 => ' ')); 2545 Put_Line (Msg); 2546 end Dump_Error; 2547 2548 ----------- 2549 -- Match -- 2550 ----------- 2551 2552 function Match (IP : Pointer) return Boolean is 2553 Scan : Pointer := IP; 2554 Next : Pointer; 2555 Op : Opcode; 2556 Result : Boolean; 2557 2558 begin 2559 Dump_Indent := Dump_Indent + 1; 2560 2561 State_Machine : 2562 loop 2563 pragma Assert (Scan /= 0); 2564 2565 -- Determine current opcode and count its usage in debug mode 2566 2567 Op := Opcode'Val (Character'Pos (Program (Scan))); 2568 2569 -- Calculate offset of next instruction. Second character is most 2570 -- significant in Program_Data. 2571 2572 Next := Get_Next (Program, Scan); 2573 2574 if Debug then 2575 Dump_Current (Scan); 2576 end if; 2577 2578 case Op is 2579 when EOP => 2580 Dump_Indent := Dump_Indent - 1; 2581 return True; -- Success 2582 2583 when BRANCH => 2584 if Program (Next) /= BRANCH then 2585 Next := Operand (Scan); -- No choice, avoid recursion 2586 2587 else 2588 loop 2589 if Recurse_Match (Operand (Scan), 0) then 2590 Dump_Indent := Dump_Indent - 1; 2591 return True; 2592 end if; 2593 2594 Scan := Get_Next (Program, Scan); 2595 exit when Scan = 0 or else Program (Scan) /= BRANCH; 2596 end loop; 2597 2598 exit State_Machine; 2599 end if; 2600 2601 when NOTHING => 2602 null; 2603 2604 when BOL => 2605 exit State_Machine when Input_Pos /= BOL_Pos 2606 and then ((Self.Flags and Multiple_Lines) = 0 2607 or else Data (Input_Pos - 1) /= ASCII.LF); 2608 2609 when MBOL => 2610 exit State_Machine when Input_Pos /= BOL_Pos 2611 and then Data (Input_Pos - 1) /= ASCII.LF; 2612 2613 when SBOL => 2614 exit State_Machine when Input_Pos /= BOL_Pos; 2615 2616 when EOL => 2617 exit State_Machine when Input_Pos <= Data'Last 2618 and then ((Self.Flags and Multiple_Lines) = 0 2619 or else Data (Input_Pos) /= ASCII.LF); 2620 2621 when MEOL => 2622 exit State_Machine when Input_Pos <= Data'Last 2623 and then Data (Input_Pos) /= ASCII.LF; 2624 2625 when SEOL => 2626 exit State_Machine when Input_Pos <= Data'Last; 2627 2628 when BOUND | NBOUND => 2629 2630 -- Was last char in word ? 2631 2632 declare 2633 N : Boolean := False; 2634 Ln : Boolean := False; 2635 2636 begin 2637 if Input_Pos /= First_In_Data then 2638 N := Is_Alnum (Data (Input_Pos - 1)); 2639 end if; 2640 2641 Ln := 2642 (if Input_Pos > Last_In_Data 2643 then False 2644 else Is_Alnum (Data (Input_Pos))); 2645 2646 if Op = BOUND then 2647 if N = Ln then 2648 exit State_Machine; 2649 end if; 2650 else 2651 if N /= Ln then 2652 exit State_Machine; 2653 end if; 2654 end if; 2655 end; 2656 2657 when SPACE => 2658 exit State_Machine when Input_Pos > Last_In_Data 2659 or else not Is_White_Space (Data (Input_Pos)); 2660 Input_Pos := Input_Pos + 1; 2661 2662 when NSPACE => 2663 exit State_Machine when Input_Pos > Last_In_Data 2664 or else Is_White_Space (Data (Input_Pos)); 2665 Input_Pos := Input_Pos + 1; 2666 2667 when DIGIT => 2668 exit State_Machine when Input_Pos > Last_In_Data 2669 or else not Is_Digit (Data (Input_Pos)); 2670 Input_Pos := Input_Pos + 1; 2671 2672 when NDIGIT => 2673 exit State_Machine when Input_Pos > Last_In_Data 2674 or else Is_Digit (Data (Input_Pos)); 2675 Input_Pos := Input_Pos + 1; 2676 2677 when ALNUM => 2678 exit State_Machine when Input_Pos > Last_In_Data 2679 or else not Is_Alnum (Data (Input_Pos)); 2680 Input_Pos := Input_Pos + 1; 2681 2682 when NALNUM => 2683 exit State_Machine when Input_Pos > Last_In_Data 2684 or else Is_Alnum (Data (Input_Pos)); 2685 Input_Pos := Input_Pos + 1; 2686 2687 when ANY => 2688 exit State_Machine when Input_Pos > Last_In_Data 2689 or else Data (Input_Pos) = ASCII.LF; 2690 Input_Pos := Input_Pos + 1; 2691 2692 when SANY => 2693 exit State_Machine when Input_Pos > Last_In_Data; 2694 Input_Pos := Input_Pos + 1; 2695 2696 when EXACT => 2697 declare 2698 Opnd : Pointer := String_Operand (Scan); 2699 Current : Positive := Input_Pos; 2700 Last : constant Pointer := 2701 Opnd + String_Length (Program, Scan); 2702 2703 begin 2704 while Opnd <= Last loop 2705 exit State_Machine when Current > Last_In_Data 2706 or else Program (Opnd) /= Data (Current); 2707 Current := Current + 1; 2708 Opnd := Opnd + 1; 2709 end loop; 2710 2711 Input_Pos := Current; 2712 end; 2713 2714 when EXACTF => 2715 declare 2716 Opnd : Pointer := String_Operand (Scan); 2717 Current : Positive := Input_Pos; 2718 2719 Last : constant Pointer := 2720 Opnd + String_Length (Program, Scan); 2721 2722 begin 2723 while Opnd <= Last loop 2724 exit State_Machine when Current > Last_In_Data 2725 or else Program (Opnd) /= To_Lower (Data (Current)); 2726 Current := Current + 1; 2727 Opnd := Opnd + 1; 2728 end loop; 2729 2730 Input_Pos := Current; 2731 end; 2732 2733 when ANYOF => 2734 declare 2735 Bitmap : Character_Class; 2736 begin 2737 Bitmap_Operand (Program, Scan, Bitmap); 2738 exit State_Machine when Input_Pos > Last_In_Data 2739 or else not Get_From_Class (Bitmap, Data (Input_Pos)); 2740 Input_Pos := Input_Pos + 1; 2741 end; 2742 2743 when OPEN => 2744 declare 2745 No : constant Natural := 2746 Character'Pos (Program (Operand (Scan))); 2747 begin 2748 Matches_Tmp (No) := Input_Pos; 2749 end; 2750 2751 when CLOSE => 2752 declare 2753 No : constant Natural := 2754 Character'Pos (Program (Operand (Scan))); 2755 2756 begin 2757 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); 2758 2759 if Last_Paren < No then 2760 Last_Paren := No; 2761 end if; 2762 end; 2763 2764 when REFF => 2765 declare 2766 No : constant Natural := 2767 Character'Pos (Program (Operand (Scan))); 2768 2769 Data_Pos : Natural; 2770 2771 begin 2772 -- If we haven't seen that parenthesis yet 2773 2774 if Last_Paren < No then 2775 Dump_Indent := Dump_Indent - 1; 2776 2777 if Debug then 2778 Dump_Error ("REFF: No match, backtracking"); 2779 end if; 2780 2781 return False; 2782 end if; 2783 2784 Data_Pos := Matches_Full (No).First; 2785 2786 while Data_Pos <= Matches_Full (No).Last loop 2787 if Input_Pos > Last_In_Data 2788 or else Data (Input_Pos) /= Data (Data_Pos) 2789 then 2790 Dump_Indent := Dump_Indent - 1; 2791 2792 if Debug then 2793 Dump_Error ("REFF: No match, backtracking"); 2794 end if; 2795 2796 return False; 2797 end if; 2798 2799 Input_Pos := Input_Pos + 1; 2800 Data_Pos := Data_Pos + 1; 2801 end loop; 2802 end; 2803 2804 when MINMOD => 2805 Greedy := False; 2806 2807 when STAR | PLUS | CURLY => 2808 declare 2809 Greed : constant Boolean := Greedy; 2810 begin 2811 Greedy := True; 2812 Result := Match_Simple_Operator (Op, Scan, Next, Greed); 2813 Dump_Indent := Dump_Indent - 1; 2814 return Result; 2815 end; 2816 2817 when CURLYX => 2818 2819 -- Looking at something like: 2820 2821 -- 1: CURLYX {n,m} (->4) 2822 -- 2: code for complex thing (->3) 2823 -- 3: WHILEM (->0) 2824 -- 4: NOTHING 2825 2826 declare 2827 Min : constant Natural := 2828 Read_Natural (Program, Scan + Next_Pointer_Bytes); 2829 Max : constant Natural := 2830 Read_Natural 2831 (Program, Scan + Next_Pointer_Bytes + 2); 2832 Cc : aliased Current_Curly_Record; 2833 2834 Has_Match : Boolean; 2835 2836 begin 2837 Cc := (Paren_Floor => Last_Paren, 2838 Cur => -1, 2839 Min => Min, 2840 Max => Max, 2841 Greedy => Greedy, 2842 Scan => Scan + 7, 2843 Next => Next, 2844 Lastloc => 0, 2845 Old_Cc => Current_Curly); 2846 Greedy := True; 2847 Current_Curly := Cc'Unchecked_Access; 2848 2849 Has_Match := Match (Next - Next_Pointer_Bytes); 2850 2851 -- Start on the WHILEM 2852 2853 Current_Curly := Cc.Old_Cc; 2854 Dump_Indent := Dump_Indent - 1; 2855 2856 if not Has_Match then 2857 if Debug then 2858 Dump_Error ("CURLYX failed..."); 2859 end if; 2860 end if; 2861 2862 return Has_Match; 2863 end; 2864 2865 when WHILEM => 2866 Result := Match_Whilem; 2867 Dump_Indent := Dump_Indent - 1; 2868 2869 if Debug and then not Result then 2870 Dump_Error ("WHILEM: no match, backtracking"); 2871 end if; 2872 2873 return Result; 2874 end case; 2875 2876 Scan := Next; 2877 end loop State_Machine; 2878 2879 if Debug then 2880 Dump_Error ("failed..."); 2881 Dump_Indent := Dump_Indent - 1; 2882 end if; 2883 2884 -- If we get here, there is no match. For successful matches when EOP 2885 -- is the terminating point. 2886 2887 return False; 2888 end Match; 2889 2890 --------------------------- 2891 -- Match_Simple_Operator -- 2892 --------------------------- 2893 2894 function Match_Simple_Operator 2895 (Op : Opcode; 2896 Scan : Pointer; 2897 Next : Pointer; 2898 Greedy : Boolean) return Boolean 2899 is 2900 Next_Char : Character := ASCII.NUL; 2901 Next_Char_Known : Boolean := False; 2902 No : Integer; -- Can be negative 2903 Min : Natural; 2904 Max : Natural := Natural'Last; 2905 Operand_Code : Pointer; 2906 Old : Natural; 2907 Last_Pos : Natural; 2908 Save : constant Natural := Input_Pos; 2909 2910 begin 2911 -- Lookahead to avoid useless match attempts when we know what 2912 -- character comes next. 2913 2914 if Program (Next) = EXACT then 2915 Next_Char := Program (String_Operand (Next)); 2916 Next_Char_Known := True; 2917 end if; 2918 2919 -- Find the minimal and maximal values for the operator 2920 2921 case Op is 2922 when STAR => 2923 Min := 0; 2924 Operand_Code := Operand (Scan); 2925 2926 when PLUS => 2927 Min := 1; 2928 Operand_Code := Operand (Scan); 2929 2930 when others => 2931 Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); 2932 Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); 2933 Operand_Code := Scan + 7; 2934 end case; 2935 2936 if Debug then 2937 Dump_Current (Operand_Code, Prefix => False); 2938 end if; 2939 2940 -- Non greedy operators 2941 2942 if not Greedy then 2943 2944 -- Test we can repeat at least Min times 2945 2946 if Min /= 0 then 2947 No := Repeat (Operand_Code, Min); 2948 2949 if No < Min then 2950 if Debug then 2951 Dump_Error ("failed... matched" & No'Img & " times"); 2952 end if; 2953 2954 return False; 2955 end if; 2956 end if; 2957 2958 Old := Input_Pos; 2959 2960 -- Find the place where 'next' could work 2961 2962 if Next_Char_Known then 2963 2964 -- Last position to check 2965 2966 if Max = Natural'Last then 2967 Last_Pos := Last_In_Data; 2968 else 2969 Last_Pos := Input_Pos + Max; 2970 2971 if Last_Pos > Last_In_Data then 2972 Last_Pos := Last_In_Data; 2973 end if; 2974 end if; 2975 2976 -- Look for the first possible opportunity 2977 2978 if Debug then 2979 Dump_Error ("Next_Char must be " & Next_Char); 2980 end if; 2981 2982 loop 2983 -- Find the next possible position 2984 2985 while Input_Pos <= Last_Pos 2986 and then Data (Input_Pos) /= Next_Char 2987 loop 2988 Input_Pos := Input_Pos + 1; 2989 end loop; 2990 2991 if Input_Pos > Last_Pos then 2992 return False; 2993 end if; 2994 2995 -- Check that we still match if we stop at the position we 2996 -- just found. 2997 2998 declare 2999 Num : constant Natural := Input_Pos - Old; 3000 3001 begin 3002 Input_Pos := Old; 3003 3004 if Debug then 3005 Dump_Error ("Would we still match at that position?"); 3006 end if; 3007 3008 if Repeat (Operand_Code, Num) < Num then 3009 return False; 3010 end if; 3011 end; 3012 3013 -- Input_Pos now points to the new position 3014 3015 if Match (Get_Next (Program, Scan)) then 3016 return True; 3017 end if; 3018 3019 Old := Input_Pos; 3020 Input_Pos := Input_Pos + 1; 3021 end loop; 3022 3023 -- We do not know what the next character is 3024 3025 else 3026 while Max >= Min loop 3027 if Debug then 3028 Dump_Error ("Non-greedy repeat, N=" & Min'Img); 3029 Dump_Error ("Do we still match Next if we stop here?"); 3030 end if; 3031 3032 -- If the next character matches 3033 3034 if Recurse_Match (Next, 1) then 3035 return True; 3036 end if; 3037 3038 Input_Pos := Save + Min; 3039 3040 -- Could not or did not match -- move forward 3041 3042 if Repeat (Operand_Code, 1) /= 0 then 3043 Min := Min + 1; 3044 else 3045 if Debug then 3046 Dump_Error ("Non-greedy repeat failed..."); 3047 end if; 3048 3049 return False; 3050 end if; 3051 end loop; 3052 end if; 3053 3054 return False; 3055 3056 -- Greedy operators 3057 3058 else 3059 No := Repeat (Operand_Code, Max); 3060 3061 if Debug and then No < Min then 3062 Dump_Error ("failed... matched" & No'Img & " times"); 3063 end if; 3064 3065 -- ??? Perl has some special code here in case the next 3066 -- instruction is of type EOL, since $ and \Z can match before 3067 -- *and* after newline at the end. 3068 3069 -- ??? Perl has some special code here in case (paren) is True 3070 3071 -- Else, if we don't have any parenthesis 3072 3073 while No >= Min loop 3074 if not Next_Char_Known 3075 or else (Input_Pos <= Last_In_Data 3076 and then Data (Input_Pos) = Next_Char) 3077 then 3078 if Match (Next) then 3079 return True; 3080 end if; 3081 end if; 3082 3083 -- Could not or did not work, we back up 3084 3085 No := No - 1; 3086 Input_Pos := Save + No; 3087 end loop; 3088 3089 return False; 3090 end if; 3091 end Match_Simple_Operator; 3092 3093 ------------------ 3094 -- Match_Whilem -- 3095 ------------------ 3096 3097 -- This is really hard to understand, because after we match what we 3098 -- are trying to match, we must make sure the rest of the REx is going 3099 -- to match for sure, and to do that we have to go back UP the parse 3100 -- tree by recursing ever deeper. And if it fails, we have to reset 3101 -- our parent's current state that we can try again after backing off. 3102 3103 function Match_Whilem return Boolean is 3104 Cc : constant Current_Curly_Access := Current_Curly; 3105 3106 N : constant Natural := Cc.Cur + 1; 3107 Ln : Natural := 0; 3108 3109 Lastloc : constant Natural := Cc.Lastloc; 3110 -- Detection of 0-len 3111 3112 begin 3113 -- If degenerate scan matches "", assume scan done 3114 3115 if Input_Pos = Cc.Lastloc 3116 and then N >= Cc.Min 3117 then 3118 -- Temporarily restore the old context, and check that we 3119 -- match was comes after CURLYX. 3120 3121 Current_Curly := Cc.Old_Cc; 3122 3123 if Current_Curly /= null then 3124 Ln := Current_Curly.Cur; 3125 end if; 3126 3127 if Match (Cc.Next) then 3128 return True; 3129 end if; 3130 3131 if Current_Curly /= null then 3132 Current_Curly.Cur := Ln; 3133 end if; 3134 3135 Current_Curly := Cc; 3136 return False; 3137 end if; 3138 3139 -- First, just match a string of min scans 3140 3141 if N < Cc.Min then 3142 Cc.Cur := N; 3143 Cc.Lastloc := Input_Pos; 3144 3145 if Debug then 3146 Dump_Error 3147 ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); 3148 end if; 3149 3150 if Match (Cc.Scan) then 3151 return True; 3152 end if; 3153 3154 Cc.Cur := N - 1; 3155 Cc.Lastloc := Lastloc; 3156 3157 if Debug then 3158 Dump_Error ("failed..."); 3159 end if; 3160 3161 return False; 3162 end if; 3163 3164 -- Prefer next over scan for minimal matching 3165 3166 if not Cc.Greedy then 3167 Current_Curly := Cc.Old_Cc; 3168 3169 if Current_Curly /= null then 3170 Ln := Current_Curly.Cur; 3171 end if; 3172 3173 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then 3174 return True; 3175 end if; 3176 3177 if Current_Curly /= null then 3178 Current_Curly.Cur := Ln; 3179 end if; 3180 3181 Current_Curly := Cc; 3182 3183 -- Maximum greed exceeded ? 3184 3185 if N >= Cc.Max then 3186 if Debug then 3187 Dump_Error ("failed..."); 3188 end if; 3189 return False; 3190 end if; 3191 3192 -- Try scanning more and see if it helps 3193 Cc.Cur := N; 3194 Cc.Lastloc := Input_Pos; 3195 3196 if Debug then 3197 Dump_Error ("Next failed, what about Current?"); 3198 end if; 3199 3200 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3201 return True; 3202 end if; 3203 3204 Cc.Cur := N - 1; 3205 Cc.Lastloc := Lastloc; 3206 return False; 3207 end if; 3208 3209 -- Prefer scan over next for maximal matching 3210 3211 if N < Cc.Max then -- more greed allowed ? 3212 Cc.Cur := N; 3213 Cc.Lastloc := Input_Pos; 3214 3215 if Debug then 3216 Dump_Error ("Recurse at current position"); 3217 end if; 3218 3219 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then 3220 return True; 3221 end if; 3222 end if; 3223 3224 -- Failed deeper matches of scan, so see if this one works 3225 3226 Current_Curly := Cc.Old_Cc; 3227 3228 if Current_Curly /= null then 3229 Ln := Current_Curly.Cur; 3230 end if; 3231 3232 if Debug then 3233 Dump_Error ("Failed matching for later positions"); 3234 end if; 3235 3236 if Match (Cc.Next) then 3237 return True; 3238 end if; 3239 3240 if Current_Curly /= null then 3241 Current_Curly.Cur := Ln; 3242 end if; 3243 3244 Current_Curly := Cc; 3245 Cc.Cur := N - 1; 3246 Cc.Lastloc := Lastloc; 3247 3248 if Debug then 3249 Dump_Error ("failed..."); 3250 end if; 3251 3252 return False; 3253 end Match_Whilem; 3254 3255 ------------ 3256 -- Repeat -- 3257 ------------ 3258 3259 function Repeat 3260 (IP : Pointer; 3261 Max : Natural := Natural'Last) return Natural 3262 is 3263 Scan : Natural := Input_Pos; 3264 Last : Natural; 3265 Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP))); 3266 Count : Natural; 3267 C : Character; 3268 Is_First : Boolean := True; 3269 Bitmap : Character_Class; 3270 3271 begin 3272 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then 3273 Last := Last_In_Data; 3274 else 3275 Last := Scan + Max - 1; 3276 end if; 3277 3278 case Op is 3279 when ANY => 3280 while Scan <= Last 3281 and then Data (Scan) /= ASCII.LF 3282 loop 3283 Scan := Scan + 1; 3284 end loop; 3285 3286 when SANY => 3287 Scan := Last + 1; 3288 3289 when EXACT => 3290 3291 -- The string has only one character if Repeat was called 3292 3293 C := Program (String_Operand (IP)); 3294 while Scan <= Last 3295 and then C = Data (Scan) 3296 loop 3297 Scan := Scan + 1; 3298 end loop; 3299 3300 when EXACTF => 3301 3302 -- The string has only one character if Repeat was called 3303 3304 C := Program (String_Operand (IP)); 3305 while Scan <= Last 3306 and then To_Lower (C) = Data (Scan) 3307 loop 3308 Scan := Scan + 1; 3309 end loop; 3310 3311 when ANYOF => 3312 if Is_First then 3313 Bitmap_Operand (Program, IP, Bitmap); 3314 Is_First := False; 3315 end if; 3316 3317 while Scan <= Last 3318 and then Get_From_Class (Bitmap, Data (Scan)) 3319 loop 3320 Scan := Scan + 1; 3321 end loop; 3322 3323 when ALNUM => 3324 while Scan <= Last 3325 and then Is_Alnum (Data (Scan)) 3326 loop 3327 Scan := Scan + 1; 3328 end loop; 3329 3330 when NALNUM => 3331 while Scan <= Last 3332 and then not Is_Alnum (Data (Scan)) 3333 loop 3334 Scan := Scan + 1; 3335 end loop; 3336 3337 when SPACE => 3338 while Scan <= Last 3339 and then Is_White_Space (Data (Scan)) 3340 loop 3341 Scan := Scan + 1; 3342 end loop; 3343 3344 when NSPACE => 3345 while Scan <= Last 3346 and then not Is_White_Space (Data (Scan)) 3347 loop 3348 Scan := Scan + 1; 3349 end loop; 3350 3351 when DIGIT => 3352 while Scan <= Last 3353 and then Is_Digit (Data (Scan)) 3354 loop 3355 Scan := Scan + 1; 3356 end loop; 3357 3358 when NDIGIT => 3359 while Scan <= Last 3360 and then not Is_Digit (Data (Scan)) 3361 loop 3362 Scan := Scan + 1; 3363 end loop; 3364 3365 when others => 3366 raise Program_Error; 3367 end case; 3368 3369 Count := Scan - Input_Pos; 3370 Input_Pos := Scan; 3371 return Count; 3372 end Repeat; 3373 3374 --------- 3375 -- Try -- 3376 --------- 3377 3378 function Try (Pos : Positive) return Boolean is 3379 begin 3380 Input_Pos := Pos; 3381 Last_Paren := 0; 3382 Matches_Full := (others => No_Match); 3383 3384 if Match (Program_First) then 3385 Matches_Full (0) := (Pos, Input_Pos - 1); 3386 return True; 3387 end if; 3388 3389 return False; 3390 end Try; 3391 3392 -- Start of processing for Match 3393 3394 begin 3395 -- Do we have the regexp Never_Match? 3396 3397 if Self.Size = 0 then 3398 Matches := (others => No_Match); 3399 return; 3400 end if; 3401 3402 -- If there is a "must appear" string, look for it 3403 3404 if Self.Must_Have_Length > 0 then 3405 declare 3406 First : constant Character := Program (Self.Must_Have); 3407 Must_First : constant Pointer := Self.Must_Have; 3408 Must_Last : constant Pointer := 3409 Must_First + Pointer (Self.Must_Have_Length - 1); 3410 Next_Try : Natural := Index (First_In_Data, First); 3411 3412 begin 3413 while Next_Try /= 0 3414 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) 3415 = String (Program (Must_First .. Must_Last)) 3416 loop 3417 Next_Try := Index (Next_Try + 1, First); 3418 end loop; 3419 3420 if Next_Try = 0 then 3421 Matches := (others => No_Match); 3422 return; -- Not present 3423 end if; 3424 end; 3425 end if; 3426 3427 -- Mark beginning of line for ^ 3428 3429 BOL_Pos := Data'First; 3430 3431 -- Simplest case first: an anchored match need be tried only once 3432 3433 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then 3434 Matched := Try (First_In_Data); 3435 3436 elsif Self.Anchored then 3437 declare 3438 Next_Try : Natural := First_In_Data; 3439 begin 3440 -- Test the first position in the buffer 3441 Matched := Try (Next_Try); 3442 3443 -- Else only test after newlines 3444 3445 if not Matched then 3446 while Next_Try <= Last_In_Data loop 3447 while Next_Try <= Last_In_Data 3448 and then Data (Next_Try) /= ASCII.LF 3449 loop 3450 Next_Try := Next_Try + 1; 3451 end loop; 3452 3453 Next_Try := Next_Try + 1; 3454 3455 if Next_Try <= Last_In_Data then 3456 Matched := Try (Next_Try); 3457 exit when Matched; 3458 end if; 3459 end loop; 3460 end if; 3461 end; 3462 3463 elsif Self.First /= ASCII.NUL then 3464 -- We know what char it must start with 3465 3466 declare 3467 Next_Try : Natural := Index (First_In_Data, Self.First); 3468 3469 begin 3470 while Next_Try /= 0 loop 3471 Matched := Try (Next_Try); 3472 exit when Matched; 3473 Next_Try := Index (Next_Try + 1, Self.First); 3474 end loop; 3475 end; 3476 3477 else 3478 -- Messy cases: try all locations (including for the empty string) 3479 3480 Matched := Try (First_In_Data); 3481 3482 if not Matched then 3483 for S in First_In_Data + 1 .. Last_In_Data loop 3484 Matched := Try (S); 3485 exit when Matched; 3486 end loop; 3487 end if; 3488 end if; 3489 3490 -- Matched has its value 3491 3492 for J in Last_Paren + 1 .. Matches'Last loop 3493 Matches_Full (J) := No_Match; 3494 end loop; 3495 3496 Matches := Matches_Full (Matches'Range); 3497 end Match; 3498 3499 ----------- 3500 -- Match -- 3501 ----------- 3502 3503 function Match 3504 (Self : Pattern_Matcher; 3505 Data : String; 3506 Data_First : Integer := -1; 3507 Data_Last : Positive := Positive'Last) return Natural 3508 is 3509 Matches : Match_Array (0 .. 0); 3510 3511 begin 3512 Match (Self, Data, Matches, Data_First, Data_Last); 3513 if Matches (0) = No_Match then 3514 return Data'First - 1; 3515 else 3516 return Matches (0).First; 3517 end if; 3518 end Match; 3519 3520 function Match 3521 (Self : Pattern_Matcher; 3522 Data : String; 3523 Data_First : Integer := -1; 3524 Data_Last : Positive := Positive'Last) return Boolean 3525 is 3526 Matches : Match_Array (0 .. 0); 3527 3528 begin 3529 Match (Self, Data, Matches, Data_First, Data_Last); 3530 return Matches (0).First >= Data'First; 3531 end Match; 3532 3533 procedure Match 3534 (Expression : String; 3535 Data : String; 3536 Matches : out Match_Array; 3537 Size : Program_Size := Auto_Size; 3538 Data_First : Integer := -1; 3539 Data_Last : Positive := Positive'Last) 3540 is 3541 PM : Pattern_Matcher (Size); 3542 Finalize_Size : Program_Size; 3543 pragma Unreferenced (Finalize_Size); 3544 begin 3545 if Size = 0 then 3546 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3547 else 3548 Compile (PM, Expression, Finalize_Size); 3549 Match (PM, Data, Matches, Data_First, Data_Last); 3550 end if; 3551 end Match; 3552 3553 ----------- 3554 -- Match -- 3555 ----------- 3556 3557 function Match 3558 (Expression : String; 3559 Data : String; 3560 Size : Program_Size := Auto_Size; 3561 Data_First : Integer := -1; 3562 Data_Last : Positive := Positive'Last) return Natural 3563 is 3564 PM : Pattern_Matcher (Size); 3565 Final_Size : Program_Size; 3566 pragma Unreferenced (Final_Size); 3567 begin 3568 if Size = 0 then 3569 return Match (Compile (Expression), Data, Data_First, Data_Last); 3570 else 3571 Compile (PM, Expression, Final_Size); 3572 return Match (PM, Data, Data_First, Data_Last); 3573 end if; 3574 end Match; 3575 3576 ----------- 3577 -- Match -- 3578 ----------- 3579 3580 function Match 3581 (Expression : String; 3582 Data : String; 3583 Size : Program_Size := Auto_Size; 3584 Data_First : Integer := -1; 3585 Data_Last : Positive := Positive'Last) return Boolean 3586 is 3587 Matches : Match_Array (0 .. 0); 3588 PM : Pattern_Matcher (Size); 3589 Final_Size : Program_Size; 3590 pragma Unreferenced (Final_Size); 3591 begin 3592 if Size = 0 then 3593 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); 3594 else 3595 Compile (PM, Expression, Final_Size); 3596 Match (PM, Data, Matches, Data_First, Data_Last); 3597 end if; 3598 3599 return Matches (0).First >= Data'First; 3600 end Match; 3601 3602 ------------- 3603 -- Operand -- 3604 ------------- 3605 3606 function Operand (P : Pointer) return Pointer is 3607 begin 3608 return P + Next_Pointer_Bytes; 3609 end Operand; 3610 3611 -------------- 3612 -- Optimize -- 3613 -------------- 3614 3615 procedure Optimize (Self : in out Pattern_Matcher) is 3616 Scan : Pointer; 3617 Program : Program_Data renames Self.Program; 3618 3619 begin 3620 -- Start with safe defaults (no optimization): 3621 -- * No known first character of match 3622 -- * Does not necessarily start at beginning of line 3623 -- * No string known that has to appear in data 3624 3625 Self.First := ASCII.NUL; 3626 Self.Anchored := False; 3627 Self.Must_Have := Program'Last + 1; 3628 Self.Must_Have_Length := 0; 3629 3630 Scan := Program_First; -- First instruction (can be anything) 3631 3632 if Program (Scan) = EXACT then 3633 Self.First := Program (String_Operand (Scan)); 3634 3635 elsif Program (Scan) = BOL 3636 or else Program (Scan) = SBOL 3637 or else Program (Scan) = MBOL 3638 then 3639 Self.Anchored := True; 3640 end if; 3641 end Optimize; 3642 3643 ----------------- 3644 -- Paren_Count -- 3645 ----------------- 3646 3647 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is 3648 begin 3649 return Regexp.Paren_Count; 3650 end Paren_Count; 3651 3652 ----------- 3653 -- Quote -- 3654 ----------- 3655 3656 function Quote (Str : String) return String is 3657 S : String (1 .. Str'Length * 2); 3658 Last : Natural := 0; 3659 3660 begin 3661 for J in Str'Range loop 3662 case Str (J) is 3663 when '^' | '$' | '|' | '*' | '+' | '?' | '{' | 3664 '}' | '[' | ']' | '(' | ')' | '\' | '.' => 3665 3666 S (Last + 1) := '\'; 3667 S (Last + 2) := Str (J); 3668 Last := Last + 2; 3669 3670 when others => 3671 S (Last + 1) := Str (J); 3672 Last := Last + 1; 3673 end case; 3674 end loop; 3675 3676 return S (1 .. Last); 3677 end Quote; 3678 3679 ------------------ 3680 -- Read_Natural -- 3681 ------------------ 3682 3683 function Read_Natural 3684 (Program : Program_Data; 3685 IP : Pointer) return Natural 3686 is 3687 begin 3688 return Character'Pos (Program (IP)) + 3689 256 * Character'Pos (Program (IP + 1)); 3690 end Read_Natural; 3691 3692 ----------------- 3693 -- Reset_Class -- 3694 ----------------- 3695 3696 procedure Reset_Class (Bitmap : out Character_Class) is 3697 begin 3698 Bitmap := (others => 0); 3699 end Reset_Class; 3700 3701 ------------------ 3702 -- Set_In_Class -- 3703 ------------------ 3704 3705 procedure Set_In_Class 3706 (Bitmap : in out Character_Class; 3707 C : Character) 3708 is 3709 Value : constant Class_Byte := Character'Pos (C); 3710 begin 3711 Bitmap (Value / 8) := Bitmap (Value / 8) 3712 or Bit_Conversion (Value mod 8); 3713 end Set_In_Class; 3714 3715 ------------------- 3716 -- String_Length -- 3717 ------------------- 3718 3719 function String_Length 3720 (Program : Program_Data; 3721 P : Pointer) return Program_Size 3722 is 3723 begin 3724 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); 3725 return Character'Pos (Program (P + Next_Pointer_Bytes)); 3726 end String_Length; 3727 3728 -------------------- 3729 -- String_Operand -- 3730 -------------------- 3731 3732 function String_Operand (P : Pointer) return Pointer is 3733 begin 3734 return P + 4; 3735 end String_Operand; 3736 3737end System.Regpat; 3738