1 #if !defined(RXH) || defined(RX_WANT_SE_DEFS)
2 #define RXH
3
4 /* Copyright (C) 1992, 1993 Free Software Foundation, Inc.
5
6 This file is part of the librx library.
7
8 Librx is free software; you can redistribute it and/or modify it under
9 the terms of the GNU Library General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 Librx is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU Library General Public
19 License along with this software; see the file COPYING.LIB. If not,
20 write to the Free Software Foundation, 675 Mass Ave, Cambridge, MA
21 02139, USA. */
22 /* t. lord Wed Sep 23 18:20:57 1992 */
23 /* Multi-byte extension added Jun, 1997 by WILLs (Katsuyuki Okabe)
24 Last change: Jul. 3, 1997 by WILLs */
25
26
27
28
29
30
31
32
33 #ifndef RX_WANT_SE_DEFS
34
35 /* This page: Bitsets */
36
37 #ifndef RX_subset
38 typedef unsigned long RX_subset;
39 #define RX_subset_bits (32)
40 #define RX_subset_mask (RX_subset_bits - 1)
41 #endif
42
43 typedef RX_subset * rx_Bitset;
44
45 #if defined(__STDC__) || defined(MSDOS)
46 typedef void (*rx_bitset_iterator) (void *, int member_index);
47 #else
48 typedef void (*rx_bitset_iterator) ();
49 #endif
50
51 #define rx_bitset_subset(N) ((unsigned) (N) / RX_subset_bits)
52 #define rx_bitset_subset_val(B,N) ((B)[rx_bitset_subset(N)])
53 #define RX_bitset_access(B,N,OP) \
54 ((B)[rx_bitset_subset(N)] OP rx_subset_singletons[(N) & RX_subset_mask])
55 #define RX_bitset_member(B,N) RX_bitset_access(B, N, &)
56 #define RX_bitset_enjoin(B,N) RX_bitset_access(B, N, |=)
57 #define RX_bitset_remove(B,N) RX_bitset_access(B, N, &= ~)
58 #define RX_bitset_toggle(B,N) RX_bitset_access(B, N, ^= )
59 #define rx_bitset_numb_subsets(N) (((unsigned) (N) + RX_subset_bits - 1) / RX_subset_bits)
60 #define rx_sizeof_bitset(N) (rx_bitset_numb_subsets(N) * sizeof(RX_subset))
61
62
63
64 /* This page: Splay trees. */
65
66 #if defined(__STDC__) || defined(MSDOS)
67 typedef int (*rx_sp_comparer) (void * a, void * b);
68 #else
69 typedef int (*rx_sp_comparer) ();
70 #endif
71
72 struct rx_sp_node
73 {
74 void * key;
75 void * data;
76 struct rx_sp_node * kids[2];
77 };
78
79 #if defined(__STDC__) || defined(MSDOS)
80 typedef void (*rx_sp_key_data_freer) (struct rx_sp_node *);
81 #else
82 typedef void (*rx_sp_key_data_freer) ();
83 #endif
84
85
86 /* giant inflatable hash trees */
87
88 struct rx_hash_item
89 {
90 struct rx_hash_item * next_same_hash;
91 struct rx_hash * table;
92 unsigned long hash;
93 void * data;
94 void * binding;
95 };
96
97 struct rx_hash
98 {
99 struct rx_hash * parent;
100 int refs;
101 struct rx_hash * children[13];
102 struct rx_hash_item * buckets [13];
103 int bucket_size [13];
104 };
105
106 struct rx_hash_rules;
107
108 #if defined(__STDC__) || defined(MSDOS)
109 /* should return like == */
110 typedef int (*rx_hash_eq)(void *, void *);
111 typedef struct rx_hash * (*rx_alloc_hash)(struct rx_hash_rules *);
112 typedef void (*rx_free_hash)(struct rx_hash *,
113 struct rx_hash_rules *);
114 typedef struct rx_hash_item * (*rx_alloc_hash_item)(struct rx_hash_rules *,
115 void *);
116 typedef void (*rx_free_hash_item)(struct rx_hash_item *,
117 struct rx_hash_rules *);
118 #else
119 typedef int (*rx_hash_eq)();
120 typedef struct rx_hash * (*rx_alloc_hash)();
121 typedef void (*rx_free_hash)();
122 typedef struct rx_hash_item * (*rx_alloc_hash_item)();
123 typedef void (*rx_free_hash_item)();
124 #endif
125
126 struct rx_hash_rules
127 {
128 rx_hash_eq eq;
129 rx_alloc_hash hash_alloc;
130 rx_free_hash free_hash;
131 rx_alloc_hash_item hash_item_alloc;
132 rx_free_hash_item free_hash_item;
133 };
134
135
136 /* Forward declarations */
137
138 struct rx_cache;
139 struct rx_superset;
140 struct rx;
141 struct rx_se_list;
142
143
144
145 /*
146 * GLOSSARY
147 *
148 * regexp
149 * regular expression
150 * expression
151 * pattern - a `regular' expression. The expression
152 * need not be formally regular -- it can contain
153 * constructs that don't correspond to purely regular
154 * expressions.
155 *
156 * buffer
157 * string - the string (or strings) being searched or matched.
158 *
159 * pattern buffer - a structure of type `struct re_pattern_buffer'
160 * This in turn contains a `struct rx', which holds the
161 * NFA compiled from a pattern, as well as some of the state
162 * of a matcher using the pattern.
163 *
164 * NFA - nondeterministic finite automata. Some people
165 * use this term to a member of the class of
166 * regular automata (those corresponding to a regular
167 * language). However, in this code, the meaning is
168 * more general. The automata used by Rx are comperable
169 * in power to what are usually called `push down automata'.
170 *
171 * Two NFA are built by rx for every pattern. One is built
172 * by the compiler. The other is built from the first, on
173 * the fly, by the matcher. The latter is called the `superstate
174 * NFA' because its states correspond to sets of states from
175 * the first NFA. (Joe Keane gets credit for the name
176 * `superstate NFA').
177 *
178 * NFA edges
179 * epsilon edges
180 * side-effect edges - The NFA compiled from a pattern can have three
181 * kinds of edges. Epsilon edges can be taken freely anytime
182 * their source state is reached. Character set edges can be
183 * taken when their source state is reached and when the next
184 * character in the buffer is a member of the set. Side effect
185 * edges imply a transition that can only be taken after the
186 * indicated side effect has been successfully accomplished.
187 * Some examples of side effects are:
188 *
189 * Storing the current match position to record the
190 * location of a parentesized subexpression.
191 *
192 * Advancing the matcher over N characters if they
193 * match the N characters previously matched by a
194 * parentesized subexpression.
195 *
196 * Both of those kinds of edges occur in the NFA generated
197 * by the pattern: \(.\)\1
198 *
199 * Epsilon and side effect edges are similar. Unfortunately,
200 * some of the code uses the name `epsilon edge' to mean
201 * both epsilon and side effect edges. For example, the
202 * function has_non_idempotent_epsilon_path computes the existance
203 * of a non-trivial path containing only a mix of epsilon and
204 * side effect edges. In that case `nonidempotent epsilon' is being
205 * used to mean `side effect'.
206 */
207
208
209
210
211
212 /* LOW LEVEL PATTERN BUFFERS */
213
214 /* Suppose that from some NFA state, more than one path through
215 * side-effect edges is possible. In what order should the paths
216 * be tried? A function of type rx_se_list_order answers that
217 * question. It compares two lists of side effects, and says
218 * which list comes first.
219 */
220
221 #if defined(__STDC__) || defined(MSDOS)
222 typedef int (*rx_se_list_order) (struct rx *,
223 struct rx_se_list *,
224 struct rx_se_list *);
225 #else
226 typedef int (*rx_se_list_order) ();
227 #endif
228
229
230
231 /* Struct RX holds a compiled regular expression - that is, an nfa
232 * ready to be converted on demand to a more efficient superstate nfa.
233 * This is for the low level interface. The high-level interfaces enclose
234 * this in a `struct re_pattern_buffer'.
235 */
236 struct rx
237 {
238 /* The compiler assigns a unique id to every pattern.
239 * Like sequence numbers in X, there is a subtle bug here
240 * if you use Rx in a system that runs for a long time.
241 * But, because of the way the caches work out, it is almost
242 * impossible to trigger the Rx version of this bug.
243 *
244 * The id is used to validate superstates found in a cache
245 * of superstates. It isn't sufficient to let a superstate
246 * point back to the rx for which it was compiled -- the caller
247 * may be re-using a `struct rx' in which case the superstate
248 * is not really valid. So instead, superstates are validated
249 * by checking the sequence number of the pattern for which
250 * they were built.
251 */
252 int rx_id;
253
254 /* This is memory mgt. state for superstates. This may be
255 * shared by more than one struct rx.
256 */
257 struct rx_cache * cache;
258
259 /* Every regex defines the size of its own character set.
260 * A superstate has an array of this size, with each element
261 * a `struct rx_inx'. So, don't make this number too large.
262 * In particular, don't make it 2^16.
263 */
264 int local_cset_size;
265
266 /* After the NFA is built, it is copied into a contiguous region
267 * of memory (mostly for compatability with GNU regex).
268 * Here is that region, and it's size:
269 */
270 void * buffer;
271 unsigned long allocated;
272
273 /* Clients of RX can ask for some extra storage in the space pointed
274 * to by BUFFER. The field RESERVED is an input parameter to the
275 * compiler. After compilation, this much space will be available
276 * at (buffer + allocated - reserved)
277 */
278 unsigned long reserved;
279
280 /* --------- The remaining fields are for internal use only. --------- */
281 /* --------- But! they must be initialized to 0. --------- */
282
283 /* NODEC is the number of nodes in the NFA with non-epsilon
284 * transitions.
285 */
286 int nodec;
287
288 /* EPSNODEC is the number of nodes with only epsilon transitions. */
289 int epsnodec;
290
291 /* The sum (NODEC + EPSNODEC) is the total number of states in the
292 * compiled NFA.
293 */
294
295 /* Lists of side effects as stored in the NFA are `hash consed'..meaning
296 * that lists with the same elements are ==. During compilation,
297 * this table facilitates hash-consing.
298 */
299 struct rx_hash se_list_memo;
300
301 /* Lists of NFA states are also hashed.
302 */
303 struct rx_hash set_list_memo;
304
305
306
307
308 /* The compiler and matcher must build a number of instruction frames.
309 * The format of these frames is fixed (c.f. struct rx_inx). The values
310 * of the instructions is not fixed.
311 *
312 * An enumerated type (enum rx_opcode) defines the set of instructions
313 * that the compiler or matcher might generate. When filling an instruction
314 * frame, the INX field is found by indexing this instruction table
315 * with an opcode:
316 */
317 void ** instruction_table;
318
319 /* The list of all states in an NFA.
320 * During compilation, the NEXT field of NFA states links this list.
321 * After compilation, all the states are compacted into an array,
322 * ordered by state id numbers. At that time, this points to the base
323 * of that array.
324 */
325 struct rx_nfa_state *nfa_states;
326
327 /* Every nfa begins with one distinguished starting state:
328 */
329 struct rx_nfa_state *start;
330
331 /* This orders the search through super-nfa paths.
332 * See the comment near the typedef of rx_se_list_order.
333 */
334 rx_se_list_order se_list_cmp;
335
336 struct rx_superset * start_set;
337 };
338
339
340
341
342 /* SYNTAX TREES */
343
344 /* Compilation is in stages.
345 *
346 * In the first stage, a pattern specified by a string is
347 * translated into a syntax tree. Later stages will convert
348 * the syntax tree into an NFA optimized for conversion to a
349 * superstate-NFA.
350 *
351 * This page is about syntax trees.
352 */
353
354 enum rexp_node_type
355 {
356 r_cset, /* Match from a character set. `a' or `[a-z]'*/
357 r_cset2, /* Match from a multi-byte second character set. */
358 r_concat, /* Concat two subexpressions. `ab' */
359 r_alternate, /* Choose one of two subexpressions. `a\|b' */
360 r_opt, /* Optional subexpression. `a?' */
361 r_star, /* Repeated subexpression. `a*' */
362
363
364 /* A 2phase-star is a variation on a repeated subexpression.
365 * In this case, there are two subexpressions. The first, if matched,
366 * begins a repitition (otherwise, the whole expression is matches the
367 * empth string).
368 *
369 * After matching the first subexpression, a 2phase star either finishes,
370 * or matches the second subexpression. If the second subexpression is
371 * matched, then the whole construct repeats.
372 *
373 * 2phase stars are used in two circumstances. First, they
374 * are used as part of the implementation of POSIX intervals (counted
375 * repititions). Second, they are used to implement proper star
376 * semantics when the repeated subexpression contains paths of
377 * only side effects. See rx_compile for more information.
378 */
379 r_2phase_star,
380
381
382 /* c.f. "typedef void * rx_side_effect" */
383 r_side_effect,
384
385 /* This is an extension type: It is for transient use in source->source
386 * transformations (implemented over syntax trees).
387 */
388 r_data
389 };
390
391 /* A side effect is a matcher-specific action associated with
392 * transitions in the NFA. The details of side effects are up
393 * to the matcher. To the compiler and superstate constructors
394 * side effects are opaque:
395 */
396
397 typedef void * rx_side_effect;
398
399 /* Nodes in a syntax tree are of this type:
400 */
401 struct rexp_node
402 {
403 enum rexp_node_type type;
404 union
405 {
406 rx_Bitset cset;
407 rx_side_effect side_effect;
408 struct
409 {
410 struct rexp_node *left;
411 struct rexp_node *right;
412 } pair;
413 void * data;
414 } params;
415 };
416
417
418
419 /* NFA
420 *
421 * A syntax tree is compiled into an NFA. This page defines the structure
422 * of that NFA.
423 */
424
425 struct rx_nfa_state
426 {
427 /* These are kept in a list as the NFA is being built. */
428 struct rx_nfa_state *next;
429
430 /* After the NFA is built, states are given integer id's.
431 * States whose outgoing transitions are all either epsilon or
432 * side effect edges are given ids less than 0. Other states
433 * are given successive non-negative ids starting from 0.
434 */
435 int id;
436
437 /* The list of NFA edges that go from this state to some other. */
438 struct rx_nfa_edge *edges;
439
440 /* If you land in this state, then you implicitly land
441 * in all other states reachable by only epsilon translations.
442 * Call the set of maximal paths to such states the epsilon closure
443 * of this state.
444 *
445 * There may be other states that are reachable by a mixture of
446 * epsilon and side effect edges. Consider the set of maximal paths
447 * of that sort from this state. Call it the epsilon-side-effect
448 * closure of the state.
449 *
450 * The epsilon closure of the state is a subset of the epsilon-side-
451 * effect closure. It consists of all the paths that contain
452 * no side effects -- only epsilon edges.
453 *
454 * The paths in the epsilon-side-effect closure can be partitioned
455 * into equivalance sets. Two paths are equivalant if they have the
456 * same set of side effects, in the same order. The epsilon-closure
457 * is one of these equivalance sets. Let's call these equivalance
458 * sets: observably equivalant path sets. That name is chosen
459 * because equivalance of two paths means they cause the same side
460 * effects -- so they lead to the same subsequent observations other
461 * than that they may wind up in different target states.
462 *
463 * The superstate nfa, which is derived from this nfa, is based on
464 * the observation that all of the paths in an observably equivalant
465 * path set can be explored at the same time, provided that the
466 * matcher keeps track not of a single nfa state, but of a set of
467 * states. In particular, after following all the paths in an
468 * observably equivalant set, you wind up at a set of target states.
469 * That set of target states corresponds to one state in the
470 * superstate NFA.
471 *
472 * Staticly, before matching begins, it is convenient to analyze the
473 * nfa. Each state is labeled with a list of the observably
474 * equivalant path sets who's union covers all the
475 * epsilon-side-effect paths beginning in this state. This list is
476 * called the possible futures of the state.
477 *
478 * A trivial example is this NFA:
479 * s1
480 * A ---> B
481 *
482 * s2
483 * ---> C
484 *
485 * epsilon s1
486 * ---------> D ------> E
487 *
488 *
489 * In this example, A has two possible futures.
490 * One invokes the side effect `s1' and contains two paths,
491 * one ending in state B, the other in state E.
492 * The other invokes the side effect `s2' and contains only
493 * one path, landing in state C.
494 */
495 struct rx_possible_future *futures;
496
497
498 /* There are exactly two distinguished states in every NFA: */
499 unsigned int is_final:1;
500 unsigned int is_start:1;
501
502 /* These are used during NFA construction... */
503 unsigned int eclosure_needed:1;
504 unsigned int mark:1;
505 };
506
507
508 /* An edge in an NFA is typed: */
509 enum rx_nfa_etype
510 {
511 /* A cset edge is labled with a set of characters one of which
512 * must be matched for the edge to be taken.
513 */
514 ne_cset,
515
516 /* An epsilon edge is taken whenever its starting state is
517 * reached.
518 */
519 ne_epsilon,
520
521 /* A side effect edge is taken whenever its starting state is
522 * reached. Side effects may cause the match to fail or the
523 * position of the matcher to advance.
524 */
525 ne_side_effect /* A special kind of epsilon. */
526 };
527
528 struct rx_nfa_edge
529 {
530 struct rx_nfa_edge *next;
531 enum rx_nfa_etype type;
532 struct rx_nfa_state *dest;
533 union
534 {
535 rx_Bitset cset;
536 rx_side_effect side_effect;
537 } params;
538 };
539
540
541
542 /* A possible future consists of a list of side effects
543 * and a set of destination states. Below are their
544 * representations. These structures are hash-consed which
545 * means that lists with the same elements share a representation
546 * (their addresses are ==).
547 */
548
549 struct rx_nfa_state_set
550 {
551 struct rx_nfa_state * car;
552 struct rx_nfa_state_set * cdr;
553 };
554
555 struct rx_se_list
556 {
557 rx_side_effect car;
558 struct rx_se_list * cdr;
559 };
560
561 struct rx_possible_future
562 {
563 struct rx_possible_future *next;
564 struct rx_se_list * effects;
565 struct rx_nfa_state_set * destset;
566 };
567
568
569
570 /* This begins the description of the superstate NFA.
571 *
572 * The superstate NFA corresponds to the NFA in these ways:
573 *
574 * Every superstate NFA states SUPER correspond to sets of NFA states,
575 * nfa_states(SUPER).
576 *
577 * Superstate edges correspond to NFA paths.
578 *
579 * The superstate has no epsilon transitions;
580 * every edge has a character label, and a (possibly empty) side
581 * effect label. The side effect label corresponds to a list of
582 * side effects that occur in the NFA. These parts are referred
583 * to as: superedge_character(EDGE) and superedge_sides(EDGE).
584 *
585 * For a superstate edge EDGE starting in some superstate SUPER,
586 * the following is true (in pseudo-notation :-):
587 *
588 * exists DEST in nfa_states s.t.
589 * exists nfaEDGE in nfa_edges s.t.
590 * origin (nfaEDGE) == DEST
591 * && origin (nfaEDGE) is a member of nfa_states(SUPER)
592 * && exists PF in possible_futures(dest(nfaEDGE)) s.t.
593 * sides_of_possible_future (PF) == superedge_sides (EDGE)
594 *
595 * also:
596 *
597 * let SUPER2 := superedge_destination(EDGE)
598 * nfa_states(SUPER2)
599 * == union of all nfa state sets S s.t.
600 * exists PF in possible_futures(dest(nfaEDGE)) s.t.
601 * sides_of_possible_future (PF) == superedge_sides (EDGE)
602 * && S == dests_of_possible_future (PF) }
603 *
604 * Or in english, every superstate is a set of nfa states. A given
605 * character and a superstate implies many transitions in the NFA --
606 * those that begin with an edge labeled with that character from a
607 * state in the set corresponding to the superstate.
608 *
609 * The destinations of those transitions each have a set of possible
610 * futures. A possible future is a list of side effects and a set of
611 * destination NFA states. Two sets of possible futures can be
612 * `merged' by combining all pairs of possible futures that have the
613 * same side effects. A pair is combined by creating a new future
614 * with the same side effect but the union of the two destination sets.
615 * In this way, all the possible futures suggested by a superstate
616 * and a character can be merged into a set of possible futures where
617 * no two elements of the set have the same set of side effects.
618 *
619 * The destination of a possible future, being a set of NFA states,
620 * corresponds to a supernfa state. So, the merged set of possible
621 * futures we just created can serve as a set of edges in the
622 * supernfa.
623 *
624 * The representation of the superstate nfa and the nfa is critical.
625 * The nfa has to be compact, but has to facilitate the rapid
626 * computation of missing superstates. The superstate nfa has to
627 * be fast to interpret, lazilly constructed, and bounded in space.
628 *
629 * To facilitate interpretation, the superstate data structures are
630 * peppered with `instruction frames'. There is an instruction set
631 * defined below which matchers using the supernfa must be able to
632 * interpret.
633 *
634 * We'd like to make it possible but not mandatory to use code
635 * addresses to represent instructions (c.f. gcc's computed goto).
636 * Therefore, we define an enumerated type of opcodes, and when
637 * writing one of these instructions into a data structure, use
638 * the opcode as an index into a table of instruction values.
639 *
640 * Here are the opcodes that occur in the superstate nfa:
641 */
642
643
644 /* Every superstate contains a table of instruction frames indexed
645 * by characters. A normal `move' in a matcher is to fetch the next
646 * character and use it as an index into a superstates transition
647 * table.
648 *
649 * In the fasted case, only one edge follows from that character.
650 * In other cases there is more work to do.
651 *
652 * The descriptions of the opcodes refer to data structures that are
653 * described further below.
654 */
655
656 enum rx_opcode
657 {
658 /*
659 * BACKTRACK_POINT is invoked when a character transition in
660 * a superstate leads to more than one edge. In that case,
661 * the edges have to be explored independently using a backtracking
662 * strategy.
663 *
664 * A BACKTRACK_POINT instruction is stored in a superstate's
665 * transition table for some character when it is known that that
666 * character crosses more than one edge. On encountering this
667 * instruction, the matcher saves enough state to backtrack to this
668 * point in the match later.
669 */
670 rx_backtrack_point = 0, /* data is (struct transition_class *) */
671
672 /*
673 * RX_DO_SIDE_EFFECTS evaluates the side effects of an epsilon path.
674 * There is one occurence of this instruction per rx_distinct_future.
675 * This instruction is skipped if a rx_distinct_future has no side effects.
676 */
677 rx_do_side_effects = rx_backtrack_point + 1,
678
679 /* data is (struct rx_distinct_future *) */
680
681 /*
682 * RX_CACHE_MISS instructions are stored in rx_distinct_futures whose
683 * destination superstate has been reclaimed (or was never built).
684 * It recomputes the destination superstate.
685 * RX_CACHE_MISS is also stored in a superstate transition table before
686 * any of its edges have been built.
687 */
688 rx_cache_miss = rx_do_side_effects + 1,
689 /* data is (struct rx_distinct_future *) */
690
691 /*
692 * RX_NEXT_CHAR is called to consume the next character and take the
693 * corresponding transition. This is the only instruction that uses
694 * the DATA field of the instruction frame instead of DATA_2.
695 * (see EXPLORE_FUTURE in regex.c).
696 */
697 rx_next_char = rx_cache_miss + 1, /* data is (struct superstate *) */
698
699 /* RX_BACKTRACK indicates that a transition fails.
700 */
701 rx_backtrack = rx_next_char + 1, /* no data */
702
703 /*
704 * RX_ERROR_INX is stored only in places that should never be executed.
705 */
706 rx_error_inx = rx_backtrack + 1, /* Not supposed to occur. */
707
708 rx_num_instructions = rx_error_inx + 1
709 };
710
711 /* An id_instruction_table holds the values stored in instruction
712 * frames. The table is indexed by the enums declared above.
713 */
714 extern void * rx_id_instruction_table[rx_num_instructions];
715
716 /* The heart of the matcher is a `word-code-interpreter'
717 * (like a byte-code interpreter, except that instructions
718 * are a full word wide).
719 *
720 * Instructions are not stored in a vector of code, instead,
721 * they are scattered throughout the data structures built
722 * by the regexp compiler and the matcher. One word-code instruction,
723 * together with the arguments to that instruction, constitute
724 * an instruction frame (struct rx_inx).
725 *
726 * This structure type is padded by hand to a power of 2 because
727 * in one of the dominant cases, we dispatch by indexing a table
728 * of instruction frames. If that indexing can be accomplished
729 * by just a shift of the index, we're happy.
730 *
731 * Instructions take at most one argument, but there are two
732 * slots in an instruction frame that might hold that argument.
733 * These are called data and data_2. The data slot is only
734 * used for one instruction (RX_NEXT_CHAR). For all other
735 * instructions, data should be set to 0.
736 *
737 * RX_NEXT_CHAR is the most important instruction by far.
738 * By reserving the data field for its exclusive use,
739 * instruction dispatch is sped up in that case. There is
740 * no need to fetch both the instruction and the data,
741 * only the data is needed. In other words, a `cycle' begins
742 * by fetching the field data. If that is non-0, then it must
743 * be the destination state of a next_char transition, so
744 * make that value the current state, advance the match position
745 * by one character, and start a new cycle. On the other hand,
746 * if data is 0, fetch the instruction and do a more complicated
747 * dispatch on that.
748 */
749
750 struct rx_inx
751 {
752 void * data;
753 void * data_2;
754 void * inx;
755 void * fnord;
756 };
757
758 #ifndef RX_TAIL_ARRAY
759 #define RX_TAIL_ARRAY 1
760 #endif
761
762 /* A superstate corresponds to a set of nfa states. Those sets are
763 * represented by STRUCT RX_SUPERSET. The constructors
764 * guarantee that only one (shared) structure is created for a given set.
765 */
766 struct rx_superset
767 {
768 int refs; /* This is a reference counted structure. */
769
770 /* We keep these sets in a cache because (in an unpredictable way),
771 * the same set is often created again and again. But that is also
772 * problematic -- compatibility with POSIX and GNU regex requires
773 * that we not be able to tell when a program discards a particular
774 * NFA (thus invalidating the supersets created from it).
775 *
776 * But when a cache hit appears to occur, we will have in hand the
777 * nfa for which it may have happened. That is why every nfa is given
778 * its own sequence number. On a cache hit, the cache is validated
779 * by comparing the nfa sequence number to this field:
780 */
781 int id;
782
783 struct rx_nfa_state * car; /* May or may not be a valid addr. */
784 struct rx_superset * cdr;
785
786 /* If the corresponding superstate exists: */
787 struct rx_superstate * superstate;
788
789
790 /* There is another bookkeeping problem. It is expensive to
791 * compute the starting nfa state set for an nfa. So, once computed,
792 * it is cached in the `struct rx'.
793 *
794 * But, the state set can be flushed from the superstate cache.
795 * When that happens, we can't know if the corresponding `struct rx'
796 * is still alive or if it has been freed or re-used by the program.
797 * So, the cached pointer to this set in a struct rx might be invalid
798 * and we need a way to validate it.
799 *
800 * Fortunately, even if this set is flushed from the cache, it is
801 * not freed. It just goes on the free-list of supersets.
802 * So we can still examine it.
803 *
804 * So to validate a starting set memo, check to see if the
805 * starts_for field still points back to the struct rx in question,
806 * and if the ID matches the rx sequence number.
807 */
808 struct rx * starts_for;
809
810 /* This is used to link into a hash bucket so these objects can
811 * be `hash-consed'.
812 */
813 struct rx_hash_item hash_item;
814 };
815
816 #define rx_protect_superset(RX,CON) (++(CON)->refs)
817
818 /* The terminology may be confusing (rename this structure?).
819 * Every character occurs in at most one rx_super_edge per super-state.
820 * But, that structure might have more than one option, indicating a point
821 * of non-determinism.
822 *
823 * In other words, this structure holds a list of superstate edges
824 * sharing a common starting state and character label. The edges
825 * are in the field OPTIONS. All superstate edges sharing the same
826 * starting state and character are in this list.
827 */
828 struct rx_super_edge
829 {
830 struct rx_super_edge *next;
831 struct rx_inx rx_backtrack_frame;
832 int cset_size;
833 rx_Bitset cset;
834 struct rx_distinct_future *options;
835 };
836
837 /* A superstate is a set of nfa states (RX_SUPERSET) along
838 * with a transition table. Superstates are built on demand and reclaimed
839 * without warning. To protect a superstate from this ghastly fate,
840 * use LOCK_SUPERSTATE.
841 */
842 struct rx_superstate
843 {
844 int rx_id; /* c.f. the id field of rx_superset */
845 int locks; /* protection from reclamation */
846
847 /* Within a superstate cache, all the superstates are kept in a big
848 * queue. The tail of the queue is the state most likely to be
849 * reclaimed. The *recyclable fields hold the queue position of
850 * this state.
851 */
852 struct rx_superstate * next_recyclable;
853 struct rx_superstate * prev_recyclable;
854
855 /* The supernfa edges that exist in the cache and that have
856 * this state as their destination are kept in this list:
857 */
858 struct rx_distinct_future * transition_refs;
859
860 /* The list of nfa states corresponding to this superstate: */
861 struct rx_superset * contents;
862
863 /* The list of edges in the cache beginning from this state. */
864 struct rx_super_edge * edges;
865
866 /* A tail of the recyclable queue is marked as semifree. A semifree
867 * state has no incoming next_char transitions -- any transition
868 * into a semifree state causes a complex dispatch with the side
869 * effect of rescuing the state from its semifree state.
870 *
871 * An alternative to this might be to make next_char more expensive,
872 * and to move a state to the head of the recyclable queue whenever
873 * it is entered. That way, popular states would never be recycled.
874 *
875 * But unilaterally making next_char more expensive actually loses.
876 * So, incoming transitions are only made expensive for states near
877 * the tail of the recyclable queue. The more cache contention
878 * there is, the more frequently a state will have to prove itself
879 * and be moved back to the front of the queue. If there is less
880 * contention, then popular states just aggregate in the front of
881 * the queue and stay there.
882 */
883 int is_semifree;
884
885
886 /* This keeps track of the size of the transition table for this
887 * state. There is a half-hearted attempt to support variable sized
888 * superstates.
889 */
890 int trans_size;
891
892 /* Indexed by characters... */
893 struct rx_inx transitions[RX_TAIL_ARRAY];
894 };
895
896
897 /* A list of distinct futures define the edges that leave from a
898 * given superstate on a given character. c.f. rx_super_edge.
899 */
900
901 struct rx_distinct_future
902 {
903 struct rx_distinct_future * next_same_super_edge[2];
904 struct rx_distinct_future * next_same_dest;
905 struct rx_distinct_future * prev_same_dest;
906 struct rx_superstate * present; /* source state */
907 struct rx_superstate * future; /* destination state */
908 struct rx_super_edge * edge;
909
910
911 /* The future_frame holds the instruction that should be executed
912 * after all the side effects are done, when it is time to complete
913 * the transition to the next state.
914 *
915 * Normally this is a next_char instruction, but it may be a
916 * cache_miss instruction as well, depending on whether or not
917 * the superstate is in the cache and semifree.
918 *
919 * If this is the only future for a given superstate/char, and
920 * if there are no side effects to be performed, this frame is
921 * not used (directly) at all. Instead, its contents are copied
922 * into the transition table of the starting state of this dist. future.
923 */
924 struct rx_inx future_frame;
925
926 struct rx_inx side_effects_frame;
927 struct rx_se_list * effects;
928 };
929
930 #define rx_lock_superstate(R,S) ((S)->locks++)
931 #define rx_unlock_superstate(R,S) (--(S)->locks)
932
933
934 /* This page destined for rx.h */
935
936 struct rx_blocklist
937 {
938 struct rx_blocklist * next;
939 int bytes;
940 };
941
942 struct rx_freelist
943 {
944 struct rx_freelist * next;
945 };
946
947 struct rx_cache;
948
949 #if defined(__STDC__) || defined(MSDOS)
950 typedef void (*rx_morecore_fn)(struct rx_cache *);
951 #else
952 typedef void (*rx_morecore_fn)();
953 #endif
954
955 /* You use this to control the allocation of superstate data
956 * during matching. Most of it should be initialized to 0.
957 *
958 * A MORECORE function is necessary. It should allocate
959 * a new block of memory or return 0.
960 * A default that uses malloc is called `rx_morecore'.
961 *
962 * The number of SUPERSTATES_ALLOWED indirectly limits how much memory
963 * the system will try to allocate. The default is 128. Batch style
964 * applications that are very regexp intensive should use as high a number
965 * as possible without thrashing.
966 *
967 * The LOCAL_CSET_SIZE is the number of characters in a character set.
968 * It is therefore the number of entries in a superstate transition table.
969 * Generally, it should be 256. If your character set has 16 bits,
970 * it is better to translate your regexps into equivalent 8 bit patterns.
971 */
972
973 struct rx_cache
974 {
975 struct rx_hash_rules superset_hash_rules;
976
977 /* Objects are allocated by incrementing a pointer that
978 * scans across rx_blocklists.
979 */
980 struct rx_blocklist * memory;
981 struct rx_blocklist * memory_pos;
982 unsigned int bytes_left;
983 char * memory_addr;
984 rx_morecore_fn morecore;
985
986 /* Freelists. */
987 struct rx_freelist * free_superstates;
988 struct rx_freelist * free_transition_classes;
989 struct rx_freelist * free_discernable_futures;
990 struct rx_freelist * free_supersets;
991 struct rx_freelist * free_hash;
992
993 /* Two sets of superstates -- those that are semifreed, and those
994 * that are being used.
995 */
996 struct rx_superstate * lru_superstate;
997 struct rx_superstate * semifree_superstate;
998
999 struct rx_superset * empty_superset;
1000
1001 int superstates;
1002 int semifree_superstates;
1003 int hits;
1004 int misses;
1005 int superstates_allowed;
1006
1007 int local_cset_size;
1008 void ** instruction_table;
1009
1010 struct rx_hash superset_table;
1011 };
1012
1013
1014
1015 /* The lowest-level search function supports arbitrarily fragmented
1016 * strings and (optionally) suspendable/resumable searches.
1017 *
1018 * Callers have to provide a few hooks.
1019 */
1020
1021 #ifndef __GNUC__
1022 #if defined(__STDC__) || defined(MSDOS)
1023 #define __const__ const
1024 #else
1025 #define __const__
1026 #endif
1027 #endif
1028
1029 /* This holds a matcher position */
1030 struct rx_string_position
1031 {
1032 __const__ unsigned char * pos; /* The current pos. */
1033 __const__ unsigned char * string; /* The current string burst. */
1034 __const__ unsigned char * end; /* First invalid position >= POS. */
1035 int offset; /* Integer address of the current burst. */
1036 int size; /* Current string's size. */
1037 int search_direction; /* 1 or -1 */
1038 int search_end; /* First position to not try. */
1039 };
1040
1041
1042 enum rx_get_burst_return
1043 {
1044 rx_get_burst_continuation,
1045 rx_get_burst_error,
1046 rx_get_burst_ok,
1047 rx_get_burst_no_more
1048 };
1049
1050
1051 /* A call to get burst should make POS valid. It might be invalid
1052 * if the STRING field doesn't point to a burst that actually
1053 * contains POS.
1054 *
1055 * GET_BURST should take a clue from SEARCH_DIRECTION (1 or -1) as to
1056 * whether or not to pad to the left. Padding to the right is always
1057 * appropriate, but need not go past the point indicated by STOP.
1058 *
1059 * If a continuation is returned, then the reentering call to
1060 * a search function will retry the get_burst.
1061 */
1062
1063 #if defined(__STDC__) || defined(MSDOS)
1064 typedef enum rx_get_burst_return
1065 (*rx_get_burst_fn) (struct rx_string_position * pos,
1066 void * app_closure,
1067 int stop);
1068
1069 #else
1070 typedef enum rx_get_burst_return (*rx_get_burst_fn) ();
1071 #endif
1072
1073
1074 enum rx_back_check_return
1075 {
1076 rx_back_check_continuation,
1077 rx_back_check_error,
1078 rx_back_check_pass,
1079 rx_back_check_fail
1080 };
1081
1082 /* Back_check should advance the position it is passed
1083 * over rparen - lparen characters and return pass iff
1084 * the characters starting at POS match those indexed
1085 * by [LPAREN..RPAREN].
1086 *
1087 * If a continuation is returned, then the reentering call to
1088 * a search function will retry the back_check.
1089 */
1090
1091 #if defined(__STDC__)
1092 typedef enum rx_back_check_return
1093 (*rx_back_check_fn) (struct rx_string_position * pos,
1094 int lparen,
1095 int rparen,
1096 unsigned char * translate,
1097 void * app_closure,
1098 int stop);
1099
1100 #else
1101 typedef enum rx_back_check_return (*rx_back_check_fn) ();
1102 #endif
1103
1104
1105
1106
1107 /* A call to fetch_char should return the character at POS or POS + 1.
1108 * Returning continuations here isn't supported. OFFSET is either 0 or 1
1109 * and indicates which characters is desired.
1110 */
1111
1112 #if defined(__STDC__) || defined(MSDOS)
1113 typedef int (*rx_fetch_char_fn) (struct rx_string_position * pos,
1114 int offset,
1115 void * app_closure,
1116 int stop);
1117 #else
1118 typedef int (*rx_fetch_char_fn) ();
1119 #endif
1120
1121
1122 enum rx_search_return
1123 {
1124 rx_search_continuation = -4,
1125 rx_search_error = -3,
1126 rx_search_soft_fail = -2, /* failed by running out of string */
1127 rx_search_fail = -1 /* failed only by reaching failure states */
1128 /* return values >= 0 indicate the position of a successful match */
1129 };
1130
1131
1132
1133
1134
1135
1136 /* regex.h
1137 *
1138 * The remaining declarations replace regex.h.
1139 */
1140
1141 /* This is an array of error messages corresponding to the error codes.
1142 */
1143 extern __const__ char *re_error_msg[];
1144
1145 /* If any error codes are removed, changed, or added, update the
1146 `re_error_msg' table in regex.c. */
1147 typedef enum
1148 {
1149 REG_NOERROR = 0, /* Success. */
1150 REG_NOMATCH, /* Didn't find a match (for regexec). */
1151
1152 /* POSIX regcomp return error codes. (In the order listed in the
1153 standard.) */
1154 REG_BADPAT, /* Invalid pattern. */
1155 REG_ECOLLATE, /* Not implemented. */
1156 REG_ECTYPE, /* Invalid character class name. */
1157 REG_EESCAPE, /* Trailing backslash. */
1158 REG_ESUBREG, /* Invalid back reference. */
1159 REG_EBRACK, /* Unmatched left bracket. */
1160 REG_EPAREN, /* Parenthesis imbalance. */
1161 REG_EBRACE, /* Unmatched \{. */
1162 REG_BADBR, /* Invalid contents of \{\}. */
1163 REG_ERANGE, /* Invalid range end. */
1164 REG_ESPACE, /* Ran out of memory. */
1165 REG_BADRPT, /* No preceding re for repetition op. */
1166
1167 /* Error codes we've added. */
1168 REG_EEND, /* Premature end. */
1169 REG_ESIZE, /* Compiled pattern bigger than 2^16 bytes. */
1170 REG_ERPAREN /* Unmatched ) or \); not returned from regcomp. */
1171 } reg_errcode_t;
1172
1173 /* The regex.c support, as a client of rx, defines a set of possible
1174 * side effects that can be added to the edge lables of nfa edges.
1175 * Here is the list of sidef effects in use.
1176 */
1177
1178 enum re_side_effects
1179 {
1180 #define RX_WANT_SE_DEFS 1
1181 #undef RX_DEF_SE
1182 #undef RX_DEF_CPLX_SE
1183 #define RX_DEF_SE(IDEM, NAME, VALUE) NAME VALUE,
1184 #define RX_DEF_CPLX_SE(IDEM, NAME, VALUE) NAME VALUE,
1185 #include "rx.h"
1186 #undef RX_DEF_SE
1187 #undef RX_DEF_CPLX_SE
1188 #undef RX_WANT_SE_DEFS
1189 #if defined(MSDOS) && !defined(__GO32__)
1190 #define RX_RE_FLOOGLE_FLAP 32765
1191 #else
1192 #define RX_RE_FLOOGLE_FLAP 65533
1193 #endif
1194 re_floogle_flap = RX_RE_FLOOGLE_FLAP
1195 };
1196
1197 /* These hold paramaters for the kinds of side effects that are possible
1198 * in the supported pattern languages. These include things like the
1199 * numeric bounds of {} operators and the index of paren registers for
1200 * subexpression measurement or backreferencing.
1201 */
1202 struct re_se_params
1203 {
1204 enum re_side_effects se;
1205 int op1;
1206 int op2;
1207 };
1208
1209 typedef unsigned long reg_syntax_t;
1210
1211 struct re_pattern_buffer
1212 {
1213 struct rx rx;
1214 reg_syntax_t syntax; /* See below for syntax bit definitions. */
1215
1216 unsigned int no_sub:1; /* If set, don't return register offsets. */
1217 unsigned int not_bol:1; /* If set, the anchors ('^' and '$') don't */
1218 unsigned int not_eol:1; /* match at the ends of the string. */
1219 unsigned int newline_anchor:1;/* If true, an anchor at a newline matches.*/
1220 unsigned int least_subs:1; /* If set, and returning registers, return
1221 * as few values as possible. Only
1222 * backreferenced groups and group 0 (the whole
1223 * match) will be returned.
1224 */
1225
1226 /* If true, this says that the matcher should keep registers on its
1227 * backtracking stack. For many patterns, we can easily determine that
1228 * this isn't necessary.
1229 */
1230 unsigned int match_regs_on_stack:1;
1231 unsigned int search_regs_on_stack:1;
1232
1233 /* is_anchored and begbuf_only are filled in by rx_compile. */
1234 unsigned int is_anchored:1; /* Anchorded by ^? */
1235 unsigned int begbuf_only:1; /* Anchored to char position 0? */
1236
1237
1238 /* If REGS_UNALLOCATED, allocate space in the `regs' structure
1239 * for `max (RE_NREGS, re_nsub + 1)' groups.
1240 * If REGS_REALLOCATE, reallocate space if necessary.
1241 * If REGS_FIXED, use what's there.
1242 */
1243 #define REGS_UNALLOCATED 0
1244 #define REGS_REALLOCATE 1
1245 #define REGS_FIXED 2
1246 unsigned int regs_allocated:2;
1247
1248
1249 /* Either a translate table to apply to all characters before
1250 * comparing them, or zero for no translation. The translation
1251 * is applied to a pattern when it is compiled and to a string
1252 * when it is matched.
1253 */
1254 unsigned char * translate;
1255
1256 /* If this is a valid pointer, it tells rx not to store the extents of
1257 * certain subexpressions (those corresponding to non-zero entries).
1258 * Passing 0x1 is the same as passing an array of all ones. Passing 0x0
1259 * is the same as passing an array of all zeros.
1260 * The array should contain as many entries as their are subexps in the
1261 * regexp.
1262 *
1263 * For POSIX compatability, when using regcomp and regexec this field
1264 * is zeroed and ignored.
1265 */
1266 char * syntax_parens;
1267
1268 /* Number of subexpressions found by the compiler. */
1269 size_t re_nsub;
1270
1271 void * buffer; /* Malloced memory for the nfa. */
1272 unsigned long allocated; /* Size of that memory. */
1273
1274 /* Pointer to a fastmap, if any, otherwise zero. re_search uses
1275 * the fastmap, if there is one, to skip over impossible
1276 * starting points for matches. */
1277 char *fastmap;
1278
1279 unsigned int fastmap_accurate:1; /* These three are internal. */
1280 unsigned int can_match_empty:1;
1281 struct rx_nfa_state * start; /* The nfa starting state. */
1282
1283 /* This is the list of iterator bounds for {lo,hi} constructs.
1284 * The memory pointed to is part of the rx->buffer.
1285 */
1286 struct re_se_params *se_params;
1287
1288 /* This is a bitset representation of the fastmap.
1289 * This is a true fastmap that already takes the translate
1290 * table into account.
1291 */
1292 rx_Bitset fastset;
1293 };
1294
1295 /* Type for byte offsets within the string. POSIX mandates this. */
1296 typedef int regoff_t;
1297
1298 /* This is the structure we store register match data in. See
1299 regex.texinfo for a full description of what registers match. */
1300 struct re_registers
1301 {
1302 unsigned num_regs;
1303 regoff_t *start;
1304 regoff_t *end;
1305 };
1306
1307 typedef struct re_pattern_buffer regex_t;
1308
1309 /* POSIX specification for registers. Aside from the different names than
1310 `re_registers', POSIX uses an array of structures, instead of a
1311 structure of arrays. */
1312 typedef struct
1313 {
1314 regoff_t rm_so; /* Byte offset from string's start to substring's start. */
1315 regoff_t rm_eo; /* Byte offset from string's start to substring's end. */
1316 } regmatch_t;
1317
1318
1319 /* The following bits are used to determine the regexp syntax we
1320 recognize. The set/not-set meanings are chosen so that Emacs syntax
1321 remains the value 0. The bits are given in alphabetical order, and
1322 the definitions shifted by one from the previous bit; thus, when we
1323 add or remove a bit, only one other definition need change. */
1324
1325 /* If this bit is not set, then \ inside a bracket expression is literal.
1326 If set, then such a \ quotes the following character. */
1327 #define RE_BACKSLASH_ESCAPE_IN_LISTS ((unsigned long) 1)
1328
1329 /* If this bit is not set, then + and ? are operators, and \+ and \? are
1330 literals.
1331 If set, then \+ and \? are operators and + and ? are literals. */
1332 #define RE_BK_PLUS_QM (RE_BACKSLASH_ESCAPE_IN_LISTS << 1)
1333
1334 /* If this bit is set, then character classes are supported. They are:
1335 [:alpha:], [:upper:], [:lower:], [:digit:], [:alnum:], [:xdigit:],
1336 [:space:], [:print:], [:punct:], [:graph:], and [:cntrl:].
1337 If not set, then character classes are not supported. */
1338 #define RE_CHAR_CLASSES (RE_BK_PLUS_QM << 1)
1339
1340 /* If this bit is set, then ^ and $ are always anchors (outside bracket
1341 expressions, of course).
1342 If this bit is not set, then it depends:
1343 ^ is an anchor if it is at the beginning of a regular
1344 expression or after an open-group or an alternation operator;
1345 $ is an anchor if it is at the end of a regular expression, or
1346 before a close-group or an alternation operator.
1347
1348 This bit could be (re)combined with RE_CONTEXT_INDEP_OPS, because
1349 POSIX draft 11.2 says that * etc. in leading positions is undefined.
1350 We already implemented a previous draft which made those constructs
1351 invalid, though, so we haven't changed the code back. */
1352 #define RE_CONTEXT_INDEP_ANCHORS (RE_CHAR_CLASSES << 1)
1353
1354 /* If this bit is set, then special characters are always special
1355 regardless of where they are in the pattern.
1356 If this bit is not set, then special characters are special only in
1357 some contexts; otherwise they are ordinary. Specifically,
1358 * + ? and intervals are only special when not after the beginning,
1359 open-group, or alternation operator. */
1360 #define RE_CONTEXT_INDEP_OPS (RE_CONTEXT_INDEP_ANCHORS << 1)
1361
1362 /* If this bit is set, then *, +, ?, and { cannot be first in an re or
1363 immediately after an alternation or begin-group operator. */
1364 #define RE_CONTEXT_INVALID_OPS (RE_CONTEXT_INDEP_OPS << 1)
1365
1366 /* If this bit is set, then . matches newline.
1367 If not set, then it doesn't. */
1368 #define RE_DOT_NEWLINE (RE_CONTEXT_INVALID_OPS << 1)
1369
1370 /* If this bit is set, then . doesn't match NUL.
1371 If not set, then it does. */
1372 #define RE_DOT_NOT_NULL (RE_DOT_NEWLINE << 1)
1373
1374 /* If this bit is set, nonmatching lists [^...] do not match newline.
1375 If not set, they do. */
1376 #define RE_HAT_LISTS_NOT_NEWLINE (RE_DOT_NOT_NULL << 1)
1377
1378 /* If this bit is set, either \{...\} or {...} defines an
1379 interval, depending on RE_NO_BK_BRACES.
1380 If not set, \{, \}, {, and } are literals. */
1381 #define RE_INTERVALS (RE_HAT_LISTS_NOT_NEWLINE << 1)
1382
1383 /* If this bit is set, +, ? and | aren't recognized as operators.
1384 If not set, they are. */
1385 #define RE_LIMITED_OPS (RE_INTERVALS << 1)
1386
1387 /* If this bit is set, newline is an alternation operator.
1388 If not set, newline is literal. */
1389 #define RE_NEWLINE_ALT (RE_LIMITED_OPS << 1)
1390
1391 /* If this bit is set, then `{...}' defines an interval, and \{ and \}
1392 are literals.
1393 If not set, then `\{...\}' defines an interval. */
1394 #define RE_NO_BK_BRACES (RE_NEWLINE_ALT << 1)
1395
1396 /* If this bit is set, (...) defines a group, and \( and \) are literals.
1397 If not set, \(...\) defines a group, and ( and ) are literals. */
1398 #define RE_NO_BK_PARENS (RE_NO_BK_BRACES << 1)
1399
1400 /* If this bit is set, then \<digit> matches <digit>.
1401 If not set, then \<digit> is a back-reference. */
1402 #define RE_NO_BK_REFS (RE_NO_BK_PARENS << 1)
1403
1404 /* If this bit is set, then | is an alternation operator, and \| is literal.
1405 If not set, then \| is an alternation operator, and | is literal. */
1406 #define RE_NO_BK_VBAR (RE_NO_BK_REFS << 1)
1407
1408 /* If this bit is set, then an ending range point collating higher
1409 than the starting range point, as in [z-a], is invalid.
1410 If not set, then when ending range point collates higher than the
1411 starting range point, the range is ignored. */
1412 #define RE_NO_EMPTY_RANGES (RE_NO_BK_VBAR << 1)
1413
1414 /* If this bit is set, then an unmatched ) is ordinary.
1415 If not set, then an unmatched ) is invalid. */
1416 #define RE_UNMATCHED_RIGHT_PAREN_ORD (RE_NO_EMPTY_RANGES << 1)
1417
1418 /* This global variable defines the particular regexp syntax to use (for
1419 some interfaces). When a regexp is compiled, the syntax used is
1420 stored in the pattern buffer, so changing this does not affect
1421 already-compiled regexps. */
1422 extern reg_syntax_t re_syntax_options;
1423
1424 /* Define combinations of the above bits for the standard possibilities.
1425 (The [[[ comments delimit what gets put into the Texinfo file, so
1426 don't delete them!) */
1427 /* [[[begin syntaxes]]] */
1428 #define RE_SYNTAX_EMACS 0
1429
1430 #define RE_SYNTAX_AWK \
1431 (RE_BACKSLASH_ESCAPE_IN_LISTS | RE_DOT_NOT_NULL \
1432 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
1433 | RE_NO_BK_VBAR | RE_NO_EMPTY_RANGES \
1434 | RE_UNMATCHED_RIGHT_PAREN_ORD)
1435
1436 #define RE_SYNTAX_POSIX_AWK \
1437 (RE_SYNTAX_POSIX_EXTENDED | RE_BACKSLASH_ESCAPE_IN_LISTS)
1438
1439 #define RE_SYNTAX_GREP \
1440 (RE_BK_PLUS_QM | RE_CHAR_CLASSES \
1441 | RE_HAT_LISTS_NOT_NEWLINE | RE_INTERVALS \
1442 | RE_NEWLINE_ALT)
1443
1444 #define RE_SYNTAX_EGREP \
1445 (RE_CHAR_CLASSES | RE_CONTEXT_INDEP_ANCHORS \
1446 | RE_CONTEXT_INDEP_OPS | RE_HAT_LISTS_NOT_NEWLINE \
1447 | RE_NEWLINE_ALT | RE_NO_BK_PARENS \
1448 | RE_NO_BK_VBAR)
1449
1450 #define RE_SYNTAX_POSIX_EGREP \
1451 (RE_SYNTAX_EGREP | RE_INTERVALS | RE_NO_BK_BRACES)
1452
1453 #define RE_SYNTAX_SED RE_SYNTAX_POSIX_BASIC
1454
1455 /* Syntax bits common to both basic and extended POSIX regex syntax. */
1456 #define _RE_SYNTAX_POSIX_COMMON \
1457 (RE_CHAR_CLASSES | RE_DOT_NEWLINE | RE_DOT_NOT_NULL \
1458 | RE_INTERVALS | RE_NO_EMPTY_RANGES)
1459
1460 #define RE_SYNTAX_POSIX_BASIC \
1461 (_RE_SYNTAX_POSIX_COMMON | RE_BK_PLUS_QM)
1462
1463 /* Differs from ..._POSIX_BASIC only in that RE_BK_PLUS_QM becomes
1464 RE_LIMITED_OPS, i.e., \? \+ \| are not recognized. Actually, this
1465 isn't minimal, since other operators, such as \`, aren't disabled. */
1466 #define RE_SYNTAX_POSIX_MINIMAL_BASIC \
1467 (_RE_SYNTAX_POSIX_COMMON | RE_LIMITED_OPS)
1468
1469 #define RE_SYNTAX_POSIX_EXTENDED \
1470 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
1471 | RE_CONTEXT_INDEP_OPS | RE_NO_BK_BRACES \
1472 | RE_NO_BK_PARENS | RE_NO_BK_VBAR \
1473 | RE_UNMATCHED_RIGHT_PAREN_ORD)
1474
1475 /* Differs from ..._POSIX_EXTENDED in that RE_CONTEXT_INVALID_OPS
1476 replaces RE_CONTEXT_INDEP_OPS and RE_NO_BK_REFS is added. */
1477 #define RE_SYNTAX_POSIX_MINIMAL_EXTENDED \
1478 (_RE_SYNTAX_POSIX_COMMON | RE_CONTEXT_INDEP_ANCHORS \
1479 | RE_CONTEXT_INVALID_OPS | RE_NO_BK_BRACES \
1480 | RE_NO_BK_PARENS | RE_NO_BK_REFS \
1481 | RE_NO_BK_VBAR | RE_UNMATCHED_RIGHT_PAREN_ORD)
1482 /* [[[end syntaxes]]] */
1483
1484 /* Maximum number of duplicates an interval can allow. Some systems
1485 (erroneously) define this in other header files, but we want our
1486 value, so remove any previous define. */
1487 #ifdef RE_DUP_MAX
1488 #undef RE_DUP_MAX
1489 #endif
1490 #define RE_DUP_MAX (0x7fff)
1491
1492
1493
1494 /* POSIX `cflags' bits (i.e., information for `regcomp'). */
1495
1496 /* If this bit is set, then use extended regular expression syntax.
1497 If not set, then use basic regular expression syntax. */
1498 #define REG_EXTENDED 1
1499
1500 /* If this bit is set, then ignore case when matching.
1501 If not set, then case is significant. */
1502 #define REG_ICASE (REG_EXTENDED << 1)
1503
1504 /* If this bit is set, then anchors do not match at newline
1505 characters in the string.
1506 If not set, then anchors do match at newlines. */
1507 #define REG_NEWLINE (REG_ICASE << 1)
1508
1509 /* If this bit is set, then report only success or fail in regexec.
1510 If not set, then returns differ between not matching and errors. */
1511 #define REG_NOSUB (REG_NEWLINE << 1)
1512
1513
1514 /* POSIX `eflags' bits (i.e., information for regexec). */
1515
1516 /* If this bit is set, then the beginning-of-line operator doesn't match
1517 the beginning of the string (presumably because it's not the
1518 beginning of a line).
1519 If not set, then the beginning-of-line operator does match the
1520 beginning of the string. */
1521 #define REG_NOTBOL 1
1522
1523 /* Like REG_NOTBOL, except for the end-of-line. */
1524 #define REG_NOTEOL (1 << 1)
1525
1526 /* If `regs_allocated' is REGS_UNALLOCATED in the pattern buffer,
1527 * `re_match_2' returns information about at least this many registers
1528 * the first time a `regs' structure is passed.
1529 *
1530 * Also, this is the greatest number of backreferenced subexpressions
1531 * allowed in a pattern being matched without caller-supplied registers.
1532 */
1533 #ifndef RE_NREGS
1534 #define RE_NREGS 30
1535 #endif
1536
1537 extern int rx_cache_bound;
1538 extern char rx_version_string[];
1539
1540
1541
1542 #ifdef RX_WANT_RX_DEFS
1543
1544 /* This is decls to the interesting subsystems and lower layers
1545 * of rx. Everything which doesn't have a public counterpart in
1546 * regex.c is declared here.
1547 */
1548
1549
1550 #if defined(__STDC__)
1551 typedef void (*rx_hash_freefn) (struct rx_hash_item * it);
1552 #else /* ndef __STDC__ */
1553 typedef void (*rx_hash_freefn) ();
1554 #endif /* ndef __STDC__ */
1555
1556
1557
1558
1559 #if defined(__STDC__) || defined(MSDOS)
1560 RX_DECL int rx_bitset_is_equal (int size, rx_Bitset a, rx_Bitset b);
1561 RX_DECL int rx_bitset_is_subset (int size, rx_Bitset a, rx_Bitset b);
1562 RX_DECL int rx_bitset_empty (int size, rx_Bitset set);
1563 RX_DECL void rx_bitset_null (int size, rx_Bitset b);
1564 RX_DECL void rx_bitset_universe (int size, rx_Bitset b);
1565 RX_DECL void rx_bitset_complement (int size, rx_Bitset b);
1566 RX_DECL void rx_bitset_assign (int size, rx_Bitset a, rx_Bitset b);
1567 RX_DECL void rx_bitset_union (int size, rx_Bitset a, rx_Bitset b);
1568 RX_DECL void rx_bitset_intersection (int size,
1569 rx_Bitset a, rx_Bitset b);
1570 RX_DECL void rx_bitset_difference (int size, rx_Bitset a, rx_Bitset b);
1571 #if 0
1572 RX_DECL void rx_bitset_revdifference (int size,
1573 rx_Bitset a, rx_Bitset b);
1574 #endif
1575 #ifdef emacs
1576 RX_DECL void rx_bitset_xor (int size, rx_Bitset a, rx_Bitset b);
1577 #endif
1578 RX_DECL unsigned long rx_bitset_hash (int size, rx_Bitset b);
1579 RX_DECL struct rx_hash_item * rx_hash_find (struct rx_hash * table,
1580 unsigned long hash,
1581 void * value,
1582 struct rx_hash_rules * rules);
1583 RX_DECL struct rx_hash_item * rx_hash_store (struct rx_hash * table,
1584 unsigned long hash,
1585 void * value,
1586 struct rx_hash_rules * rules);
1587 RX_DECL void rx_hash_free (struct rx_hash_item * it, struct rx_hash_rules * rules);
1588 RX_DECL void rx_free_hash_table (struct rx_hash * tab, rx_hash_freefn freefn,
1589 struct rx_hash_rules * rules);
1590 RX_DECL rx_Bitset rx_cset (struct rx *rx);
1591 RX_DECL rx_Bitset rx_copy_cset (struct rx *rx, rx_Bitset a);
1592 RX_DECL void rx_free_cset (struct rx * rx, rx_Bitset c);
1593 RX_DECL struct rexp_node * rexp_node (struct rx *rx,
1594 enum rexp_node_type type);
1595 RX_DECL struct rexp_node * rx_mk_r_cset (struct rx * rx,
1596 rx_Bitset b);
1597 RX_DECL struct rexp_node * rx_mk_r_concat (struct rx * rx,
1598 struct rexp_node * a,
1599 struct rexp_node * b);
1600 RX_DECL struct rexp_node * rx_mk_r_alternate (struct rx * rx,
1601 struct rexp_node * a,
1602 struct rexp_node * b);
1603 RX_DECL struct rexp_node * rx_mk_r_opt (struct rx * rx,
1604 struct rexp_node * a);
1605 RX_DECL struct rexp_node * rx_mk_r_star (struct rx * rx,
1606 struct rexp_node * a);
1607 RX_DECL struct rexp_node * rx_mk_r_2phase_star (struct rx * rx,
1608 struct rexp_node * a,
1609 struct rexp_node * b);
1610 RX_DECL struct rexp_node * rx_mk_r_side_effect (struct rx * rx,
1611 rx_side_effect a);
1612 #if 0
1613 RX_DECL struct rexp_node * rx_mk_r_data (struct rx * rx,
1614 void * a);
1615 #endif
1616 RX_DECL void rx_free_rexp (struct rx * rx, struct rexp_node * node);
1617 RX_DECL struct rexp_node * rx_copy_rexp (struct rx *rx,
1618 struct rexp_node *node);
1619 RX_DECL struct rx_nfa_state * rx_nfa_state (struct rx *rx);
1620 RX_DECL void rx_free_nfa_state (struct rx_nfa_state * n);
1621 RX_DECL struct rx_nfa_state * rx_id_to_nfa_state (struct rx * rx,
1622 int id);
1623 RX_DECL struct rx_nfa_edge * rx_nfa_edge (struct rx *rx,
1624 enum rx_nfa_etype type,
1625 struct rx_nfa_state *start,
1626 struct rx_nfa_state *dest);
1627 RX_DECL void rx_free_nfa_edge (struct rx_nfa_edge * e);
1628 RX_DECL void rx_free_nfa (struct rx *rx);
1629 RX_DECL int rx_build_nfa (struct rx *rx,
1630 struct rexp_node *rexp,
1631 struct rx_nfa_state **start,
1632 struct rx_nfa_state **end);
1633 RX_DECL void rx_name_nfa_states (struct rx *rx);
1634 RX_DECL int rx_eclose_nfa (struct rx *rx);
1635 RX_DECL void rx_delete_epsilon_transitions (struct rx *rx);
1636 RX_DECL int rx_compactify_nfa (struct rx *rx,
1637 void **mem, unsigned long *size);
1638 RX_DECL void rx_release_superset (struct rx *rx,
1639 struct rx_superset *set);
1640 RX_DECL struct rx_superset * rx_superset_cons (struct rx * rx,
1641 struct rx_nfa_state *car, struct rx_superset *cdr);
1642 RX_DECL struct rx_superset * rx_superstate_eclosure_union
1643 (struct rx * rx, struct rx_superset *set, struct rx_nfa_state_set *ecl);
1644 RX_DECL struct rx_superstate * rx_superstate (struct rx *rx,
1645 struct rx_superset *set);
1646 RX_DECL struct rx_inx * rx_handle_cache_miss
1647 (struct rx *rx, struct rx_superstate *super, unsigned char chr, void *data);
1648 RX_DECL reg_errcode_t rx_compile (__const__ char *pattern, int size,
1649 reg_syntax_t syntax,
1650 struct re_pattern_buffer * rxb);
1651 RX_DECL void rx_blow_up_fastmap (struct re_pattern_buffer * rxb);
1652 #else /* STDC */
1653 RX_DECL int rx_bitset_is_equal ();
1654 RX_DECL int rx_bitset_is_subset ();
1655 RX_DECL int rx_bitset_empty ();
1656 RX_DECL void rx_bitset_null ();
1657 RX_DECL void rx_bitset_universe ();
1658 RX_DECL void rx_bitset_complement ();
1659 RX_DECL void rx_bitset_assign ();
1660 RX_DECL void rx_bitset_union ();
1661 RX_DECL void rx_bitset_intersection ();
1662 RX_DECL void rx_bitset_difference ();
1663 #if 0
1664 RX_DECL void rx_bitset_revdifference ();
1665 #endif
1666 #ifdef emacs
1667 RX_DECL void rx_bitset_xor ();
1668 #endif
1669 RX_DECL unsigned long rx_bitset_hash ();
1670 RX_DECL struct rx_hash_item * rx_hash_find ();
1671 RX_DECL struct rx_hash_item * rx_hash_store ();
1672 RX_DECL void rx_hash_free ();
1673 RX_DECL void rx_free_hash_table ();
1674 RX_DECL rx_Bitset rx_cset ();
1675 RX_DECL rx_Bitset rx_copy_cset ();
1676 RX_DECL void rx_free_cset ();
1677 RX_DECL struct rexp_node * rexp_node ();
1678 RX_DECL struct rexp_node * rx_mk_r_cset ();
1679 RX_DECL struct rexp_node * rx_mk_r_concat ();
1680 RX_DECL struct rexp_node * rx_mk_r_alternate ();
1681 RX_DECL struct rexp_node * rx_mk_r_opt ();
1682 RX_DECL struct rexp_node * rx_mk_r_star ();
1683 RX_DECL struct rexp_node * rx_mk_r_2phase_star ();
1684 RX_DECL struct rexp_node * rx_mk_r_side_effect ();
1685 #if 0
1686 RX_DECL struct rexp_node * rx_mk_r_data ();
1687 #endif
1688 RX_DECL void rx_free_rexp ();
1689 RX_DECL struct rexp_node * rx_copy_rexp ();
1690 RX_DECL struct rx_nfa_state * rx_nfa_state ();
1691 RX_DECL void rx_free_nfa_state ();
1692 RX_DECL struct rx_nfa_state * rx_id_to_nfa_state ();
1693 RX_DECL struct rx_nfa_edge * rx_nfa_edge ();
1694 RX_DECL void rx_free_nfa_edge ();
1695 RX_DECL void rx_free_nfa ();
1696 RX_DECL int rx_build_nfa ();
1697 RX_DECL void rx_name_nfa_states ();
1698 RX_DECL int rx_eclose_nfa ();
1699 RX_DECL void rx_delete_epsilon_transitions ();
1700 RX_DECL int rx_compactify_nfa ();
1701 RX_DECL void rx_release_superset ();
1702 RX_DECL struct rx_superset * rx_superset_cons ();
1703 RX_DECL struct rx_superset * rx_superstate_eclosure_union ();
1704 RX_DECL struct rx_superstate * rx_superstate ();
1705 RX_DECL struct rx_inx * rx_handle_cache_miss ();
1706 RX_DECL reg_errcode_t rx_compile ();
1707 RX_DECL void rx_blow_up_fastmap ();
1708 #endif /* STDC */
1709
1710
1711 #endif /* RX_WANT_RX_DEFS */
1712
1713
1714
1715 #if defined(__STDC__) || defined(MSDOS)
1716 extern int re_search_2 (struct re_pattern_buffer *rxb,
1717 __const__ char * string1, int size1,
1718 __const__ char * string2, int size2,
1719 int startpos, int range,
1720 struct re_registers *regs,
1721 int stop);
1722 extern int re_search (struct re_pattern_buffer * rxb, __const__ char *string,
1723 int size, int startpos, int range,
1724 struct re_registers *regs);
1725 extern int re_match_2 (struct re_pattern_buffer * rxb,
1726 __const__ char * string1, int size1,
1727 __const__ char * string2, int size2,
1728 int pos, struct re_registers *regs, int stop);
1729 extern int re_match (struct re_pattern_buffer * rxb,
1730 __const__ char * string,
1731 int size, int pos,
1732 struct re_registers *regs);
1733 extern reg_syntax_t re_set_syntax (reg_syntax_t syntax);
1734 extern void re_set_registers (struct re_pattern_buffer *bufp,
1735 struct re_registers *regs,
1736 unsigned num_regs,
1737 regoff_t * starts, regoff_t * ends);
1738 extern __const__ char * re_compile_pattern (__const__ char *pattern,
1739 int length,
1740 struct re_pattern_buffer * rxb);
1741 extern int re_compile_fastmap (struct re_pattern_buffer * rxb);
1742 extern char * re_comp (__const__ char *s);
1743 extern int re_exec (__const__ char *s);
1744 extern int regcomp (regex_t * preg, __const__ char * pattern, int cflags);
1745 extern int regexec (__const__ regex_t *preg, __const__ char *string,
1746 size_t nmatch, regmatch_t pmatch[],
1747 int eflags);
1748 extern size_t regerror (int errcode, __const__ regex_t *preg,
1749 char *errbuf, size_t errbuf_size);
1750 extern void regfree (regex_t *preg);
1751
1752 #else /* STDC */
1753 extern int re_search_2 ();
1754 extern int re_search ();
1755 extern int re_match_2 ();
1756 extern int re_match ();
1757 extern reg_syntax_t re_set_syntax ();
1758 extern void re_set_registers ();
1759 extern __const__ char * re_compile_pattern ();
1760 extern int re_compile_fastmap ();
1761 extern char * re_comp ();
1762 extern int re_exec ();
1763 extern int regcomp ();
1764 extern int regexec ();
1765 extern size_t regerror ();
1766 extern void regfree ();
1767
1768 #endif /* STDC */
1769
1770
1771
1772 #ifdef RX_WANT_RX_DEFS
1773
1774 #include "mbc.h"
1775
1776 struct rx_counter_frame
1777 {
1778 int tag;
1779 int val;
1780 struct rx_counter_frame * inherited_from; /* If this is a copy. */
1781 struct rx_counter_frame * cdr;
1782 };
1783
1784 struct rx_backtrack_frame
1785 {
1786 char * counter_stack_sp;
1787
1788 /* A frame is used to save the matchers state when it crosses a
1789 * backtracking point. The `stk_' fields correspond to variables
1790 * in re_search_2 (just strip off thes `stk_'). They are documented
1791 * tere.
1792 */
1793 struct rx_superstate * stk_super;
1794 unsigned int stk_c;
1795 struct rx_string_position stk_test_pos;
1796 int stk_last_l;
1797 int stk_last_r;
1798 int stk_test_ret;
1799
1800 /* This is the list of options left to explore at the backtrack
1801 * point for which this frame was created.
1802 */
1803 struct rx_distinct_future * df;
1804 struct rx_distinct_future * first_df;
1805
1806 #ifdef RX_DEBUG
1807 int stk_line_no;
1808 #endif
1809 };
1810
1811 struct rx_stack_chunk
1812 {
1813 struct rx_stack_chunk * next_chunk;
1814 unsigned int bytes_left;
1815 char * sp;
1816 };
1817
1818 enum rx_outer_entry
1819 {
1820 rx_outer_start,
1821 rx_outer_fastmap,
1822 rx_outer_test,
1823 rx_outer_restore_pos
1824 };
1825
1826 enum rx_fastmap_return
1827 {
1828 rx_fastmap_continuation,
1829 rx_fastmap_error,
1830 rx_fastmap_ok,
1831 rx_fastmap_fail
1832 };
1833
1834 enum rx_fastmap_entry
1835 {
1836 rx_fastmap_start,
1837 rx_fastmap_string_break
1838 };
1839
1840 enum rx_test_return
1841 {
1842 rx_test_continuation,
1843 rx_test_error,
1844 rx_test_fail,
1845 rx_test_ok
1846 };
1847
1848 enum rx_test_internal_return
1849 {
1850 rx_test_internal_error,
1851 rx_test_found_first,
1852 rx_test_line_finished
1853 };
1854
1855 enum rx_test_match_entry
1856 {
1857 rx_test_start,
1858 rx_test_cache_hit_loop,
1859 rx_test_backreference_check,
1860 rx_test_backtrack_return
1861 };
1862
1863 struct rx_search_state
1864 {
1865 /* Two groups of registers are kept. The group with the register state
1866 * of the current test match, and the group that holds the state at the end
1867 * of the best known match, if any.
1868 *
1869 * For some patterns, there may also be registers saved on the stack.
1870 */
1871 unsigned num_regs; /* Includes an element for register zero. */
1872 regoff_t * lparen; /* scratch space for register returns */
1873 regoff_t * rparen;
1874 regoff_t * best_lpspace; /* in case the user doesn't want these */
1875 regoff_t * best_rpspace; /* values, we still need space to store
1876 * them. Normally, this memoryis unused
1877 * and the space pointed to by REGS is
1878 * used instead.
1879 */
1880
1881 int last_l; /* Highest index of a valid lparen. */
1882 int last_r; /* It's dual. */
1883
1884 int * best_lparen; /* This contains the best known register */
1885 int * best_rparen; /* assignments.
1886 * This may point to the same mem as
1887 * best_lpspace, or it might point to memory
1888 * passed by the caller.
1889 */
1890 int best_last_l; /* best_last_l:best_lparen::last_l:lparen */
1891 int best_last_r;
1892
1893
1894 unsigned char * translate;
1895
1896 struct rx_string_position outer_pos;
1897
1898 struct rx_superstate * start_super;
1899 int nfa_choice;
1900 int first_found; /* If true, return after finding any match. */
1901 int ret_val;
1902
1903 /* For continuations... */
1904 enum rx_outer_entry outer_search_resume_pt;
1905 struct re_pattern_buffer * saved_rxb;
1906 int saved_startpos;
1907 int saved_range;
1908 int saved_stop;
1909 int saved_total_size;
1910 rx_get_burst_fn saved_get_burst;
1911 rx_back_check_fn saved_back_check;
1912 struct re_registers * saved_regs;
1913
1914 /**
1915 ** state for fastmap
1916 **/
1917 char * fastmap;
1918 int fastmap_chr;
1919 int fastmap_val;
1920
1921 /* for continuations in the fastmap procedure: */
1922 enum rx_fastmap_entry fastmap_resume_pt;
1923
1924 /**
1925 ** state for test_match
1926 **/
1927
1928 /* The current superNFA position of the matcher. */
1929 struct rx_superstate * super;
1930
1931 /* The matcher interprets a series of instruction frames.
1932 * This is the `instruction counter' for the interpretation.
1933 */
1934 struct rx_inx * ifr;
1935
1936 /* We insert a ghost character in the string to prime
1937 * the nfa. test_pos.pos, test_pos.str_half, and test_pos.end_half
1938 * keep track of the test-match position and string-half.
1939 */
1940 unsigned char c;
1941
1942 /* Position within the string. */
1943 struct rx_string_position test_pos;
1944
1945 struct rx_stack_chunk * counter_stack;
1946 struct rx_stack_chunk * backtrack_stack;
1947 int backtrack_frame_bytes;
1948 int chunk_bytes;
1949 struct rx_stack_chunk * free_chunks;
1950
1951 /* To return from this function, set test_ret and
1952 * `goto test_do_return'.
1953 *
1954 * Possible return values are:
1955 * 1 --- end of string while the superNFA is still going
1956 * 0 --- internal error (out of memory)
1957 * -1 --- search completed by reaching the superNFA fail state
1958 * -2 --- a match was found, maybe not the longest.
1959 *
1960 * When the search is complete (-1), best_last_r indicates whether
1961 * a match was found.
1962 *
1963 * -2 is return only if search_state.first_found is non-zero.
1964 *
1965 * if search_state.first_found is non-zero, a return of -1 indicates no match,
1966 * otherwise, best_last_r has to be checked.
1967 */
1968 int test_ret;
1969
1970 int could_have_continued;
1971
1972 #ifdef RX_DEBUG
1973 int backtrack_depth;
1974 /* There is a search tree with every node as set of deterministic
1975 * transitions in the super nfa. For every branch of a
1976 * backtrack point is an edge in the tree.
1977 * This counts up a pre-order of nodes in that tree.
1978 * It's saved on the search stack and printed when debugging.
1979 */
1980 int line_no;
1981 int lines_found;
1982 #endif
1983
1984
1985 /* For continuations within the match tester */
1986 enum rx_test_match_entry test_match_resume_pt;
1987 struct rx_inx * saved_next_tr_table;
1988 struct rx_inx * saved_this_tr_table;
1989 int saved_reg;
1990 struct rx_backtrack_frame * saved_bf;
1991
1992 };
1993
1994
1995 extern char rx_slowmap[];
1996 extern unsigned char rx_id_translation[];
1997
1998 static __inline__ void
init_fastmap(rxb,search_state)1999 init_fastmap (rxb, search_state)
2000 struct re_pattern_buffer * rxb;
2001 struct rx_search_state * search_state;
2002 {
2003 search_state->fastmap = (rxb->fastmap
2004 ? (char *)rxb->fastmap
2005 : (char *)rx_slowmap);
2006 /* Update the fastmap now if not correct already.
2007 * When the regexp was compiled, the fastmap was computed
2008 * and stored in a bitset. This expands the bitset into a
2009 * character array containing 1s and 0s.
2010 */
2011 if ((search_state->fastmap == rxb->fastmap) && !rxb->fastmap_accurate)
2012 rx_blow_up_fastmap (rxb);
2013 search_state->fastmap_chr = -1;
2014 search_state->fastmap_val = 0;
2015 search_state->fastmap_resume_pt = rx_fastmap_start;
2016 }
2017
2018 static __inline__ void
uninit_fastmap(rxb,search_state)2019 uninit_fastmap (rxb, search_state)
2020 struct re_pattern_buffer * rxb;
2021 struct rx_search_state * search_state;
2022 {
2023 /* Unset the fastmap sentinel */
2024 if (search_state->fastmap_chr >= 0)
2025 search_state->fastmap[search_state->fastmap_chr]
2026 = search_state->fastmap_val;
2027 }
2028
2029 static __inline__ int
fastmap_search(rxb,stop,get_burst,app_closure,search_state)2030 fastmap_search (rxb, stop, get_burst, app_closure, search_state)
2031 struct re_pattern_buffer * rxb;
2032 int stop;
2033 rx_get_burst_fn get_burst;
2034 void * app_closure;
2035 struct rx_search_state * search_state;
2036 {
2037 enum rx_fastmap_entry pc;
2038
2039 if (0)
2040 {
2041 return_continuation:
2042 search_state->fastmap_resume_pt = pc;
2043 return rx_fastmap_continuation;
2044 }
2045
2046 pc = search_state->fastmap_resume_pt;
2047
2048 switch (pc)
2049 {
2050 default:
2051 return rx_fastmap_error;
2052 case rx_fastmap_start:
2053 init_fastmap_sentinal:
2054 /* For the sake of fast fastmapping, set a sentinal in the fastmap.
2055 * This sentinal will trap the fastmap loop when it reaches the last
2056 * valid character in a string half.
2057 *
2058 * This must be reset when the fastmap/search loop crosses a string
2059 * boundry, and before returning to the caller. So sometimes,
2060 * the fastmap loop is restarted with `continue', othertimes by
2061 * `goto init_fastmap_sentinal'.
2062 */
2063 if (search_state->outer_pos.size)
2064 {
2065 search_state->fastmap_chr = ((search_state->outer_pos.search_direction > 0)
2066 ? *(search_state->outer_pos.end - 1)
2067 : *search_state->outer_pos.string);
2068 search_state->fastmap_val
2069 = search_state->fastmap[search_state->fastmap_chr];
2070 search_state->fastmap[search_state->fastmap_chr] = 1;
2071 }
2072 else
2073 {
2074 search_state->fastmap_chr = -1;
2075 search_state->fastmap_val = 0;
2076 }
2077
2078 if (search_state->outer_pos.pos >= search_state->outer_pos.end)
2079 goto fastmap_hit_bound;
2080 else
2081 {
2082 if (search_state->outer_pos.search_direction > 0)
2083 {
2084 if (search_state->fastmap_val)
2085 {
2086 for (;;)
2087 {
2088 while (!search_state->fastmap[*search_state->outer_pos.pos]) {
2089 if (ismbchar (*search_state->outer_pos.pos)) {
2090 search_state->outer_pos.pos += 2;
2091 if (search_state->outer_pos.pos == search_state->outer_pos.end)
2092 goto fastmap_hit_bound;
2093 continue;
2094 }
2095 ++search_state->outer_pos.pos;
2096 }
2097 return rx_fastmap_ok;
2098 }
2099 }
2100 else
2101 {
2102 for (;;)
2103 {
2104 while (!search_state->fastmap[*search_state->outer_pos.pos]) {
2105 if (ismbchar (*search_state->outer_pos.pos)) {
2106 search_state->outer_pos.pos += 2;
2107 if (search_state->outer_pos.pos == search_state->outer_pos.end)
2108 goto fastmap_hit_bound;
2109 continue;
2110 }
2111 ++search_state->outer_pos.pos;
2112 }
2113 if (*search_state->outer_pos.pos != search_state->fastmap_chr)
2114 return rx_fastmap_ok;
2115 else
2116 {
2117 ++search_state->outer_pos.pos;
2118 if (search_state->outer_pos.pos == search_state->outer_pos.end)
2119 goto fastmap_hit_bound;
2120 }
2121 }
2122 }
2123 }
2124 else
2125 {
2126 __const__ unsigned char * bound;
2127 bound = search_state->outer_pos.string - 1;
2128 if (search_state->fastmap_val)
2129 {
2130 for (;;)
2131 {
2132 while (!search_state->fastmap[*search_state->outer_pos.pos])
2133 --search_state->outer_pos.pos;
2134 return rx_fastmap_ok;
2135 }
2136 }
2137 else
2138 {
2139 for (;;)
2140 {
2141 while (!search_state->fastmap[*search_state->outer_pos.pos])
2142 --search_state->outer_pos.pos;
2143 if ((*search_state->outer_pos.pos != search_state->fastmap_chr) || search_state->fastmap_val)
2144 return rx_fastmap_ok;
2145 else
2146 {
2147 --search_state->outer_pos.pos;
2148 if (search_state->outer_pos.pos == bound)
2149 goto fastmap_hit_bound;
2150 }
2151 }
2152 }
2153 }
2154 }
2155
2156 case rx_fastmap_string_break:
2157 fastmap_hit_bound:
2158 {
2159 /* If we hit a bound, it may be time to fetch another burst
2160 * of string, or it may be time to return a continuation to
2161 * the caller, or it might be time to fail.
2162 */
2163
2164 int burst_state;
2165 burst_state = get_burst (&search_state->outer_pos, app_closure, stop);
2166 switch (burst_state)
2167 {
2168 default:
2169 case rx_get_burst_error:
2170 return rx_fastmap_error;
2171 case rx_get_burst_continuation:
2172 {
2173 pc = rx_fastmap_string_break;
2174 goto return_continuation;
2175 }
2176 case rx_get_burst_ok:
2177 goto init_fastmap_sentinal;
2178 case rx_get_burst_no_more:
2179 /* ...not a string split, simply no more string.
2180 *
2181 * When searching backward, running out of string
2182 * is reason to quit.
2183 *
2184 * When searching forward, we allow the possibility
2185 * of an (empty) match after the last character in the
2186 * virtual string. So, fall through to the matcher
2187 */
2188 return ( (search_state->outer_pos.search_direction > 0)
2189 ? rx_fastmap_ok
2190 : rx_fastmap_fail);
2191 }
2192 }
2193 }
2194
2195 }
2196
2197
2198
2199 #ifdef emacs
2200 /* The `emacs' switch turns on certain matching commands
2201 * that make sense only in Emacs.
2202 */
2203 #include "config.h"
2204 #include "lisp.h"
2205 #include "buffer.h"
2206 #include "syntax.h"
2207 #endif /* emacs */
2208
2209 /* Setting RX_MEMDBUG is useful if you have dbmalloc. Maybe with similar
2210 * packages too.
2211 */
2212 #ifdef RX_MEMDBUG
2213 #include <malloc.h>
2214 #endif /* RX_RX_MEMDBUG */
2215
2216 /* We used to test for `BSTRING' here, but only GCC and Emacs define
2217 * `BSTRING', as far as I know, and neither of them use this code.
2218 */
2219 #if HAVE_STRING_H || STDC_HEADERS
2220 #include <string.h>
2221
2222 #ifndef bcmp
2223 #define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
2224 #endif
2225
2226 #ifndef bcopy
2227 #define bcopy(s, d, n) memcpy ((d), (s), (n))
2228 #endif
2229
2230 #ifndef bzero
2231 #define bzero(s, n) memset ((s), 0, (n))
2232 #endif
2233
2234 #else /* HAVE_STRING_H || STDC_HEADERS */
2235 #include <strings.h>
2236 #endif /* not (HAVE_STRING_H || STDC_HEADERS) */
2237
2238 #ifdef STDC_HEADERS
2239 #include <stdlib.h>
2240 #else /* not STDC_HEADERS */
2241 char *malloc ();
2242 char *realloc ();
2243 #endif /* not STDC_HEADERS */
2244
2245
2246
2247
2248 /* How many characters in the character set. */
2249 #define CHAR_SET_SIZE (1 << CHARBITS)
2250
2251 #ifndef emacs
2252 /* Define the syntax basics for \<, \>, etc.
2253 * This must be nonzero for the wordchar and notwordchar pattern
2254 * commands in re_match_2.
2255 */
2256 #ifndef Sword
2257 #define Sword 1
2258 #endif
2259 #define SYNTAX(c) re_syntax_table[c]
2260 RX_DECL char re_syntax_table[CHAR_SET_SIZE];
2261 #endif /* not emacs */
2262
2263
2264 /* Test if at very beginning or at very end of the virtual concatenation
2265 * of `string1' and `string2'. If only one string, it's `string2'.
2266 */
2267
2268 #define AT_STRINGS_BEG() \
2269 ( -1 \
2270 == ((search_state.test_pos.pos - search_state.test_pos.string) \
2271 + search_state.test_pos.offset))
2272
2273 #define AT_STRINGS_END() \
2274 ( (total_size - 1) \
2275 == ((search_state.test_pos.pos - search_state.test_pos.string) \
2276 + search_state.test_pos.offset))
2277
2278
2279 /* Test if POS + 1 points to a character which is word-constituent. We have
2280 * two special cases to check for: if past the end of string1, look at
2281 * the first character in string2; and if before the beginning of
2282 * string2, look at the last character in string1.
2283 *
2284 * Assumes `string1' exists, so use in conjunction with AT_STRINGS_BEG ().
2285 */
2286 #define LETTER_P(POS,OFF) \
2287 ( SYNTAX (fetch_char(POS, OFF, app_closure, stop)) \
2288 == Sword)
2289
2290 /* Test if the character at D and the one after D differ with respect
2291 * to being word-constituent.
2292 */
2293 #define AT_WORD_BOUNDARY(d) \
2294 (AT_STRINGS_BEG () || AT_STRINGS_END () || LETTER_P (d,0) != LETTER_P (d, 1))
2295
2296
2297 #ifdef RX_SUPPORT_CONTINUATIONS
2298 #define RX_STACK_ALLOC(BYTES) malloc(BYTES)
2299 #define RX_STACK_FREE(MEM) free(MEM)
2300 #else
2301 #define RX_STACK_ALLOC(BYTES) alloca(BYTES)
2302 #define RX_STACK_FREE(MEM) \
2303 ((struct rx_stack_chunk *)MEM)->next_chunk = search_state.free_chunks; \
2304 search_state.free_chunks = ((struct rx_stack_chunk *)MEM);
2305
2306 #endif
2307
2308 #define PUSH(CHUNK_VAR,BYTES) \
2309 if (!CHUNK_VAR || (CHUNK_VAR->bytes_left < (BYTES))) \
2310 { \
2311 struct rx_stack_chunk * new_chunk; \
2312 if (search_state.free_chunks) \
2313 { \
2314 new_chunk = search_state.free_chunks; \
2315 search_state.free_chunks = search_state.free_chunks->next_chunk; \
2316 } \
2317 else \
2318 { \
2319 new_chunk = (struct rx_stack_chunk *)RX_STACK_ALLOC(search_state.chunk_bytes); \
2320 if (!new_chunk) \
2321 { \
2322 search_state.ret_val = 0; \
2323 goto test_do_return; \
2324 } \
2325 } \
2326 new_chunk->sp = (char *)new_chunk + sizeof (struct rx_stack_chunk); \
2327 new_chunk->bytes_left = (search_state.chunk_bytes \
2328 - (BYTES) \
2329 - sizeof (struct rx_stack_chunk)); \
2330 new_chunk->next_chunk = CHUNK_VAR; \
2331 CHUNK_VAR = new_chunk; \
2332 } \
2333 else \
2334 (CHUNK_VAR->sp += (BYTES)), (CHUNK_VAR->bytes_left -= (BYTES))
2335
2336 #define POP(CHUNK_VAR,BYTES) \
2337 if (CHUNK_VAR->sp == ((char *)CHUNK_VAR + sizeof(*CHUNK_VAR))) \
2338 { \
2339 struct rx_stack_chunk * new_chunk = CHUNK_VAR->next_chunk; \
2340 RX_STACK_FREE(CHUNK_VAR); \
2341 CHUNK_VAR = new_chunk; \
2342 } \
2343 else \
2344 (CHUNK_VAR->sp -= BYTES), (CHUNK_VAR->bytes_left += BYTES)
2345
2346
2347
2348 #define SRCH_TRANSLATE(C) search_state.translate[(unsigned char) (C)]
2349
2350
2351
2352
2353 #if defined(__STDC__) || defined(MSDOS)
2354 RX_DECL __inline__ int
rx_search(struct re_pattern_buffer * rxb,int startpos,int range,int stop,int total_size,rx_get_burst_fn get_burst,rx_back_check_fn back_check,rx_fetch_char_fn fetch_char,void * app_closure,struct re_registers * regs,struct rx_search_state * resume_state,struct rx_search_state * save_state)2355 rx_search (struct re_pattern_buffer * rxb,
2356 int startpos,
2357 int range,
2358 int stop,
2359 int total_size,
2360 rx_get_burst_fn get_burst,
2361 rx_back_check_fn back_check,
2362 rx_fetch_char_fn fetch_char,
2363 void * app_closure,
2364 struct re_registers * regs,
2365 struct rx_search_state * resume_state,
2366 struct rx_search_state * save_state)
2367 #else
2368 RX_DECL __inline__ int
2369 rx_search (rxb, startpos, range, stop, total_size,
2370 get_burst, back_check, fetch_char,
2371 app_closure, regs, resume_state, save_state)
2372 struct re_pattern_buffer * rxb;
2373 int startpos;
2374 int range;
2375 int stop;
2376 int total_size;
2377 rx_get_burst_fn get_burst;
2378 rx_back_check_fn back_check;
2379 rx_fetch_char_fn fetch_char;
2380 void * app_closure;
2381 struct re_registers * regs;
2382 struct rx_search_state * resume_state;
2383 struct rx_search_state * save_state;
2384 #endif
2385 {
2386 int pc;
2387 int test_state;
2388 struct rx_search_state search_state;
2389
2390 search_state.free_chunks = 0;
2391 if (!resume_state)
2392 pc = rx_outer_start;
2393 else
2394 {
2395 search_state = *resume_state;
2396 regs = search_state.saved_regs;
2397 rxb = search_state.saved_rxb;
2398 startpos = search_state.saved_startpos;
2399 range = search_state.saved_range;
2400 stop = search_state.saved_stop;
2401 total_size = search_state.saved_total_size;
2402 get_burst = search_state.saved_get_burst;
2403 back_check = search_state.saved_back_check;
2404 pc = search_state.outer_search_resume_pt;
2405 if (0)
2406 {
2407 return_continuation:
2408 if (save_state)
2409 {
2410 *save_state = search_state;
2411 save_state->saved_regs = regs;
2412 save_state->saved_rxb = rxb;
2413 save_state->saved_startpos = startpos;
2414 save_state->saved_range = range;
2415 save_state->saved_stop = stop;
2416 save_state->saved_total_size = total_size;
2417 save_state->saved_get_burst = get_burst;
2418 save_state->saved_back_check = back_check;
2419 save_state->outer_search_resume_pt = pc;
2420 }
2421 return rx_search_continuation;
2422 }
2423 }
2424
2425 switch (pc)
2426 {
2427 case rx_outer_start:
2428 search_state.ret_val = rx_search_fail;
2429 ( search_state.lparen
2430 = search_state.rparen
2431 = search_state.best_lpspace
2432 = search_state.best_rpspace
2433 = 0);
2434
2435 /* figure the number of registers we may need for use in backreferences.
2436 * the number here includes an element for register zero.
2437 */
2438 search_state.num_regs = rxb->re_nsub + 1;
2439
2440
2441 /* check for out-of-range startpos. */
2442 if ((startpos < 0) || (startpos > total_size))
2443 return rx_search_fail;
2444
2445 /* fix up range if it might eventually take us outside the string. */
2446 {
2447 int endpos;
2448 endpos = startpos + range;
2449 if (endpos < -1)
2450 range = (-1 - startpos);
2451 else if (endpos > (total_size + 1))
2452 range = total_size - startpos;
2453 }
2454
2455 /* if the search isn't to be a backwards one, don't waste time in a
2456 * long search for a pattern that says it is anchored.
2457 */
2458 if (rxb->begbuf_only && (range > 0))
2459 {
2460 if (startpos > 0)
2461 return rx_search_fail;
2462 else
2463 range = 1;
2464 }
2465
2466 /* decide whether to use internal or user-provided reg buffers. */
2467 if (!regs || rxb->no_sub)
2468 {
2469 search_state.best_lpspace =
2470 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2471 search_state.best_rpspace =
2472 (regoff_t *)REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2473 search_state.best_lparen = search_state.best_lpspace;
2474 search_state.best_rparen = search_state.best_rpspace;
2475 }
2476 else
2477 {
2478 /* have the register data arrays been allocated? */
2479 if (rxb->regs_allocated == REGS_UNALLOCATED)
2480 { /* no. so allocate them with malloc. we need one
2481 extra element beyond `search_state.num_regs' for the `-1' marker
2482 gnu code uses. */
2483 regs->num_regs = MAX (RE_NREGS, rxb->re_nsub + 1);
2484 regs->start = ((regoff_t *)
2485 malloc (regs->num_regs * sizeof ( regoff_t)));
2486 regs->end = ((regoff_t *)
2487 malloc (regs->num_regs * sizeof ( regoff_t)));
2488 if (regs->start == 0 || regs->end == 0)
2489 return rx_search_error;
2490 rxb->regs_allocated = REGS_REALLOCATE;
2491 }
2492 else if (rxb->regs_allocated == REGS_REALLOCATE)
2493 { /* yes. if we need more elements than were already
2494 allocated, reallocate them. if we need fewer, just
2495 leave it alone. */
2496 if (regs->num_regs < search_state.num_regs + 1)
2497 {
2498 regs->num_regs = search_state.num_regs + 1;
2499 regs->start = ((regoff_t *)
2500 realloc (regs->start,
2501 regs->num_regs * sizeof (regoff_t)));
2502 regs->end = ((regoff_t *)
2503 realloc (regs->end,
2504 regs->num_regs * sizeof ( regoff_t)));
2505 if (regs->start == 0 || regs->end == 0)
2506 return rx_search_error;
2507 }
2508 }
2509 else if (rxb->regs_allocated != REGS_FIXED)
2510 return rx_search_error;
2511
2512 if (regs->num_regs < search_state.num_regs + 1)
2513 {
2514 search_state.best_lpspace =
2515 ((regoff_t *)
2516 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
2517 search_state.best_rpspace =
2518 ((regoff_t *)
2519 REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t)));
2520 search_state.best_lparen = search_state.best_lpspace;
2521 search_state.best_rparen = search_state.best_rpspace;
2522 }
2523 else
2524 {
2525 search_state.best_lparen = regs->start;
2526 search_state.best_rparen = regs->end;
2527 }
2528 }
2529
2530 search_state.lparen =
2531 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2532 search_state.rparen =
2533 (regoff_t *) REGEX_ALLOCATE (search_state.num_regs * sizeof(regoff_t));
2534
2535 if (! ( search_state.best_rparen
2536 && search_state.best_lparen
2537 && search_state.lparen && search_state.rparen))
2538 return rx_search_error;
2539
2540 search_state.best_last_l = search_state.best_last_r = -1;
2541
2542 search_state.translate = (rxb->translate
2543 ? rxb->translate
2544 : rx_id_translation);
2545
2546
2547
2548 /*
2549 * two nfa's were compiled.
2550 * `0' is complete.
2551 * `1' faster but gets registers wrong and ends too soon.
2552 */
2553 search_state.nfa_choice = (regs && !rxb->least_subs) ? '\0' : '\1';
2554
2555 /* we have the option to look for the best match or the first
2556 * one we can find. if the user isn't asking for register information,
2557 * we don't need to find the best match.
2558 */
2559 search_state.first_found = !regs;
2560
2561 if (range >= 0)
2562 {
2563 search_state.outer_pos.search_end = startpos + range;
2564 search_state.outer_pos.search_direction = 1;
2565 }
2566 else
2567 {
2568 search_state.outer_pos.search_end = startpos + range;
2569 search_state.outer_pos.search_direction = -1;
2570 }
2571
2572 /* the vacuous search always turns up nothing. */
2573 if ((search_state.outer_pos.search_direction > 0)
2574 ? (startpos > search_state.outer_pos.search_end)
2575 : (startpos < search_state.outer_pos.search_end))
2576 return rx_search_fail;
2577
2578 /* now we build the starting state of the supernfa. */
2579 {
2580 struct rx_superset * start_contents;
2581 struct rx_nfa_state_set * start_nfa_set;
2582
2583 /* we presume here that the nfa start state has only one
2584 * possible future with no side effects.
2585 */
2586 start_nfa_set = rxb->start->futures->destset;
2587 if ( rxb->rx.start_set
2588 && (rxb->rx.start_set->starts_for == &rxb->rx))
2589 start_contents = rxb->rx.start_set;
2590 else
2591 {
2592 start_contents =
2593 rx_superstate_eclosure_union (&rxb->rx,
2594 rx_superset_cons (&rxb->rx, 0, 0),
2595 start_nfa_set);
2596
2597 if (!start_contents)
2598 return rx_search_fail;
2599
2600 start_contents->starts_for = &rxb->rx;
2601 rxb->rx.start_set = start_contents;
2602 }
2603 if ( start_contents->superstate
2604 && (start_contents->superstate->rx_id == rxb->rx.rx_id))
2605 {
2606 search_state.start_super = start_contents->superstate;
2607 rx_lock_superstate (&rxb->rx, search_state.start_super);
2608 }
2609 else
2610 {
2611 rx_protect_superset (&rxb->rx, start_contents);
2612
2613 search_state.start_super = rx_superstate (&rxb->rx, start_contents);
2614 if (!search_state.start_super)
2615 return rx_search_fail;
2616 rx_lock_superstate (&rxb->rx, search_state.start_super);
2617 rx_release_superset (&rxb->rx, start_contents);
2618 }
2619 }
2620
2621
2622 /* The outer_pos tracks the position within the strings
2623 * as seen by loop that calls fastmap_search.
2624 *
2625 * The caller supplied get_burst function actually
2626 * gives us pointers to chars.
2627 *
2628 * Communication with the get_burst function is through an
2629 * rx_string_position structure. Here, the structure for
2630 * outer_pos is initialized. It is set to point to the
2631 * NULL string, at an offset of STARTPOS. STARTPOS is out
2632 * of range of the NULL string, so the first call to
2633 * getburst will patch up the rx_string_position to point
2634 * to valid characters.
2635 */
2636
2637 ( search_state.outer_pos.string
2638 = search_state.outer_pos.end
2639 = 0);
2640
2641 search_state.outer_pos.offset = 0;
2642 search_state.outer_pos.size = 0;
2643 search_state.outer_pos.pos = (unsigned char *)startpos;
2644 init_fastmap (rxb, &search_state);
2645
2646 search_state.fastmap_resume_pt = rx_fastmap_start;
2647 case rx_outer_fastmap:
2648 /* do { */
2649 pseudo_do:
2650 {
2651 {
2652 int fastmap_state;
2653 fastmap_state = fastmap_search (rxb, stop, get_burst, app_closure,
2654 &search_state);
2655 switch (fastmap_state)
2656 {
2657 case rx_fastmap_continuation:
2658 pc = rx_outer_fastmap;
2659 goto return_continuation;
2660 case rx_fastmap_fail:
2661 goto finish;
2662 case rx_fastmap_ok:
2663 break;
2664 }
2665 }
2666
2667 /* now the fastmap loop has brought us to a plausible
2668 * starting point for a match. so, it's time to run the
2669 * nfa and see if a match occured.
2670 */
2671 startpos = ( search_state.outer_pos.pos
2672 - search_state.outer_pos.string
2673 + search_state.outer_pos.offset);
2674 #if 0
2675 /*|*/ if ((range > 0) && (startpos == search_state.outer_pos.search_end))
2676 /*|*/ goto finish;
2677 #endif
2678 }
2679
2680 search_state.test_match_resume_pt = rx_test_start;
2681 /* do interrupted for entry point... */
2682 case rx_outer_test:
2683 /* ...do continued */
2684 {
2685 goto test_match;
2686 test_returns_to_search:
2687 switch (test_state)
2688 {
2689 case rx_test_continuation:
2690 pc = rx_outer_test;
2691 goto return_continuation;
2692 case rx_test_error:
2693 search_state.ret_val = rx_search_error;
2694 goto finish;
2695 case rx_test_fail:
2696 break;
2697 case rx_test_ok:
2698 goto finish;
2699 }
2700 if (search_state.outer_pos.search_direction > 0)
2701 {
2702 if (ismbchar (*search_state.outer_pos.pos))
2703 {
2704 search_state.outer_pos.pos++;
2705 startpos++;
2706 }
2707 search_state.outer_pos.pos++;
2708 startpos++;
2709 }
2710 else
2711 {
2712 __const__ unsigned char *p;
2713 search_state.outer_pos.pos--;
2714 startpos--;
2715 for (p = search_state.outer_pos.pos;
2716 p-- > search_state.outer_pos.string && ismbchar (*p);
2717 /* no-op */)
2718 ;
2719 if (!((search_state.outer_pos.pos - p) & 1))
2720 {
2721 search_state.outer_pos.pos--;
2722 startpos--;
2723 }
2724 }
2725 #if 0
2726 /*|*/ if (search_state.test_pos.pos < search_state.test_pos.end)
2727 /*|*/ break;
2728 #endif
2729 }
2730 /* do interrupted for entry point... */
2731 case rx_outer_restore_pos:
2732 {
2733 int x;
2734 x = get_burst (&search_state.outer_pos, app_closure, stop);
2735 switch (x)
2736 {
2737 case rx_get_burst_continuation:
2738 pc = rx_outer_restore_pos;
2739 goto return_continuation;
2740 case rx_get_burst_error:
2741 search_state.ret_val = rx_search_error;
2742 goto finish;
2743 case rx_get_burst_no_more:
2744 if (rxb->can_match_empty)
2745 break;
2746 goto finish;
2747 case rx_get_burst_ok:
2748 break;
2749 }
2750 } /* } while (...see below...) */
2751
2752 if ((search_state.outer_pos.search_direction > 0)
2753 ? (startpos <= search_state.outer_pos.search_end)
2754 : (startpos > search_state.outer_pos.search_end))
2755 goto pseudo_do;
2756
2757
2758 finish:
2759 uninit_fastmap (rxb, &search_state);
2760 if (search_state.start_super)
2761 rx_unlock_superstate (&rxb->rx, search_state.start_super);
2762
2763 #ifdef regex_malloc
2764 if (search_state.lparen) free (search_state.lparen);
2765 if (search_state.rparen) free (search_state.rparen);
2766 if (search_state.best_lpspace) free (search_state.best_lpspace);
2767 if (search_state.best_rpspace) free (search_state.best_rpspace);
2768 #endif
2769 return search_state.ret_val;
2770 }
2771
2772
2773 test_match:
2774 {
2775 enum rx_test_match_entry test_pc;
2776 int inx;
2777 test_pc = search_state.test_match_resume_pt;
2778 if (test_pc == rx_test_start)
2779 {
2780 #ifdef RX_DEBUG
2781 search_state.backtrack_depth = 0;
2782 #endif
2783 search_state.last_l = search_state.last_r = 0;
2784 search_state.lparen[0] = startpos;
2785 search_state.super = search_state.start_super;
2786 search_state.c = search_state.nfa_choice;
2787 search_state.test_pos.pos = search_state.outer_pos.pos - 1;
2788 search_state.test_pos.string = search_state.outer_pos.string;
2789 search_state.test_pos.end = search_state.outer_pos.end;
2790 search_state.test_pos.offset = search_state.outer_pos.offset;
2791 search_state.test_pos.size = search_state.outer_pos.size;
2792 search_state.test_pos.search_direction = 1;
2793 search_state.counter_stack = 0;
2794 search_state.backtrack_stack = 0;
2795 search_state.backtrack_frame_bytes =
2796 (sizeof (struct rx_backtrack_frame)
2797 + (rxb->match_regs_on_stack
2798 ? sizeof (regoff_t) * (search_state.num_regs + 1) * 2
2799 : 0));
2800 search_state.chunk_bytes = search_state.backtrack_frame_bytes * 64;
2801 search_state.test_ret = rx_test_line_finished;
2802 search_state.could_have_continued = 0;
2803 }
2804 /* This is while (1)...except that the body of the loop is interrupted
2805 * by some alternative entry points.
2806 */
2807 pseudo_while_1:
2808 switch (test_pc)
2809 {
2810 case rx_test_cache_hit_loop:
2811 goto resume_continuation_1;
2812 case rx_test_backreference_check:
2813 goto resume_continuation_2;
2814 case rx_test_backtrack_return:
2815 goto resume_continuation_3;
2816 case rx_test_start:
2817 #ifdef RX_DEBUG
2818 /* There is a search tree with every node as set of deterministic
2819 * transitions in the super nfa. For every branch of a
2820 * backtrack point is an edge in the tree.
2821 * This counts up a pre-order of nodes in that tree.
2822 * It's saved on the search stack and printed when debugging.
2823 */
2824 search_state.line_no = 0;
2825 search_state.lines_found = 0;
2826 #endif
2827
2828 top_of_cycle:
2829 /* A superstate is basicly a transition table, indexed by
2830 * characters from the string being tested, and containing
2831 * RX_INX (`instruction frame') structures.
2832 */
2833 search_state.ifr = &search_state.super->transitions [search_state.c];
2834
2835 recurse_test_match:
2836 /* This is the point to which control is sent when the
2837 * test matcher `recurses'. Before jumping here, some variables
2838 * need to be saved on the stack and the next instruction frame
2839 * has to be computed.
2840 */
2841
2842 restart:
2843 /* Some instructions don't advance the matcher, but just
2844 * carry out some side effects and fetch a new instruction.
2845 * To dispatch that new instruction, `goto restart'.
2846 */
2847
2848 {
2849 struct rx_inx * next_tr_table;
2850 struct rx_inx * this_tr_table;
2851 /* The fastest route through the loop is when the instruction
2852 * is RX_NEXT_CHAR. This case is detected when SEARCH_STATE.IFR->DATA
2853 * is non-zero. In that case, it points to the next
2854 * superstate.
2855 *
2856 * This allows us to not bother fetching the bytecode.
2857 */
2858 next_tr_table = (struct rx_inx *)search_state.ifr->data;
2859 this_tr_table = search_state.super->transitions;
2860 while (next_tr_table)
2861 {
2862 #ifdef RX_DEBUG_0
2863 if (rx_debug_trace)
2864 {
2865 struct rx_superset * setp;
2866
2867 fprintf (stderr, "%d %d>> re_next_char @ %d (%d)",
2868 search_state.line_no,
2869 search_state.backtrack_depth,
2870 (search_state.test_pos.pos - search_state.test_pos.string
2871 + search_state.test_pos.offset), search_state.c);
2872
2873 search_state.super =
2874 ((struct rx_superstate *)
2875 ((char *)this_tr_table
2876 - ((unsigned long)
2877 ((struct rx_superstate *)0)->transitions)));
2878
2879 setp = search_state.super->contents;
2880 fprintf (stderr, " superstet (rx=%d, &=%x: ",
2881 rxb->rx.rx_id, setp);
2882 while (setp)
2883 {
2884 fprintf (stderr, "%d ", setp->id);
2885 setp = setp->cdr;
2886 }
2887 fprintf (stderr, "\n");
2888 }
2889 #endif
2890 this_tr_table = next_tr_table;
2891 ++search_state.test_pos.pos;
2892 if (search_state.test_pos.pos == search_state.test_pos.end)
2893 {
2894 int burst_state;
2895 try_burst_1:
2896 burst_state = get_burst (&search_state.test_pos,
2897 app_closure, stop);
2898 switch (burst_state)
2899 {
2900 case rx_get_burst_continuation:
2901 search_state.saved_this_tr_table = this_tr_table;
2902 search_state.saved_next_tr_table = next_tr_table;
2903 test_pc = rx_test_cache_hit_loop;
2904 goto test_return_continuation;
2905
2906 resume_continuation_1:
2907 /* Continuation one jumps here to do its work: */
2908 search_state.saved_this_tr_table = this_tr_table;
2909 search_state.saved_next_tr_table = next_tr_table;
2910 goto try_burst_1;
2911
2912 case rx_get_burst_ok:
2913 /* get_burst succeeded...keep going */
2914 break;
2915
2916 case rx_get_burst_no_more:
2917 search_state.test_ret = rx_test_line_finished;
2918 search_state.could_have_continued = 1;
2919 goto test_do_return;
2920
2921 case rx_get_burst_error:
2922 /* An error... */
2923 search_state.test_ret = rx_test_internal_error;
2924 goto test_do_return;
2925 }
2926 }
2927 search_state.c = *search_state.test_pos.pos;
2928 search_state.ifr = this_tr_table + search_state.c;
2929 next_tr_table = (struct rx_inx *)search_state.ifr->data;
2930 } /* Fast loop through cached transition tables */
2931
2932 /* Here when we ran out of cached next-char transitions.
2933 * So, it will be necessary to do a more expensive
2934 * dispatch on the current instruction. The superstate
2935 * pointer is allowed to become invalid during next-char
2936 * transitions -- now we must bring it up to date.
2937 */
2938 search_state.super =
2939 ((struct rx_superstate *)
2940 ((char *)this_tr_table
2941 - ((unsigned long)
2942 ((struct rx_superstate *)0)->transitions)));
2943 }
2944
2945 /* We've encountered an instruction other than next-char.
2946 * Dispatch that instruction:
2947 */
2948 inx = (int)search_state.ifr->inx;
2949 #ifdef RX_DEBUG_0
2950 if (rx_debug_trace)
2951 {
2952 struct rx_superset * setp = search_state.super->contents;
2953
2954 fprintf (stderr, "%d %d>> %s @ %d (%d)", search_state.line_no,
2955 search_state.backtrack_depth,
2956 inx_names[inx],
2957 (search_state.test_pos.pos - search_state.test_pos.string
2958 + (test_pos.half == 0 ? 0 : size1)), search_state.c);
2959
2960 fprintf (stderr, " superstet (rx=%d, &=%x: ",
2961 rxb->rx.rx_id, setp);
2962 while (setp)
2963 {
2964 fprintf (stderr, "%d ", setp->id);
2965 setp = setp->cdr;
2966 }
2967 fprintf (stderr, "\n");
2968 }
2969 #endif
2970 switch ((enum rx_opcode)inx)
2971 {
2972 case rx_do_side_effects:
2973
2974 /* RX_DO_SIDE_EFFECTS occurs when we cross epsilon
2975 * edges associated with parentheses, backreferencing, etc.
2976 */
2977 {
2978 struct rx_distinct_future * df =
2979 (struct rx_distinct_future *)search_state.ifr->data_2;
2980 struct rx_se_list * el = df->effects;
2981 /* Side effects come in lists. This walks down
2982 * a list, dispatching.
2983 */
2984 while (el)
2985 {
2986 long effect;
2987 effect = (long)el->car;
2988 if (effect < 0)
2989 {
2990 #ifdef RX_DEBUG_0
2991 if (rx_debug_trace)
2992 {
2993 struct rx_superset * setp = search_state.super->contents;
2994
2995 fprintf (stderr, "....%d %d>> %s\n", search_state.line_no,
2996 search_state.backtrack_depth,
2997 efnames[-effect]);
2998 }
2999 #endif
3000 switch ((enum re_side_effects) effect)
3001
3002 {
3003 case re_se_pushback:
3004 search_state.ifr = &df->future_frame;
3005 if (!search_state.ifr->data)
3006 {
3007 struct rx_superstate * sup;
3008 sup = search_state.super;
3009 rx_lock_superstate (rx, sup);
3010 if (!rx_handle_cache_miss (&rxb->rx,
3011 search_state.super,
3012 search_state.c,
3013 (search_state.ifr
3014 ->data_2)))
3015 {
3016 rx_unlock_superstate (rx, sup);
3017 search_state.test_ret = rx_test_internal_error;
3018 goto test_do_return;
3019 }
3020 rx_unlock_superstate (rx, sup);
3021 }
3022 /* --search_state.test_pos.pos; */
3023 search_state.c = 't';
3024 search_state.super
3025 = ((struct rx_superstate *)
3026 ((char *)search_state.ifr->data
3027 - (long)(((struct rx_superstate *)0)
3028 ->transitions)));
3029 goto top_of_cycle;
3030 break;
3031 case re_se_push0:
3032 {
3033 struct rx_counter_frame * old_cf
3034 = (search_state.counter_stack
3035 ? ((struct rx_counter_frame *)
3036 search_state.counter_stack->sp)
3037 : 0);
3038 struct rx_counter_frame * cf;
3039 PUSH (search_state.counter_stack,
3040 sizeof (struct rx_counter_frame));
3041 cf = ((struct rx_counter_frame *)
3042 search_state.counter_stack->sp);
3043 cf->tag = re_se_iter;
3044 cf->val = 0;
3045 cf->inherited_from = 0;
3046 cf->cdr = old_cf;
3047 break;
3048 }
3049 case re_se_fail:
3050 goto test_do_return;
3051 case re_se_begbuf:
3052 if (!AT_STRINGS_BEG ())
3053 goto test_do_return;
3054 break;
3055 case re_se_endbuf:
3056 if (!AT_STRINGS_END ())
3057 goto test_do_return;
3058 break;
3059 case re_se_wordbeg:
3060 if ( LETTER_P (&search_state.test_pos, 1)
3061 && ( AT_STRINGS_BEG()
3062 || !LETTER_P (&search_state.test_pos, 0)))
3063 break;
3064 else
3065 goto test_do_return;
3066 case re_se_wordend:
3067 if ( !AT_STRINGS_BEG ()
3068 && LETTER_P (&search_state.test_pos, 0)
3069 && (AT_STRINGS_END ()
3070 || !LETTER_P (&search_state.test_pos, 1)))
3071 break;
3072 else
3073 goto test_do_return;
3074 case re_se_wordbound:
3075 if (AT_WORD_BOUNDARY (&search_state.test_pos))
3076 break;
3077 else
3078 goto test_do_return;
3079 case re_se_notwordbound:
3080 if (!AT_WORD_BOUNDARY (&search_state.test_pos))
3081 break;
3082 else
3083 goto test_do_return;
3084 case re_se_hat:
3085 if (AT_STRINGS_BEG ())
3086 {
3087 if (rxb->not_bol)
3088 goto test_do_return;
3089 else
3090 break;
3091 }
3092 else
3093 {
3094 char pos_c = *search_state.test_pos.pos;
3095 if ( (SRCH_TRANSLATE (pos_c)
3096 == SRCH_TRANSLATE('\n'))
3097 && rxb->newline_anchor)
3098 break;
3099 else
3100 goto test_do_return;
3101 }
3102 case re_se_dollar:
3103 if (AT_STRINGS_END ())
3104 {
3105 if (rxb->not_eol)
3106 goto test_do_return;
3107 else
3108 break;
3109 }
3110 else
3111 {
3112 if ( ( SRCH_TRANSLATE (fetch_char
3113 (&search_state.test_pos, 1,
3114 app_closure, stop))
3115 == SRCH_TRANSLATE ('\n'))
3116 && rxb->newline_anchor)
3117 break;
3118 else
3119 goto test_do_return;
3120 }
3121
3122 case re_se_try:
3123 /* This is the first side effect in every
3124 * expression.
3125 *
3126 * FOR NO GOOD REASON...get rid of it...
3127 */
3128 break;
3129
3130 case re_se_pushpos:
3131 {
3132 int urhere =
3133 ((int)(search_state.test_pos.pos
3134 - search_state.test_pos.string)
3135 + search_state.test_pos.offset);
3136 struct rx_counter_frame * old_cf
3137 = (search_state.counter_stack
3138 ? ((struct rx_counter_frame *)
3139 search_state.counter_stack->sp)
3140 : 0);
3141 struct rx_counter_frame * cf;
3142 PUSH(search_state.counter_stack,
3143 sizeof (struct rx_counter_frame));
3144 cf = ((struct rx_counter_frame *)
3145 search_state.counter_stack->sp);
3146 cf->tag = re_se_pushpos;
3147 cf->val = urhere;
3148 cf->inherited_from = 0;
3149 cf->cdr = old_cf;
3150 break;
3151 }
3152
3153 case re_se_chkpos:
3154 {
3155 int urhere =
3156 ((int)(search_state.test_pos.pos
3157 - search_state.test_pos.string)
3158 + search_state.test_pos.offset);
3159 struct rx_counter_frame * cf
3160 = ((struct rx_counter_frame *)
3161 search_state.counter_stack->sp);
3162 if (cf->val == urhere)
3163 goto test_do_return;
3164 cf->val = urhere;
3165 break;
3166 }
3167 break;
3168
3169 case re_se_poppos:
3170 POP(search_state.counter_stack,
3171 sizeof (struct rx_counter_frame));
3172 break;
3173
3174
3175 case re_se_at_dot:
3176 case re_se_syntax:
3177 case re_se_not_syntax:
3178 #ifdef emacs
3179 /*
3180 * this release lacks emacs support
3181 */
3182 #endif
3183 break;
3184 case re_se_win:
3185 case re_se_lparen:
3186 case re_se_rparen:
3187 case re_se_backref:
3188 case re_se_iter:
3189 case re_se_end_iter:
3190 case re_se_tv:
3191 case re_floogle_flap:
3192 search_state.ret_val = 0;
3193 goto test_do_return;
3194 }
3195 }
3196 else
3197 {
3198 #ifdef RX_DEBUG_0
3199 if (rx_debug_trace)
3200 fprintf (stderr, "....%d %d>> %s %d %d\n", search_state.line_no,
3201 search_state.backtrack_depth,
3202 efnames2[rxb->se_params [effect].se],
3203 rxb->se_params [effect].op1,
3204 rxb->se_params [effect].op2);
3205 #endif
3206 switch (rxb->se_params [effect].se)
3207 {
3208 case re_se_win:
3209 /* This side effect indicates that we've
3210 * found a match, though not necessarily the
3211 * best match. This is a fancy assignment to
3212 * register 0 unless the caller didn't
3213 * care about registers. In which case,
3214 * this stops the match.
3215 */
3216 {
3217 int urhere =
3218 ((int)(search_state.test_pos.pos
3219 - search_state.test_pos.string)
3220 + search_state.test_pos.offset);
3221
3222 if ( (search_state.best_last_r < 0)
3223 || (urhere + 1 > search_state.best_rparen[0]))
3224 {
3225 /* Record the best known and keep
3226 * looking.
3227 */
3228 int x;
3229 for (x = 0; x <= search_state.last_l; ++x)
3230 search_state.best_lparen[x] = search_state.lparen[x];
3231 search_state.best_last_l = search_state.last_l;
3232 for (x = 0; x <= search_state.last_r; ++x)
3233 search_state.best_rparen[x] = search_state.rparen[x];
3234 search_state.best_rparen[0] = urhere + 1;
3235 search_state.best_last_r = search_state.last_r;
3236 }
3237 /* If we're not reporting the match-length
3238 * or other register info, we need look no
3239 * further.
3240 */
3241 if (search_state.first_found)
3242 {
3243 search_state.test_ret = rx_test_found_first;
3244 goto test_do_return;
3245 }
3246 }
3247 break;
3248 case re_se_lparen:
3249 {
3250 int urhere =
3251 ((int)(search_state.test_pos.pos
3252 - search_state.test_pos.string)
3253 + search_state.test_pos.offset);
3254
3255 int reg = rxb->se_params [effect].op1;
3256 #if 0
3257 if (reg > search_state.last_l)
3258 #endif
3259 {
3260 search_state.lparen[reg] = urhere + 1;
3261 /* In addition to making this assignment,
3262 * we now know that lower numbered regs
3263 * that haven't already been assigned,
3264 * won't be. We make sure they're
3265 * filled with -1, so they can be
3266 * recognized as unassigned.
3267 */
3268 if (search_state.last_l < reg)
3269 while (++search_state.last_l < reg)
3270 search_state.lparen[search_state.last_l] = -1;
3271 }
3272 break;
3273 }
3274
3275 case re_se_rparen:
3276 {
3277 int urhere =
3278 ((int)(search_state.test_pos.pos
3279 - search_state.test_pos.string)
3280 + search_state.test_pos.offset);
3281 int reg = rxb->se_params [effect].op1;
3282 search_state.rparen[reg] = urhere + 1;
3283 if (search_state.last_r < reg)
3284 {
3285 while (++search_state.last_r < reg)
3286 search_state.rparen[search_state.last_r]
3287 = -1;
3288 }
3289 break;
3290 }
3291
3292 case re_se_backref:
3293 {
3294 int reg = rxb->se_params [effect].op1;
3295 if ( reg > search_state.last_r
3296 || search_state.rparen[reg] < 0)
3297 goto test_do_return;
3298
3299 {
3300 int backref_status;
3301 check_backreference:
3302 backref_status
3303 = back_check (&search_state.test_pos,
3304 search_state.lparen[reg],
3305 search_state.rparen[reg],
3306 search_state.translate,
3307 app_closure,
3308 stop);
3309 switch (backref_status)
3310 {
3311 case rx_back_check_continuation:
3312 search_state.saved_reg = reg;
3313 test_pc = rx_test_backreference_check;
3314 goto test_return_continuation;
3315 resume_continuation_2:
3316 reg = search_state.saved_reg;
3317 goto check_backreference;
3318 case rx_back_check_fail:
3319 /* Fail */
3320 goto test_do_return;
3321 case rx_back_check_pass:
3322 /* pass --
3323 * test_pos now advanced to last
3324 * char matched by backref
3325 */
3326 break;
3327 }
3328 }
3329 break;
3330 }
3331 case re_se_iter:
3332 {
3333 struct rx_counter_frame * csp
3334 = ((struct rx_counter_frame *)
3335 search_state.counter_stack->sp);
3336 if (csp->val == rxb->se_params[effect].op2)
3337 goto test_do_return;
3338 else
3339 ++csp->val;
3340 break;
3341 }
3342 case re_se_end_iter:
3343 {
3344 struct rx_counter_frame * csp
3345 = ((struct rx_counter_frame *)
3346 search_state.counter_stack->sp);
3347 if (csp->val < rxb->se_params[effect].op1)
3348 goto test_do_return;
3349 else
3350 {
3351 struct rx_counter_frame * source = csp;
3352 while (source->inherited_from)
3353 source = source->inherited_from;
3354 if (!source || !source->cdr)
3355 {
3356 POP(search_state.counter_stack,
3357 sizeof(struct rx_counter_frame));
3358 }
3359 else
3360 {
3361 source = source->cdr;
3362 csp->val = source->val;
3363 csp->tag = source->tag;
3364 csp->cdr = 0;
3365 csp->inherited_from = source;
3366 }
3367 }
3368 break;
3369 }
3370 case re_se_tv:
3371 /* is a noop */
3372 break;
3373 case re_se_try:
3374 case re_se_pushback:
3375 case re_se_push0:
3376 case re_se_pushpos:
3377 case re_se_chkpos:
3378 case re_se_poppos:
3379 case re_se_at_dot:
3380 case re_se_syntax:
3381 case re_se_not_syntax:
3382 case re_se_begbuf:
3383 case re_se_hat:
3384 case re_se_wordbeg:
3385 case re_se_wordbound:
3386 case re_se_notwordbound:
3387 case re_se_wordend:
3388 case re_se_endbuf:
3389 case re_se_dollar:
3390 case re_se_fail:
3391 case re_floogle_flap:
3392 search_state.ret_val = 0;
3393 goto test_do_return;
3394 }
3395 }
3396 el = el->cdr;
3397 }
3398 /* Now the side effects are done,
3399 * so get the next instruction.
3400 * and move on.
3401 */
3402 search_state.ifr = &df->future_frame;
3403 goto restart;
3404 }
3405
3406 case rx_backtrack_point:
3407 {
3408 /* A backtrack point indicates that we've reached a
3409 * non-determinism in the superstate NFA. This is a
3410 * loop that exhaustively searches the possibilities.
3411 *
3412 * A backtracking strategy is used. We keep track of what
3413 * registers are valid so we can erase side effects.
3414 *
3415 * First, make sure there is some stack space to hold
3416 * our state.
3417 */
3418
3419 struct rx_backtrack_frame * bf;
3420
3421 PUSH(search_state.backtrack_stack,
3422 search_state.backtrack_frame_bytes);
3423 #ifdef RX_DEBUG_0
3424 ++search_state.backtrack_depth;
3425 #endif
3426
3427 bf = ((struct rx_backtrack_frame *)
3428 search_state.backtrack_stack->sp);
3429 {
3430 bf->stk_super = search_state.super;
3431 /* We prevent the current superstate from being
3432 * deleted from the superstate cache.
3433 */
3434 rx_lock_superstate (&rxb->rx, search_state.super);
3435 #ifdef RX_DEBUG_0
3436 bf->stk_search_state.line_no = search_state.line_no;
3437 #endif
3438 bf->stk_c = search_state.c;
3439 bf->stk_test_pos = search_state.test_pos;
3440 bf->stk_last_l = search_state.last_l;
3441 bf->stk_last_r = search_state.last_r;
3442 bf->df = ((struct rx_super_edge *)
3443 search_state.ifr->data_2)->options;
3444 bf->first_df = bf->df;
3445 bf->counter_stack_sp = (search_state.counter_stack
3446 ? search_state.counter_stack->sp
3447 : 0);
3448 bf->stk_test_ret = search_state.test_ret;
3449 if (rxb->match_regs_on_stack)
3450 {
3451 int x;
3452 regoff_t * stk =
3453 (regoff_t *)((char *)bf + sizeof (*bf));
3454 for (x = 0; x <= search_state.last_l; ++x)
3455 stk[x] = search_state.lparen[x];
3456 stk += x;
3457 for (x = 0; x <= search_state.last_r; ++x)
3458 stk[x] = search_state.rparen[x];
3459 }
3460 }
3461
3462 /* Here is a while loop whose body is mainly a function
3463 * call and some code to handle a return from that
3464 * function.
3465 *
3466 * From here on for the rest of `case backtrack_point' it
3467 * is unsafe to assume that the search_state copies of
3468 * variables saved on the backtracking stack are valid
3469 * -- so read their values from the backtracking stack.
3470 *
3471 * This lets us use one generation fewer stack saves in
3472 * the call-graph of a search.
3473 */
3474
3475 while_non_det_options:
3476 #ifdef RX_DEBUG_0
3477 ++search_state.lines_found;
3478 if (rx_debug_trace)
3479 fprintf (stderr, "@@@ %d calls %d @@@\n",
3480 search_state.line_no, search_state.lines_found);
3481
3482 search_state.line_no = search_state.lines_found;
3483 #endif
3484
3485 if (bf->df->next_same_super_edge[0] == bf->first_df)
3486 {
3487 /* This is a tail-call optimization -- we don't recurse
3488 * for the last of the possible futures.
3489 */
3490 search_state.ifr = (bf->df->effects
3491 ? &bf->df->side_effects_frame
3492 : &bf->df->future_frame);
3493
3494 rx_unlock_superstate (&rxb->rx, search_state.super);
3495 POP(search_state.backtrack_stack,
3496 search_state.backtrack_frame_bytes);
3497 #ifdef RX_DEBUG
3498 --search_state.backtrack_depth;
3499 #endif
3500 goto restart;
3501 }
3502 else
3503 {
3504 if (search_state.counter_stack)
3505 {
3506 struct rx_counter_frame * old_cf
3507 = ((struct rx_counter_frame *)search_state.counter_stack->sp);
3508 struct rx_counter_frame * cf;
3509 PUSH(search_state.counter_stack, sizeof (struct rx_counter_frame));
3510 cf = ((struct rx_counter_frame *)search_state.counter_stack->sp);
3511 cf->tag = old_cf->tag;
3512 cf->val = old_cf->val;
3513 cf->inherited_from = old_cf;
3514 cf->cdr = 0;
3515 }
3516 /* `Call' this test-match block */
3517 search_state.ifr = (bf->df->effects
3518 ? &bf->df->side_effects_frame
3519 : &bf->df->future_frame);
3520 goto recurse_test_match;
3521 }
3522
3523 /* Returns in this block are accomplished by
3524 * goto test_do_return. There are two cases.
3525 * If there is some search-stack left,
3526 * then it is a return from a `recursive' call.
3527 * If there is no search-stack left, then
3528 * we should return to the fastmap/search loop.
3529 */
3530
3531 test_do_return:
3532
3533 if (!search_state.backtrack_stack)
3534 {
3535 #ifdef RX_DEBUG_0
3536 if (rx_debug_trace)
3537 fprintf (stderr, "!!! %d bails returning %d !!!\n",
3538 search_state.line_no, search_state.test_ret);
3539 #endif
3540
3541 /* No more search-stack -- this test is done. */
3542 if (search_state.test_ret != rx_test_internal_error)
3543 goto return_from_test_match;
3544 else
3545 goto error_in_testing_match;
3546 }
3547
3548 /* Returning from a recursive call to
3549 * the test match block:
3550 */
3551
3552 bf = ((struct rx_backtrack_frame *)
3553 search_state.backtrack_stack->sp);
3554 #ifdef RX_DEBUG_0
3555 if (rx_debug_trace)
3556 fprintf (stderr, "+++ %d returns %d (to %d)+++\n",
3557 search_state.line_no,
3558 search_state.test_ret,
3559 bf->stk_search_state.line_no);
3560 #endif
3561
3562 while (search_state.counter_stack
3563 && (!bf->counter_stack_sp
3564 || (bf->counter_stack_sp
3565 != search_state.counter_stack->sp)))
3566 {
3567 POP(search_state.counter_stack,
3568 sizeof (struct rx_counter_frame));
3569 }
3570
3571 if (search_state.test_ret == rx_test_internal_error)
3572 {
3573 POP (search_state.backtrack_stack,
3574 search_state.backtrack_frame_bytes);
3575 search_state.test_ret = rx_test_internal_error;
3576 goto test_do_return;
3577 }
3578
3579 /* If a non-longest match was found and that is good
3580 * enough, return immediately.
3581 */
3582 if ( (search_state.test_ret == rx_test_found_first)
3583 && search_state.first_found)
3584 {
3585 rx_unlock_superstate (&rxb->rx, bf->stk_super);
3586 POP (search_state.backtrack_stack,
3587 search_state.backtrack_frame_bytes);
3588 goto test_do_return;
3589 }
3590
3591 search_state.test_ret = bf->stk_test_ret;
3592 search_state.last_l = bf->stk_last_l;
3593 search_state.last_r = bf->stk_last_r;
3594 bf->df = bf->df->next_same_super_edge[0];
3595 search_state.super = bf->stk_super;
3596 search_state.c = bf->stk_c;
3597 #ifdef RX_DEBUG_0
3598 search_state.line_no = bf->stk_search_state.line_no;
3599 #endif
3600
3601 if (rxb->match_regs_on_stack)
3602 {
3603 int x;
3604 regoff_t * stk =
3605 (regoff_t *)((char *)bf + sizeof (*bf));
3606 for (x = 0; x <= search_state.last_l; ++x)
3607 search_state.lparen[x] = stk[x];
3608 stk += x;
3609 for (x = 0; x <= search_state.last_r; ++x)
3610 search_state.rparen[x] = stk[x];
3611 }
3612
3613 {
3614 int x;
3615 try_burst_2:
3616 x = get_burst (&bf->stk_test_pos, app_closure, stop);
3617 switch (x)
3618 {
3619 case rx_get_burst_continuation:
3620 search_state.saved_bf = bf;
3621 test_pc = rx_test_backtrack_return;
3622 goto test_return_continuation;
3623 resume_continuation_3:
3624 bf = search_state.saved_bf;
3625 goto try_burst_2;
3626 case rx_get_burst_no_more:
3627 /* Since we've been here before, it is some kind of
3628 * error that we can't return.
3629 */
3630 case rx_get_burst_error:
3631 search_state.test_ret = rx_test_internal_error;
3632 goto test_do_return;
3633 case rx_get_burst_ok:
3634 break;
3635 }
3636 }
3637 search_state.test_pos = bf->stk_test_pos;
3638 goto while_non_det_options;
3639 }
3640
3641
3642 case rx_cache_miss:
3643 /* Because the superstate NFA is lazily constructed,
3644 * and in fact may erode from underneath us, we sometimes
3645 * have to construct the next instruction from the hard way.
3646 * This invokes one step in the lazy-conversion.
3647 */
3648 search_state.ifr = rx_handle_cache_miss (&rxb->rx,
3649 search_state.super,
3650 search_state.c,
3651 search_state.ifr->data_2);
3652 if (!search_state.ifr)
3653 {
3654 search_state.test_ret = rx_test_internal_error;
3655 goto test_do_return;
3656 }
3657 goto restart;
3658
3659 case rx_backtrack:
3660 /* RX_BACKTRACK means that we've reached the empty
3661 * superstate, indicating that match can't succeed
3662 * from this point.
3663 */
3664 goto test_do_return;
3665
3666 case rx_next_char:
3667 case rx_error_inx:
3668 case rx_num_instructions:
3669 search_state.ret_val = 0;
3670 goto test_do_return;
3671 }
3672 goto pseudo_while_1;
3673 }
3674
3675 /* Healthy exits from the test-match loop do a
3676 * `goto return_from_test_match' On the other hand,
3677 * we might end up here.
3678 */
3679 error_in_testing_match:
3680 test_state = rx_test_error;
3681 goto test_returns_to_search;
3682
3683 /***** fastmap/search loop body
3684 * considering the results testing for a match
3685 */
3686
3687 return_from_test_match:
3688
3689 if (search_state.best_last_l >= 0)
3690 {
3691 if (regs && (regs->start != search_state.best_lparen))
3692 {
3693 bcopy (search_state.best_lparen, regs->start,
3694 regs->num_regs * sizeof (int));
3695 bcopy (search_state.best_rparen, regs->end,
3696 regs->num_regs * sizeof (int));
3697 }
3698 if (regs && !rxb->no_sub)
3699 {
3700 int q;
3701 int bound = (regs->num_regs > search_state.num_regs
3702 ? regs->num_regs
3703 : search_state.num_regs);
3704 regoff_t * s = regs->start;
3705 regoff_t * e = regs->end;
3706 for (q = search_state.best_last_l + 1; q < bound; ++q)
3707 s[q] = e[q] = -1;
3708 }
3709 search_state.ret_val = search_state.best_lparen[0];
3710 test_state = rx_test_ok;
3711 goto test_returns_to_search;
3712 }
3713 else
3714 {
3715 test_state = rx_test_fail;
3716 goto test_returns_to_search;
3717 }
3718
3719 test_return_continuation:
3720 search_state.test_match_resume_pt = test_pc;
3721 test_state = rx_test_continuation;
3722 goto test_returns_to_search;
3723 }
3724 }
3725
3726
3727
3728 #endif /* RX_WANT_RX_DEFS */
3729
3730
3731
3732 #else /* RX_WANT_SE_DEFS */
3733 /* Integers are used to represent side effects.
3734 *
3735 * Simple side effects are given negative integer names by these enums.
3736 *
3737 * Non-negative names are reserved for complex effects.
3738 *
3739 * Complex effects are those that take arguments. For example,
3740 * a register assignment associated with a group is complex because
3741 * it requires an argument to tell which group is being matched.
3742 *
3743 * The integer name of a complex effect is an index into rxb->se_params.
3744 */
3745
3746 RX_DEF_SE(1, re_se_try, = -1) /* Epsilon from start state */
3747
3748 RX_DEF_SE(0, re_se_pushback, = re_se_try - 1)
3749 RX_DEF_SE(0, re_se_push0, = re_se_pushback -1)
3750 RX_DEF_SE(0, re_se_pushpos, = re_se_push0 - 1)
3751 RX_DEF_SE(0, re_se_chkpos, = re_se_pushpos -1)
3752 RX_DEF_SE(0, re_se_poppos, = re_se_chkpos - 1)
3753
3754 RX_DEF_SE(1, re_se_at_dot, = re_se_poppos - 1) /* Emacs only */
3755 RX_DEF_SE(0, re_se_syntax, = re_se_at_dot - 1) /* Emacs only */
3756 RX_DEF_SE(0, re_se_not_syntax, = re_se_syntax - 1) /* Emacs only */
3757
3758 RX_DEF_SE(1, re_se_begbuf, = re_se_not_syntax - 1) /* match beginning of buffer */
3759 RX_DEF_SE(1, re_se_hat, = re_se_begbuf - 1) /* match beginning of line */
3760
3761 RX_DEF_SE(1, re_se_wordbeg, = re_se_hat - 1)
3762 RX_DEF_SE(1, re_se_wordbound, = re_se_wordbeg - 1)
3763 RX_DEF_SE(1, re_se_notwordbound, = re_se_wordbound - 1)
3764
3765 RX_DEF_SE(1, re_se_wordend, = re_se_notwordbound - 1)
3766 RX_DEF_SE(1, re_se_endbuf, = re_se_wordend - 1)
3767
3768 /* This fails except at the end of a line.
3769 * It deserves to go here since it is typicly one of the last steps
3770 * in a match.
3771 */
3772 RX_DEF_SE(1, re_se_dollar, = re_se_endbuf - 1)
3773
3774 /* Simple effects: */
3775 RX_DEF_SE(1, re_se_fail, = re_se_dollar - 1)
3776
3777 /* Complex effects. These are used in the 'se' field of
3778 * a struct re_se_params. Indexes into the se array
3779 * are stored as instructions on nfa edges.
3780 */
3781 RX_DEF_CPLX_SE(1, re_se_win, = 0)
3782 RX_DEF_CPLX_SE(1, re_se_lparen, = re_se_win + 1)
3783 RX_DEF_CPLX_SE(1, re_se_rparen, = re_se_lparen + 1)
3784 RX_DEF_CPLX_SE(0, re_se_backref, = re_se_rparen + 1)
3785 RX_DEF_CPLX_SE(0, re_se_iter, = re_se_backref + 1)
3786 RX_DEF_CPLX_SE(0, re_se_end_iter, = re_se_iter + 1)
3787 RX_DEF_CPLX_SE(0, re_se_tv, = re_se_end_iter + 1)
3788
3789 #endif
3790
3791 #endif
3792