1 /* super.h - decls for lazilly constructed DFAs
2  *
3  ****************************************************************
4  * Copyright (C) 1998, 2000 Thomas Lord
5  *
6  * See the file "COPYING" for further information about
7  * the copyright and warranty status of this work.
8  */
9 
10 
11 
12 #ifndef INCLUDE__RX__SUPER_H
13 #define INCLUDE__RX__SUPER_H
14 
15 /* #define RX_LARGE_TABLES */
16 
17 struct rx_superset;
18 struct rx_super_edge;
19 struct rx_superstate;
20 
21 #include "hackerlab/mem/alloc-limits.h"
22 #include "hackerlab/bitsets/bitset.h"
23 #include "hackerlab/hash/hashtree.h"
24 #include "hackerlab/rx/nfa.h"
25 
26 
27 
28 /************************************************************************
29  *(h1 "Superstate Data Structures")
30  *
31  *
32  * One state of the DFA is called a "superstate" because it
33  * represents the composition of one or more NFA states.  The
34  * functions in this chapter build superstates and edges between
35  * them.
36  *
37  * A DFA can be prohibitively large -- too large to keep in memory all
38  * at once.  So these functions help you to build only the superstates
39  * you actually need.  While you keep those states locked, they remain
40  * in memory;  when unlocked, they can be discarded and recycled.
41  *
42  * As you fill the superstate cache with DFA states you visit often,
43  * you'll want to follow transitions between those DFA states as
44  * quickly as possible.  For that reason, the data structures in this
45  * chapter are slightly complicated: to heavily optimize the case when
46  * you are tracing through the DFA graph entirely out of the
47  * superstate cache.  In the optimized case, you can advance a DFA
48  * through one state transition with only 10-20 machine instructions.
49  * That number of instructions, multiplied by the length of the input
50  * string, is the entire overhead of a DFA comparison running out
51  * of the cache!
52  */
53 
54 /************************************************************************
55  *(h2 "DFA State Transition Tables")
56  *
57  * A regular expression DFA is represented by a collection of
58  * transition tables, one per DFA state.  If the character set has `N'
59  * characters, the transition tables have `N' entries.
60  *
61  * Each entry in a transition table is one instruction for a virtual
62  * machine.
63  */
64 
65 /*(c #s"struct rx_inx" :category type)
66  * struct rx_inx;
67  *
68  * This structure holds one instruction for a virtual machine that
69  * interprets the DFA.  For example, a structure of this type is used
70  * to represent one entry in the transition table of a DFA state.
71  *
72  * This structure type is padded to a power-of-2 bytes because in some
73  * critical code, we dispatch by indexing an array of these
74  * structures.  If that indexing can be accomplished by just a shift
75  * of the index, we're much happier than if it requires a full-fledged
76  * multiplication.
77  *
78  * An instruction includes an opcode and two operands.  The opcode is
79  * stored in the field `inx' and the operands are stored in the fields
80  * `data' and `data_2'.  The field `data' is only used by one
81  * instruction (`rx_next_char').  For all other instructions, `data'
82  * is set to 0.  This is used to speed up dispatching: the field
83  * `data' is fetched and tested before the field `inx'.  If data is
84  * not 0, there is no need to fetch the field `inx' at all -- we
85  * already know it contains `rx_next_char' and we already have the
86  * primary operand to that instruction in a register.
87  *
88  * In the case of a `rx_next_char' instruction, the primary operand is
89  * a pointer to the transition table of the destination DFA state.
90  * The secondary operand is the state label of the destination state.
91  * Thus, the state transition part of a DFA matching loop is something
92  * like:
93  *
94  *	while (1)
95  *	  {
96  *	    // Presume that the current transition table entry
97  *	    // for the next input character (`input_char') contains
98  *	    // the virtual instruction `rx_next_char'.
99  *	    //
100  *	    // Fetch the operands for that instruction:
101  *	    //
102  * 	    next_state_table
103  *	      = (struct rx_inx *)current_state_table[input_char].data;
104  *	    next_state_label
105  *	      = (long)current_state_table[input_char].data_2;
106  *
107  * 	    // Now check, is it really a `rx_next_char' instruction?
108  *	    //
109  *	    if (!next_state_table)
110  *	      {
111  *		// Primary operand was 0 -- not really `rx_next_char'.
112  *		//
113  * 	        ... handle a cache miss or illegal input character
114  *		... patch up the values of `next_state_table' and
115  *		... `next_state_label'
116  *	      }
117  *
118  * 	    // Advance to the next DFA state:
119  *	    //
120  *	    current_state_table = next_state_table;
121  *	    current_state_label = next_state_label;
122  *
123  * 	    // Test whether or not it is a final state:
124  *	    //
125  *	    if (current_state_label)
126  *	      {
127  *	        // we have reached a final state in the DFA
128  * 		... perhaps return or perhaps continue to look for
129  *		... a longer match.
130  *	      }
131  *
132  * 	    // Process further input.
133  *	    //
134  *	    input_char = read_next_char ();
135  *         }
136  *
137  *
138  * Here are the details of the strucure type:
139  *
140  insert*/
141 struct rx_inx
142 {
143   void * data;
144   /*   	The primary operand of this instruction.  For efficient
145 	indexing, this must be the first field of the structure.*/
146 
147   void * data_2;
148   /*   	The secondary operand of this instruction.*/
149 
150   void * inx;
151   /*   	The opcode for this instruction.  This field is declared
152     	`void *' to force it to occupy exactly one word.  In fact it
153     	holds a value of type `enum rx_opcode'.*/
154 
155   struct rx_inx * next_same_edge;
156   /* 	A list of instruction frames, anchored at e->inx_list,
157 	of instructions built for the same edge. */
158 };
159 /*end-insert
160  */
161 
162 
163 /*(c #s"enum rx_opcode" :category type)
164  * enum rx_opcode;
165  *
166  * This type represents opcodes for a regular expression DFA virtual
167  * machine.
168  *
169  insert*/
170 enum rx_opcode
171 {
172   rx_cache_miss = 0,		/* must be 0 */
173   /* 	An `rx_cache_miss' instruction means that a transition
174 	has been reached whose edge or destination state is not
175 	currently in the DFA cache.
176 
177 	If the edge is missing from the cache, then `data' and
178 	`data_2' are 0.  If the edge is in the cache, but the
179 	destination state is either not in the cache or is unknown to
180 	the edge, then `data' is 0 and `data_2' points to the edge.*/
181 
182 
183   rx_next_char = rx_cache_miss + 1,
184   /*	An `rx_next_char' instruction means that a transition has been
185 	reached to a DFA state that is known to be in the DFA state
186 	cache.
187 
188 	`data' points to the transition table of the destination state
189 	(the `transitions' field of a `struct rx_superstate').
190 
191 	`data_2' is set to the state label of the destination state.
192 
193 	If the `data' field of a `struct rx_inx' is not 0, then the
194      	`inx' field is guaranteed to be `rx_next_char'.  If the
195      	insruction is `rx_next_char', a DFA interpreter is almost
196      	certain to need the value in `data'.  A DFA interpreter
197      	should optimize this instruction above all others.  This
198      	suggests that interpreters should dispatch first on the value
199      	in `data' (on whether or not it is 0), and then (if `data' is
200      	0) on the instruction in field `inx'.*/
201 
202 
203   rx_backtrack = rx_next_char + 1,
204   /*	An `rx_backtrack' instruction means that there is no
205 	transition defined for an input character.  If the DFA is
206 	being used for regular expression comparison, this means that
207 	the input does not match.*/
208 
209   rx_huge_char = rx_backtrack + 1,
210   /* 	An `rx_huge_char' instruction means that the the character
211  	is a UTF-16 high surrogate. */
212 
213 
214   rx_bogus_utf8 = rx_backtrack + 1,
215   /*	Second, third or fourth byte of a UTF-8 character.
216 	*/
217 
218   rx_2byte_utf8 = rx_bogus_utf8 + 1,
219   /* 	First byte of a 2-byte UTF-8 character.
220 	*/
221 
222   rx_3byte_utf8 = rx_2byte_utf8 + 1,
223   /* 	First byte of a 3-byte UTF-8 character.
224 	*/
225 
226   rx_4byte_utf8 = rx_3byte_utf8 + 1,
227   /* 	First byte of a 4-byte UTF-8 character.
228 	*/
229 
230   rx_num_instructions = rx_4byte_utf8 + 1,
231 };
232 /*end-insert
233  */
234 
235 /************************************************************************
236  *(h2 "DFA States")
237  *
238  */
239 
240 /*(c #s"struct rx_superstate" :category type)
241  * struct rx_superstate;
242  *
243  * This structure holds one state in a regular expression DFA.
244  *
245  insert*/
246 struct rx_super_transition;	/* no such structure */
247 typedef struct rx_super_transition ** rx_transition_table;
248 
249 struct rx_superstate
250 {
251   struct rx_superset * members;
252   /* 	A list of NFA states: the NFA state set corresponding to this
253 	DFA state.  No two DFA states have the same set of members. */
254 
255   struct rx_super_edge * outgoing_edges;
256   /* 	A list of outgoing DFA edges, linked by `next_same_present'.
257 	At any time, some edges for this state may be missing from the
258 	list (because they are missing from the DFA cache).  To build
259 	a complete DFA, it is necessary to traverse the complete
260 	transition table of every state, keeping locks on each state
261 	encountered so that none are flushed from the DFA cache.  (Be
262 	advised: DFA graphs can be quite large.)*/
263 
264 
265   struct rx_super_edge * incoming_edges;
266   /* 	A queue of incoming DFA edges, linked by `next_same_dest' and
267 	`prev_same_dest'.  Again, those edges not in the DFA cache
268 	are missing from this list.  Also missing from this list are
269 	edges that are in the cache, but that do not yet know their
270 	destination state.*/
271 
272 
273   /* Superstate Locks and Superstate Cache Management
274    *
275    * Every superstate has a lock count -- a kind of reference
276    * count.  While your code holds a pointer to a superstate, that
277    * superstate should have a non-0 lock count unless you are
278    * certain that none of the functions that allocate storage in
279    * the DFA cache will be called.  You can use `rx_lock_superstate'
280    * and `rx_unlock_superstate' to manipulate this reference count.
281    *
282    * The gory details of memory management for DFA states is
283    * interesting and relevant to programmers who want to obtain the
284    * best performance from this library.  Many of those details are
285    * explained here.
286    *
287    * Upon creation, superstates are given a lock count of 0 and are
288    * added to the back of a queue of live superstates.  When a
289    * cache-hit yields an already existing superstate, it is moved
290    * back to back of that queue.  Thus, the states most recently
291    * returned from the DFA cache tend to be at the back of the queue
292    * of live states.
293    *
294    * While a state has a lock count of 0, it is a candidate to be
295    * made semifree and then truly free.  This can happen any time the
296    * old state reaches the _front_ of the queue of live states and
297    * new states or edges are allocated in the DFA cache (such as
298    * during a call to `rx_handle_cache_miss').  When this occurs,
299    * incoming transitions to the state are modified to cause cache
300    * misses, the state is moved from the queue of live states to the
301    * back of the queue of semifree states, and the state is a
302    * candidate to become truly free.  To convert a state which is
303    * semifree back to a live state, use `rx_refresh_this_superstate'.
304    *
305    * When a state becomes semifree, there are two possibilities of
306    * what will happen next.  One possibility is that Rx will reclaim
307    * the state's storage.  This will happen if the state reaches the
308    * _front_ of the queue of semifree states at a time when Rx detects
309    * that the DFA cache is over-full.
310    *
311    * The other possibility is that a semifree state will be
312    * referenced by a DFA transition which results in a cache miss.
313    * The cache miss handler reverts the semifree state to a live
314    * state.  Restoring the state this way is less expensive than
315    * rebuilding the state from scratch and gives us the information
316    * that the state is useful enough to keep in the cache.  In
317    * effect, we use the cache misses that restore semifree states to
318    * sort the two queues of DFA states, moving popular states to the
319    * back of those queues, and letting unpopular states drift to the
320    * front of the live queue, then through the semifree queue, and
321    * finally into being recycled.
322    *
323    * There is a (soft) upper bound on the amount of memory Rx will use
324    * for caching superstates.  While that upper bound is exceeded, Rx
325    * tries to free some superstates which have a lock count of 0,
326    * starting with the head of the queue of semifree states.
327    */
328   int locks;			/* A reference count. */
329   int is_semifree;		/* Is this state live or semifree? */
330 
331   struct rx_superstate * next_recyclable;
332   struct rx_superstate * prev_recyclable;
333   /*	While a DFA state is live, these links place the state on
334 	a queue of all live DFA states.
335 
336 	While the state is semifree, these links place the state on
337 	queue of all semifree DFA states.*/
338 
339 
340 
341   /* At run-time, a matcher follows transitions from one superstate
342    * to the next.  At times, a destination state may be missing from
343    * the superstate cache or from a particular instruction frame.  In
344    * those cases, a destination superset (set of NFA states) is
345    * computed, and that is used as a key to search the cache of DFA
346    * states.  If a cache entry is found (and valid) then the
347    * superstate is missing only from the instruction frame, which is
348    * then filled in, and the transition resumed normally.  If no
349    * cache entry is found, a new superstate is constructed for that
350    * superset.
351    *
352    * The following field (`rx_id') is used when validating cache
353    * entries in the superstate cache keyed on supersets.  If a cache
354    * search for a particular DFA state yields this state, the result
355    * is only valid if this field agrees with the `rx_id' field of the
356    * NFA.
357    *
358    * This field is necessary because of the possibility of the
359    * following (unlikely but not impossible) sequence of events:
360    *
361    *	The caller allocates an NFA (call it "NFA A").
362    *
363    *	The caller builds DFA states based on NFA A.
364    *
365    *	The caller frees NFA A.  The DFA states continue to exist
366    *	until flushed from the DFA cache from lack of use.
367    *
368    *	The caller allocates a new NFA ("NFA B") which happens
369    *	to use the same region of the heap as NFA A.
370    *	By coincidence, some of the NFA states sets for NFA A
371    *	and NFA B hash to the same value.  Consequently, when
372    *	looking for a DFA state built for NFA B, the cache probe
373    *	might yield a stale DFA state built for NFA A.
374    *
375    *	But luckilly, NFA A and NFA B are guaranteed to have
376    *	different values for the field `rx_id'.  By comparing
377    *	the `rx_id' field of NFA B to the superstate yielded by
378    *	the cache probe, we can know to invalidate that cache
379    * 	entry.
380    */
381   int rx_id;
382 
383   /* Superstates of a given NFA are numbered sequentially.  The
384    * sequence number counter is the global variable
385    * `rx_superstate_counter' which programs are free to modify.
386    *
387    * Sequence numbers are useful for applications which manipulate the
388    * DFA graph in ways other than simply following transition tables.
389    */
390   int seq;
391 
392   /* The following field is used by `rx_mkstatic', which builds a
393    * complete DFA by examining superstates.
394    */
395   long table_id;
396 
397   int storage_unit_size;	/* see nfa.h */
398   int has_huge_table;		/* yes for Unicode */
399 
400 #ifdef RX_LARGE_TABLES
401   struct rx_inx *** huge_char_transitions;
402 #else
403   struct rx_inx **** huge_char_transitions;
404 #endif
405 
406   /* struct rx_inx transitions[1]; */
407   struct rx_super_transition * transitions[1];
408 };
409 /*end-insert
410  */
411 
412 
413 #define rx_transitions_to_suprestate(T)		((struct rx_superstate *) \
414 						 ((char *)(T) \
415 						  - ((unsigned long) \
416 						     ((struct rx_superstate *)0)->transitions)))
417 
418 
419 #define rx_transition8(T,N)			(((struct rx_inx *)(T)) + (N))
420 #define rx_subset_transitions8(T,N)		(((struct rx_inx *)(T)) + bitset_subset_offset(N))
421 
422 #ifdef RX_LARGE_TABLES
423 
424 #  define rx_page1_index16(N)			((N) >> 8)
425 #  define rx_page2_index16(N)			((N) & 0xff)
426 #  define rx_page2_subset_index16(N)		bitset_subset_offset (rx_page2_index16(N))
427 #  define rx_page1_16(T)			((struct rx_inx **)(T))
428 #  define rx_page2_16(T,N)			rx_page1_16 (T)[rx_page1_index16(N)]
429 
430 #  define rx_transition16(T,N)			(rx_page2_16 ((T), (N)) + rx_page2_index16 (N))
431 #  define rx_subset_transitions16(T,N)		(rx_page2_16 ((T), (N)) + rx_page2_subset_index16 (N))
432 
433 
434 #  define rx_page1_size21			32
435 #  define rx_page2_size21			256
436 #  define rx_page1_index21(N)			((N) >> 16)
437 #  define rx_page2_index21(N)			(((N) >> 8) & 0xff)
438 #  define rx_page3_index21(N)			((N) & 0xff)
439 #  define rx_page3_subset_index21(N)		bitset_subset_offset (rx_page3_index21 (N))
440 #  define rx_page2_21(T,N)			((T)[rx_page1_index21 (N)])
441 #  define rx_page3_21(T,N)			rx_page2_21 ((T), (N))[rx_page2_index21 (N)]
442 
443 #  define rx_transition21(T,N)			(rx_page3_21 ((T), (N)) + rx_page3_index21 (N))
444 #  define rx_subset_transition21(T,N)		(rx_page3_21 ((T), (N)) + rx_page3_subset_index21 (N))
445 
446 
447 #else /* !defined(RX_LARGE_TABLES) */
448 
449 
450 #  if (bits_per_subset == 32)
451 
452 #    define rx_page2_size			(8)
453 #    define rx_page3_size			bits_per_subset
454 #    define rx_page1_index16(N)			((N) >> 8)
455 #    define rx_page2_index16(N)			(((N) >> 5) & 0x7)
456 #    define rx_page3_index16(N)			((N) & 0x1f)
457 
458 #    define rx_page1_size21			256
459 #    define rx_page2_size21			32
460 #    define rx_page3_size21			(8)
461 #    define rx_page4_size21			bits_per_subset
462 #    define rx_page1_index21(N)			((N) >> 13)
463 #    define rx_page2_index21(N)			(((N) >> 8) & 0x1f)
464 #    define rx_page3_index21(N)			(((N) >> 5) & 0x7)
465 #    define rx_page4_index21(N)			((N) & 0x1f)
466 
467 
468 #  elif (bits_per_subset == 64)
469 
470 #    define rx_page2_size			(4)
471 #    define rx_page3_size			bits_per_subset
472 #    define rx_page1_index16(N)			((N) >> 8)
473 #    define rx_page2_index16(N)			(((N) >> 6) & 0x3)
474 #    define rx_page3_index16(N)			((N) & 0x3f)
475 
476 #    define rx_page1_size21			256
477 #    define rx_page2_size21			32
478 #    define rx_page3_size21			(8)
479 #    define rx_page4_size21			bits_per_subset
480 #    define rx_page1_index21(N)			((N) >> 13)
481 #    define rx_page2_index21(N)			(((N) >> 8) & 0x1f)
482 #    define rx_page3_index21(N)			(((N) >> 6) & 0x3)
483 #    define rx_page4_index21(N)			((N) & 0x3f)
484 
485 #  else
486 
487 #    error "odd bits_per_subset in hackerlab/rx/super.h"
488 
489 #  endif
490 
491 #  define rx_page1_16(T)			((struct rx_inx ***)(T))
492 #  define rx_page2_16(T,N)			rx_page1_16 (T)[rx_page1_index16(N)]
493 #  define rx_page3_16(T,N)			rx_page2_16 (T,N)[rx_page2_index16(N)]
494 #  define rx_page3_subset_index16(N)		bitset_subset_offset (rx_page3_index16(N))
495 
496 #  define rx_transition16(T,N)			(rx_page3_16 ((T), (N)) + rx_page3_index16 (N))
497 #  define rx_subset_transitions16(T,N)		(rx_page3_16 ((T), (N)) + rx_page3_subset_index16 (N))
498 
499 
500 #  define rx_page1_21(T)			(T)
501 #  define rx_page2_21(T,N)			rx_page1_21(T)[rx_page1_index21(N)]
502 #  define rx_page3_21(T,N)			rx_page2_21((T),(N))[rx_page2_index21(N)]
503 #  define rx_page4_21(T,N)			rx_page3_21((T),(N))[rx_page3_index21(N)]
504 #  define rx_page4_subset_index21(N)		bitset_subset_offset (rx_page4_index21 (N))
505 
506 #  define rx_transition21(T,N)			(rx_page4_21 ((T), (N)) + rx_page4_index21 (N))
507 #  define rx_subset_transition21(T,N)		(rx_page4_21 ((T), (N)) + rx_page4_subset_index21 (N))
508 
509 #endif
510 
511 
512 
513 
514 
515 
516 
517 /*(c rx_lock_superstate :category macro)
518  * #define rx_lock_superstate(RX_NFA, SUPERSTATE)
519  *
520  * Increment the reference count of a DFA state.
521  */
522 #define rx_lock_superstate(R,S)  ((S)->locks++)
523 
524 /*(c rx_unlock_superstate :category macro)
525  * #define rx_unlock_superstate(RX_NFA, SUPERSTATE)
526  *
527  * Decrement the reference count of a DFA state.
528  */
529 #define rx_unlock_superstate(R,S) (--(S)->locks)
530 
531 /*(c rx_superstate_transition_table :category macro)
532  * #define rx_superstate_transition_table(RX_NFA, SUPERSTATE)
533  *
534  * As a function:
535  *
536  *	extern struct rx_inx *
537  *	rx_superstate_transition_table (struct rx_nfa * rx,
538  *					struct rx_superstate * state);
539  *
540  * Return the DFA state transition table that corresponds to a
541  * particular superstate
542  */
543 #define rx_superstate_transition_table(RX_NFA, SUPERSTATE) \
544 	&((SUPERSTATE)->transitions)
545 
546 /*(c rx_transition_table_superstate :category macro)
547  * #define rx_transition_table_superstate(RX_NFA, TABLE)
548  *
549  * As a function:
550  *
551  *	extern struct rx_superstate *
552  *	rx_superstate_transition_table (struct rx_nfa * rx,
553  *					struct rx_inx * table);
554  *
555  * Return the superstate that corresponds to a particular
556  * DFA state transition table.
557  */
558 #define rx_transition_table_superstate(RX_NFA, TABLE) \
559 	((struct rx_superstate *)\
560 	 ( ((char *)(TABLE))  - (char *)(&((struct rx_superstate *)0)->transitions)))
561 
562 
563 
564 /*(c rx_superstate_counter :category variable)
565  * extern int rx_superstate_counter;
566  *
567  * As DFA states are allocated, they are assigned sequential
568  * numbers.  The number of the next DFA state is stored in this
569  * variable.
570  */
571 extern int rx_superstate_counter;
572 
573 
574 
575 /************************************************************************
576  *(h2 "DFA Edges")
577  *
578  */
579 
580 /*(c #s"struct rx_super_edge" :category type)
581  *
582  * This structure holds one edge of a regular expression DFA.
583  *
584  * All DFA edges are character set edges -- they are labeled by
585  * a non-empty set of characters.  No two of the character sets
586  * of edges originating in a common DFA state intersect.
587  *
588  insert*/
589 struct rx_super_edge
590 {
591   bits cset;
592   /*	The character set of this edge.*/
593 
594   struct rx_superstate * present;
595   /*   	The DFA source state of this edge.  If the edge exists, this
596 	state is guaranteed to be in the cache.*/
597 
598   struct rx_superstate * future;
599   /*   	The DFA destination state of this edge.  This state may or may
600 	not be in the DFA cache, and if it is in the cache, this edge
601 	may or may not "know" about it.  If the state is in the cache
602 	and this edge "knows" about it, then this field points to the
603 	state.  Otherwise, this field contains 0.*/
604 
605 
606   struct rx_inx * inx_list;
607   /* 	A list of instruction frames, linked by next_same_edge,
608 	built for this edge. */
609 
610 
611   struct rx_super_edge * next_same_present;
612   /*   	This points to the next edge in a list of DFA edges sharing a
613 	common starting state.   The list starts at
614 	`present->outgoing_edges'.*/
615 
616   struct rx_super_edge * next_same_dest;
617   struct rx_super_edge * prev_same_dest;
618   /*  	These link this edge into a queue of edges sharing a
619 	common DFA ending state.  These fields are only valid if
620 	`future' is not 0.  In that case, this edge is stored on
621 	the queue `future->incoming_edges'.*/
622 
623   int is_backtrack;
624 };
625 /*end-insert
626  */
627 
628 
629 
630 
631 /************************************************************************
632  *(h2 "Sets of NFA States")
633  *
634  * Every state of a DFA corresponds to a non-empty set of NFA states.
635  * No two DFA states share the same set of NFA states.
636  *
637  */
638 
639 /*(c #s"struct rx_superset" :category type)
640  * struct rx_superset;
641  *
642  * This structure holds a set of NFA states and is used to represent
643  * the set of states that comprise a single DFA state.  In xref:"The
644  * NFA Data Types" we encountered another representation for sets of
645  * NFA states, used for computing epsilon closures of NFA states.
646  * Unfortunately, the requirements of computing epsilon closures and
647  * of computing DFA states are too different, so two representations
648  * are needed.
649  *
650  * For equal `car' and `cdr' (in the sense of `=='), there is at most
651  * one of these structures.
652  *
653  * The empty set is *not* represented by `(struct rx_superset *)0'
654  * but instead by a superset structure with `car' and `cdr' equal to
655  * 0.  Thus, the last element of every `struct rx_superste' list
656  * (linked by `cdr' pointers) is a structure with `car' and `cdr' 0.
657  * (This simplifies hash-consing sets of these structures).
658  *
659  insert*/
660 struct rx_superset
661 {
662   struct rx_nfa_state * car;
663   /*	This points to the first NFA state in the set.  The
664 	list of states is kept sorted by id number, from
665 	least to greatest.
666 
667 	Note that if the NFA has been freed but the superset has not,
668 	then this pointer is invalid.  This condition can be detected
669 	by comparing the `id' field of this structure with the `rx_id'
670 	field of the NFA over which this set was supposedly built.  If
671 	the two numbers agree, then the `car' pointer is valid.  If
672 	the two numbers disagree, then this set was actually built for
673 	an NFA that no longer exists: it is a stale entry in the cache
674 	of supersets.*/
675 
676   struct rx_superset * cdr;
677   /*	This points to the rest of this set.*/
678 
679   long state_label;
680   /*  	If all of the state labels of the NFA states in this set are
681 	0, then this field is 0.  Otherwise, this field is the NFA
682 	state label with the smallest magnitude.*/
683 
684   int has_cset_edges;
685   /*   	This is the logical-or (`||') of the same field of the `cdr',
686      	and the same-named field of the `car'.  If it is 0, then this
687      	superset corresponds to a dead-end DFA state (no outgoing
688      	edges).*/
689 
690   /* Cache management data.
691    */
692   int refs;				/* A reference count. */
693   int id;				/* See `car', above. */
694   struct hashtree_item hash_item;	/* For hash-consing lists */
695 
696 
697   struct rx_nfa_state * nfa_state;
698   /*	If this superset is discovered to be the eclosure of a
699 	single NFA state by `rx_nfa_state_to_superstate', this
700 	field points to that NFA state.
701 
702 	The state points back to this set in the field
703 	`superstate_set'.
704 
705 	If the NFA is subsequently freed, this field is reset to 0.*/
706 
707   struct rx_superstate * superstate[2];
708   /*   	If the DFA states for this superset exists (in the cache of DFA
709 	states), then this field points to those DFA state.
710 
711 	superstate[0] for 1-byte storage unit size,
712 	superstate[1] for 2-byte storage unit size.
713 	*/
714 
715 };
716 /*end-insert
717  */
718 
719 
720 extern alloc_limits rx__dfa_alloc_limits;
721 
722 
723 /* automatically generated __STDC__ prototypes */
724 extern void rx__init_dfa_alloc_limits (void);
725 extern size_t rx_dfa_cache_failure_pt (void);
726 extern void rx__dfa_cache_statistics (size_t * threshold,
727 				      size_t * failure_pt,
728 				      size_t * in_use,
729 				      size_t * high_water_mark,
730 				      int * hits,
731 				      int * misses,
732 				      int * total_hits,
733 				      int * total_misses);
734 extern void rx_set_dfa_cache_failure_pt (size_t n);
735 extern int rx__really_free_superstate (void);
736 extern struct rx_superstate * rx_nfa_state_to_superstate (struct rx_nfa * rx,
737 							  struct rx_nfa_state * nfa_state,
738 							  int storage_unit_size);
739 extern void rx_clear_superstate_table_ids (struct rx_nfa * nfa);
740 extern struct rx_inx * rx_handle_cache_miss (struct rx_nfa * rx,
741 					     struct rx_superstate * super,
742 					     unsigned int chr,
743 					     void * data_2) ;
744 #endif  /* INCLUDE__RX__SUPER_H */
745