1 /*	$NetBSD: ure.c,v 1.1.1.3 2010/12/12 15:22:07 adam Exp $	*/
2 
3 /* OpenLDAP: pkg/ldap/libraries/liblunicode/ure/ure.c,v 1.17.2.5 2010/04/13 20:23:04 kurt Exp */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2010 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Copyright 1997, 1998, 1999 Computing Research Labs,
18  * New Mexico State University
19  *
20  * Permission is hereby granted, free of charge, to any person obtaining a
21  * copy of this software and associated documentation files (the "Software"),
22  * to deal in the Software without restriction, including without limitation
23  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
24  * and/or sell copies of the Software, and to permit persons to whom the
25  * Software is furnished to do so, subject to the following conditions:
26  *
27  * The above copyright notice and this permission notice shall be included in
28  * all copies or substantial portions of the Software.
29  *
30  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
33  * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
34  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
35  * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
36  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37  */
38 /* Id: ure.c,v 1.2 1999/09/21 15:47:43 mleisher Exp" */
39 
40 #include "portable.h"
41 
42 #include <ac/stdlib.h>
43 #include <ac/string.h>
44 #include <ac/unistd.h>
45 
46 #include "ure.h"
47 
48 /*
49  * Flags used internally in the DFA.
50  */
51 #define _URE_DFA_CASEFOLD  0x01
52 #define _URE_DFA_BLANKLINE 0x02
53 
54 static unsigned long cclass_flags[] = {
55     0,
56     _URE_NONSPACING,
57     _URE_COMBINING,
58     _URE_NUMDIGIT,
59     _URE_NUMOTHER,
60     _URE_SPACESEP,
61     _URE_LINESEP,
62     _URE_PARASEP,
63     _URE_CNTRL,
64     _URE_PUA,
65     _URE_UPPER,
66     _URE_LOWER,
67     _URE_TITLE,
68     _URE_MODIFIER,
69     _URE_OTHERLETTER,
70     _URE_DASHPUNCT,
71     _URE_OPENPUNCT,
72     _URE_CLOSEPUNCT,
73     _URE_OTHERPUNCT,
74     _URE_MATHSYM,
75     _URE_CURRENCYSYM,
76     _URE_OTHERSYM,
77     _URE_LTR,
78     _URE_RTL,
79     _URE_EURONUM,
80     _URE_EURONUMSEP,
81     _URE_EURONUMTERM,
82     _URE_ARABNUM,
83     _URE_COMMONSEP,
84     _URE_BLOCKSEP,
85     _URE_SEGMENTSEP,
86     _URE_WHITESPACE,
87     _URE_OTHERNEUT,
88 };
89 
90 /*
91  * Symbol types for the DFA.
92  */
93 #define _URE_ANY_CHAR   1
94 #define _URE_CHAR       2
95 #define _URE_CCLASS     3
96 #define _URE_NCCLASS    4
97 #define _URE_BOL_ANCHOR 5
98 #define _URE_EOL_ANCHOR 6
99 
100 /*
101  * Op codes for converting the NFA to a DFA.
102  */
103 #define _URE_SYMBOL     10
104 #define _URE_PAREN      11
105 #define _URE_QUEST      12
106 #define _URE_STAR       13
107 #define _URE_PLUS       14
108 #define _URE_ONE        15
109 #define _URE_AND        16
110 #define _URE_OR         17
111 
112 #define _URE_NOOP       0xffff
113 
114 #define _URE_REGSTART 0x8000
115 #define _URE_REGEND   0x4000
116 
117 /*
118  * Structure used to handle a compacted range of characters.
119  */
120 typedef struct {
121     ucs4_t min_code;
122     ucs4_t max_code;
123 } _ure_range_t;
124 
125 typedef struct {
126     _ure_range_t *ranges;
127     ucs2_t ranges_used;
128     ucs2_t ranges_size;
129 } _ure_ccl_t;
130 
131 typedef union {
132     ucs4_t chr;
133     _ure_ccl_t ccl;
134 } _ure_sym_t;
135 
136 /*
137  * This is a general element structure used for expressions and stack
138  * elements.
139  */
140 typedef struct {
141     ucs2_t reg;
142     ucs2_t onstack;
143     ucs2_t type;
144     ucs2_t lhs;
145     ucs2_t rhs;
146 } _ure_elt_t;
147 
148 /*
149  * This is a structure used to track a list or a stack of states.
150  */
151 typedef struct {
152     ucs2_t *slist;
153     ucs2_t slist_size;
154     ucs2_t slist_used;
155 } _ure_stlist_t;
156 
157 /*
158  * Structure to track the list of unique states for a symbol
159  * during reduction.
160  */
161 typedef struct {
162     ucs2_t id;
163     ucs2_t type;
164     unsigned long mods;
165     unsigned long props;
166     _ure_sym_t sym;
167     _ure_stlist_t states;
168 } _ure_symtab_t;
169 
170 /*
171  * Structure to hold a single state.
172  */
173 typedef struct {
174     ucs2_t id;
175     ucs2_t accepting;
176     ucs2_t pad;
177     _ure_stlist_t st;
178     _ure_elt_t *trans;
179     ucs2_t trans_size;
180     ucs2_t trans_used;
181 } _ure_state_t;
182 
183 /*
184  * Structure used for keeping lists of states.
185  */
186 typedef struct {
187     _ure_state_t *states;
188     ucs2_t states_size;
189     ucs2_t states_used;
190 } _ure_statetable_t;
191 
192 /*
193  * Structure to track pairs of DFA states when equivalent states are
194  * merged.
195  */
196 typedef struct {
197     ucs2_t l;
198     ucs2_t r;
199 } _ure_equiv_t;
200 
201 /*
202  * Structure used for constructing the NFA and reducing to a minimal DFA.
203  */
204 typedef struct _ure_buffer_t {
205     int reducing;
206     int error;
207     unsigned long flags;
208 
209     _ure_stlist_t stack;
210 
211     /*
212      * Table of unique symbols encountered.
213      */
214     _ure_symtab_t *symtab;
215     ucs2_t symtab_size;
216     ucs2_t symtab_used;
217 
218     /*
219      * Tracks the unique expressions generated for the NFA and when the NFA is
220      * reduced.
221      */
222     _ure_elt_t *expr;
223     ucs2_t expr_used;
224     ucs2_t expr_size;
225 
226     /*
227      * The reduced table of unique groups of NFA states.
228      */
229     _ure_statetable_t states;
230 
231     /*
232      * Tracks states when equivalent states are merged.
233      */
234     _ure_equiv_t *equiv;
235     ucs2_t equiv_used;
236     ucs2_t equiv_size;
237 } _ure_buffer_t;
238 
239 typedef struct {
240     ucs2_t symbol;
241     ucs2_t next_state;
242 } _ure_trans_t;
243 
244 typedef struct {
245     ucs2_t accepting;
246     ucs2_t ntrans;
247     _ure_trans_t *trans;
248 } _ure_dstate_t;
249 
250 typedef struct _ure_dfa_t {
251     unsigned long flags;
252 
253     _ure_symtab_t *syms;
254     ucs2_t nsyms;
255 
256     _ure_dstate_t *states;
257     ucs2_t nstates;
258 
259     _ure_trans_t *trans;
260     ucs2_t ntrans;
261 } _ure_dfa_t;
262 
263 /*************************************************************************
264  *
265  * Functions.
266  *
267  *************************************************************************/
268 
269 static void
270 _ure_memmove(char *dest, char *src, unsigned long bytes)
271 {
272     long i, j;
273 
274     i = (long) bytes;
275     j = i & 7;
276     i = (i + 7) >> 3;
277 
278     /*
279      * Do a memmove using Ye Olde Duff's Device for efficiency.
280      */
281     if (src < dest) {
282         src += bytes;
283         dest += bytes;
284 
285         switch (j) {
286           case 0: do {
287               *--dest = *--src;
288             case 7: *--dest = *--src;
289             case 6: *--dest = *--src;
290             case 5: *--dest = *--src;
291             case 4: *--dest = *--src;
292             case 3: *--dest = *--src;
293             case 2: *--dest = *--src;
294             case 1: *--dest = *--src;
295           } while (--i > 0);
296         }
297     } else if (src > dest) {
298         switch (j) {
299           case 0: do {
300               *dest++ = *src++;
301             case 7: *dest++ = *src++;
302             case 6: *dest++ = *src++;
303             case 5: *dest++ = *src++;
304             case 4: *dest++ = *src++;
305             case 3: *dest++ = *src++;
306             case 2: *dest++ = *src++;
307             case 1: *dest++ = *src++;
308           } while (--i > 0);
309         }
310     }
311 }
312 
313 static void
314 _ure_push(ucs2_t v, _ure_buffer_t *b)
315 {
316     _ure_stlist_t *s;
317 
318     if (b == 0)
319       return;
320 
321     /*
322      * If the `reducing' parameter is non-zero, check to see if the value
323      * passed is already on the stack.
324      */
325     if (b->reducing != 0 && b->expr[v].onstack != 0)
326       return;
327 
328     s = &b->stack;
329     if (s->slist_used == s->slist_size) {
330         if (s->slist_size == 0)
331           s->slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
332         else
333           s->slist = (ucs2_t *) realloc((char *) s->slist,
334                                         sizeof(ucs2_t) * (s->slist_size + 8));
335         s->slist_size += 8;
336     }
337     s->slist[s->slist_used++] = v;
338 
339     /*
340      * If the `reducing' parameter is non-zero, flag the element as being on
341      * the stack.
342      */
343     if (b->reducing != 0)
344       b->expr[v].onstack = 1;
345 }
346 
347 static ucs2_t
348 _ure_peek(_ure_buffer_t *b)
349 {
350     if (b == 0 || b->stack.slist_used == 0)
351       return _URE_NOOP;
352 
353     return b->stack.slist[b->stack.slist_used - 1];
354 }
355 
356 static ucs2_t
357 _ure_pop(_ure_buffer_t *b)
358 {
359     ucs2_t v;
360 
361     if (b == 0 || b->stack.slist_used == 0)
362       return _URE_NOOP;
363 
364     v = b->stack.slist[--b->stack.slist_used];
365     if (b->reducing)
366       b->expr[v].onstack = 0;
367 
368     return v;
369 }
370 
371 /*************************************************************************
372  *
373  * Start symbol parse functions.
374  *
375  *************************************************************************/
376 
377 /*
378  * Parse a comma-separated list of integers that represent character
379  * properties.  Combine them into a mask that is returned in the `mask'
380  * variable, and return the number of characters consumed.
381  */
382 static unsigned long
383 _ure_prop_list(ucs2_t *pp, unsigned long limit, unsigned long *mask,
384                _ure_buffer_t *b)
385 {
386     unsigned long n, m;
387     ucs2_t *sp, *ep;
388 
389     sp = pp;
390     ep = sp + limit;
391 
392     for (m = n = 0; b->error == _URE_OK && sp < ep; sp++) {
393         if (*sp == ',') {
394             /*
395              * Encountered a comma, so select the next character property flag
396              * and reset the number.
397              */
398             m |= cclass_flags[n];
399             n = 0;
400         } else if (*sp >= '0' && *sp <= '9')
401           /*
402            * Encountered a digit, so start or continue building the cardinal
403            * that represents the character property flag.
404            */
405           n = (n * 10) + (*sp - '0');
406         else
407           /*
408            * Encountered something that is not part of the property list.
409            * Indicate that we are done.
410            */
411           break;
412 
413         /*
414          * If a property number greater than 32 occurs, then there is a
415          * problem.  Most likely a missing comma separator.
416          */
417         if (n > 32)
418           b->error = _URE_INVALID_PROPERTY;
419     }
420 
421     if (n != 0)
422       m |= cclass_flags[n];
423 
424     /*
425      * Set the mask that represents the group of character properties.
426      */
427     *mask = m;
428 
429     /*
430      * Return the number of characters consumed.
431      */
432     return sp - pp;
433 }
434 
435 /*
436  * Collect a hex number with 1 to 4 digits and return the number
437  * of characters used.
438  */
439 static unsigned long
440 _ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n)
441 {
442     ucs2_t i;
443     ucs2_t *sp, *ep;
444     ucs4_t nn;
445 
446     sp = np;
447     ep = sp + limit;
448 
449     for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) {
450         if (*sp >= '0' && *sp <= '9')
451           nn = (nn << 4) + (*sp - '0');
452         else if (*sp >= 'A' && *sp <= 'F')
453           nn = (nn << 4) + ((*sp - 'A') + 10);
454         else if (*sp >= 'a' && *sp <= 'f')
455           nn = (nn << 4) + ((*sp - 'a') + 10);
456         else
457           /*
458            * Encountered something that is not a hex digit.
459            */
460           break;
461     }
462 
463     /*
464      * Assign the character code collected and return the number of
465      * characters used.
466      */
467     *n = nn;
468 
469     return sp - np;
470 }
471 
472 /*
473  * Insert a range into a character class, removing duplicates and ordering
474  * them in increasing range-start order.
475  */
476 static void
477 _ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b)
478 {
479     ucs2_t i;
480     ucs4_t tmp;
481     _ure_range_t *rp;
482 
483     /*
484      * If the `casefold' flag is set, then make sure both endpoints of the
485      * range are converted to lower case.
486      */
487     if (b->flags & _URE_DFA_CASEFOLD) {
488         r->min_code = _ure_tolower(r->min_code);
489         r->max_code = _ure_tolower(r->max_code);
490     }
491 
492     /*
493      * Swap the range endpoints if they are not in increasing order.
494      */
495     if (r->min_code > r->max_code) {
496         tmp = r->min_code;
497         r->min_code = r->max_code;
498         r->max_code = tmp;
499     }
500 
501     for (i = 0, rp = ccl->ranges;
502          i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ;
503 
504     /*
505      * Check for a duplicate.
506      */
507     if (i < ccl->ranges_used &&
508         r->min_code == rp->min_code && r->max_code == rp->max_code)
509       return;
510 
511     if (ccl->ranges_used == ccl->ranges_size) {
512         if (ccl->ranges_size == 0)
513           ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3);
514         else
515           ccl->ranges = (_ure_range_t *)
516               realloc((char *) ccl->ranges,
517                       sizeof(_ure_range_t) * (ccl->ranges_size + 8));
518         ccl->ranges_size += 8;
519     }
520 
521     rp = ccl->ranges + ccl->ranges_used;
522 
523     if (i < ccl->ranges_used)
524       _ure_memmove((char *) (rp + 1), (char *) rp,
525                    sizeof(_ure_range_t) * (ccl->ranges_used - i));
526 
527     ccl->ranges_used++;
528     rp->min_code = r->min_code;
529     rp->max_code = r->max_code;
530 }
531 
532 #define _URE_ALPHA_MASK  (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\
533 _URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)
534 #define _URE_ALNUM_MASK  (_URE_ALPHA_MASK|_URE_NUMDIGIT)
535 #define _URE_PUNCT_MASK  (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\
536 _URE_OTHERPUNCT)
537 #define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\
538 _URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)
539 #define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)
540 #define _URE_SPACE_MASK  (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)
541 
542 typedef void (*_ure_cclsetup_t)(
543     _ure_symtab_t *sym,
544     unsigned long mask,
545     _ure_buffer_t *b
546 );
547 
548 typedef struct {
549     ucs2_t key;
550     unsigned long len;
551     unsigned long next;
552     _ure_cclsetup_t func;
553     unsigned long mask;
554 } _ure_trie_t;
555 
556 static void
557 _ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
558 {
559     sym->props |= mask;
560 }
561 
562 static void
563 _ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
564 {
565     _ure_range_t range;
566 
567     sym->props |= mask;
568 
569     /*
570      * Add the additional characters needed for handling isspace().
571      */
572     range.min_code = range.max_code = '\t';
573     _ure_add_range(&sym->sym.ccl, &range, b);
574     range.min_code = range.max_code = '\r';
575     _ure_add_range(&sym->sym.ccl, &range, b);
576     range.min_code = range.max_code = '\n';
577     _ure_add_range(&sym->sym.ccl, &range, b);
578     range.min_code = range.max_code = '\f';
579     _ure_add_range(&sym->sym.ccl, &range, b);
580     range.min_code = range.max_code = 0xfeff;
581     _ure_add_range(&sym->sym.ccl, &range, b);
582 }
583 
584 static void
585 _ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b)
586 {
587     _ure_range_t range;
588 
589     /*
590      * Add the additional characters needed for handling isxdigit().
591      */
592     range.min_code = '0';
593     range.max_code = '9';
594     _ure_add_range(&sym->sym.ccl, &range, b);
595     range.min_code = 'A';
596     range.max_code = 'F';
597     _ure_add_range(&sym->sym.ccl, &range, b);
598     range.min_code = 'a';
599     range.max_code = 'f';
600     _ure_add_range(&sym->sym.ccl, &range, b);
601 }
602 
603 static _ure_trie_t cclass_trie[] = {
604     {0x003a, 1, 1, 0, 0},
605     {0x0061, 9, 10, 0, 0},
606     {0x0063, 8, 19, 0, 0},
607     {0x0064, 7, 24, 0, 0},
608     {0x0067, 6, 29, 0, 0},
609     {0x006c, 5, 34, 0, 0},
610     {0x0070, 4, 39, 0, 0},
611     {0x0073, 3, 49, 0, 0},
612     {0x0075, 2, 54, 0, 0},
613     {0x0078, 1, 59, 0, 0},
614     {0x006c, 1, 11, 0, 0},
615     {0x006e, 2, 13, 0, 0},
616     {0x0070, 1, 16, 0, 0},
617     {0x0075, 1, 14, 0, 0},
618     {0x006d, 1, 15, 0, 0},
619     {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK},
620     {0x0068, 1, 17, 0, 0},
621     {0x0061, 1, 18, 0, 0},
622     {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK},
623     {0x006e, 1, 20, 0, 0},
624     {0x0074, 1, 21, 0, 0},
625     {0x0072, 1, 22, 0, 0},
626     {0x006c, 1, 23, 0, 0},
627     {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL},
628     {0x0069, 1, 25, 0, 0},
629     {0x0067, 1, 26, 0, 0},
630     {0x0069, 1, 27, 0, 0},
631     {0x0074, 1, 28, 0, 0},
632     {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT},
633     {0x0072, 1, 30, 0, 0},
634     {0x0061, 1, 31, 0, 0},
635     {0x0070, 1, 32, 0, 0},
636     {0x0068, 1, 33, 0, 0},
637     {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK},
638     {0x006f, 1, 35, 0, 0},
639     {0x0077, 1, 36, 0, 0},
640     {0x0065, 1, 37, 0, 0},
641     {0x0072, 1, 38, 0, 0},
642     {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER},
643     {0x0072, 2, 41, 0, 0},
644     {0x0075, 1, 45, 0, 0},
645     {0x0069, 1, 42, 0, 0},
646     {0x006e, 1, 43, 0, 0},
647     {0x0074, 1, 44, 0, 0},
648     {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK},
649     {0x006e, 1, 46, 0, 0},
650     {0x0063, 1, 47, 0, 0},
651     {0x0074, 1, 48, 0, 0},
652     {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK},
653     {0x0070, 1, 50, 0, 0},
654     {0x0061, 1, 51, 0, 0},
655     {0x0063, 1, 52, 0, 0},
656     {0x0065, 1, 53, 0, 0},
657     {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK},
658     {0x0070, 1, 55, 0, 0},
659     {0x0070, 1, 56, 0, 0},
660     {0x0065, 1, 57, 0, 0},
661     {0x0072, 1, 58, 0, 0},
662     {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER},
663     {0x0064, 1, 60, 0, 0},
664     {0x0069, 1, 61, 0, 0},
665     {0x0067, 1, 62, 0, 0},
666     {0x0069, 1, 63, 0, 0},
667     {0x0074, 1, 64, 0, 0},
668     {0x003a, 1, 65, _ure_xdigit_setup, 0},
669 };
670 
671 /*
672  * Probe for one of the POSIX colon delimited character classes in the static
673  * trie.
674  */
675 static unsigned long
676 _ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym,
677                _ure_buffer_t *b)
678 {
679     int i;
680     unsigned long n;
681     _ure_trie_t *tp;
682     ucs2_t *sp, *ep;
683 
684     /*
685      * If the number of characters left is less than 7, then this cannot be
686      * interpreted as one of the colon delimited classes.
687      */
688     if (limit < 7)
689       return 0;
690 
691     sp = cp;
692     ep = sp + limit;
693     tp = cclass_trie;
694     for (i = 0; sp < ep && i < 8; i++, sp++) {
695         n = tp->len;
696 
697         for (; n > 0 && tp->key != *sp; tp++, n--) ;
698 
699         if (n == 0)
700           return 0;
701 
702         if (*sp == ':' && (i == 6 || i == 7)) {
703             sp++;
704             break;
705         }
706         if (sp + 1 < ep)
707           tp = cclass_trie + tp->next;
708     }
709     if (tp->func == 0)
710       return 0;
711 
712     (*tp->func)(sym, tp->mask, b);
713 
714     return sp - cp;
715 }
716 
717 /*
718  * Construct a list of ranges and return the number of characters consumed.
719  */
720 static unsigned long
721 _ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp,
722             _ure_buffer_t *b)
723 {
724     int range_end;
725     unsigned long n;
726     ucs2_t *sp, *ep;
727     ucs4_t c, last;
728     _ure_ccl_t *cclp;
729     _ure_range_t range;
730 
731     sp = cp;
732     ep = sp + limit;
733 
734     if (*sp == '^') {
735       symp->type = _URE_NCCLASS;
736       sp++;
737     } else
738       symp->type = _URE_CCLASS;
739 
740     for (last = 0, range_end = 0;
741          b->error == _URE_OK && sp < ep && *sp != ']'; ) {
742         c = *sp++;
743         if (c == '\\') {
744             if (sp == ep) {
745                 /*
746                  * The EOS was encountered when expecting the reverse solidus
747                  * to be followed by the character it is escaping.  Set an
748                  * error code and return the number of characters consumed up
749                  * to this point.
750                  */
751                 b->error = _URE_UNEXPECTED_EOS;
752                 return sp - cp;
753             }
754 
755             c = *sp++;
756             switch (c) {
757               case 'a':
758                 c = 0x07;
759                 break;
760               case 'b':
761                 c = 0x08;
762                 break;
763               case 'f':
764                 c = 0x0c;
765                 break;
766               case 'n':
767                 c = 0x0a;
768                 break;
769               case 'r':
770                 c = 0x0d;
771                 break;
772               case 't':
773                 c = 0x09;
774                 break;
775               case 'v':
776                 c = 0x0b;
777                 break;
778               case 'p':
779               case 'P':
780                 sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
781                 /*
782                  * Invert the bit mask of the properties if this is a negated
783                  * character class or if 'P' is used to specify a list of
784                  * character properties that should *not* match in a
785                  * character class.
786                  */
787                 if (c == 'P')
788                   symp->props = ~symp->props;
789                 continue;
790                 break;
791               case 'x':
792               case 'X':
793               case 'u':
794               case 'U':
795                 if (sp < ep &&
796                     ((*sp >= '0' && *sp <= '9') ||
797                      (*sp >= 'A' && *sp <= 'F') ||
798                      (*sp >= 'a' && *sp <= 'f')))
799                   sp += _ure_hex(sp, ep - sp, &c);
800             }
801         } else if (c == ':') {
802             /*
803              * Probe for a POSIX colon delimited character class.
804              */
805             sp--;
806             if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0)
807               sp++;
808             else {
809                 sp += n;
810                 continue;
811             }
812         }
813 
814         cclp = &symp->sym.ccl;
815 
816         /*
817          * Check to see if the current character is a low surrogate that needs
818          * to be combined with a preceding high surrogate.
819          */
820         if (last != 0) {
821             if (c >= 0xdc00 && c <= 0xdfff)
822               /*
823                * Construct the UTF16 character code.
824                */
825               c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff));
826             else {
827                 /*
828                  * Add the isolated high surrogate to the range.
829                  */
830                 if (range_end == 1)
831                   range.max_code = last & 0xffff;
832                 else
833                   range.min_code = range.max_code = last & 0xffff;
834 
835                 _ure_add_range(cclp, &range, b);
836                 range_end = 0;
837             }
838         }
839 
840         /*
841          * Clear the last character code.
842          */
843         last = 0;
844 
845         /*
846          * This slightly awkward code handles the different cases needed to
847          * construct a range.
848          */
849         if (c >= 0xd800 && c <= 0xdbff) {
850             /*
851              * If the high surrogate is followed by a range indicator, simply
852              * add it as the range start.  Otherwise, save it in case the next
853              * character is a low surrogate.
854              */
855             if (*sp == '-') {
856                 sp++;
857                 range.min_code = c;
858                 range_end = 1;
859             } else
860               last = c;
861         } else if (range_end == 1) {
862             range.max_code = c;
863             _ure_add_range(cclp, &range, b);
864             range_end = 0;
865         } else {
866             range.min_code = range.max_code = c;
867             if (*sp == '-') {
868                 sp++;
869                 range_end = 1;
870             } else
871               _ure_add_range(cclp, &range, b);
872         }
873     }
874 
875     if (sp < ep && *sp == ']')
876       sp++;
877     else
878       /*
879        * The parse was not terminated by the character class close symbol
880        * (']'), so set an error code.
881        */
882       b->error = _URE_CCLASS_OPEN;
883 
884     return sp - cp;
885 }
886 
887 /*
888  * Probe for a low surrogate hex code.
889  */
890 static unsigned long
891 _ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c)
892 {
893     ucs4_t i, code;
894     ucs2_t *sp, *ep;
895 
896     for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) {
897         if (*sp >= '0' && *sp <= '9')
898           code = (code << 4) + (*sp - '0');
899         else if (*sp >= 'A' && *sp <= 'F')
900           code = (code << 4) + ((*sp - 'A') + 10);
901         else if (*sp >= 'a' && *sp <= 'f')
902           code = (code << 4) + ((*sp - 'a') + 10);
903         else
904           break;
905     }
906 
907     *c = code;
908     return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;
909 }
910 
911 static unsigned long
912 _ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp,
913                     _ure_buffer_t *b)
914 {
915     ucs4_t c;
916     ucs2_t *sp, *ep;
917 
918     sp = sym;
919     ep = sym + limit;
920 
921     if ((c = *sp++) == '\\') {
922 
923         if (sp == ep) {
924             /*
925              * The EOS was encountered when expecting the reverse solidus to
926              * be followed by the character it is escaping.  Set an error code
927              * and return the number of characters consumed up to this point.
928              */
929             b->error = _URE_UNEXPECTED_EOS;
930             return sp - sym;
931         }
932 
933         c = *sp++;
934         switch (c) {
935           case 'p':
936           case 'P':
937             symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS;
938             sp += _ure_prop_list(sp, ep - sp, &symp->props, b);
939             break;
940           case 'a':
941             symp->type = _URE_CHAR;
942             symp->sym.chr = 0x07;
943             break;
944           case 'b':
945             symp->type = _URE_CHAR;
946             symp->sym.chr = 0x08;
947             break;
948           case 'f':
949             symp->type = _URE_CHAR;
950             symp->sym.chr = 0x0c;
951             break;
952           case 'n':
953             symp->type = _URE_CHAR;
954             symp->sym.chr = 0x0a;
955             break;
956           case 'r':
957             symp->type = _URE_CHAR;
958             symp->sym.chr = 0x0d;
959             break;
960           case 't':
961             symp->type = _URE_CHAR;
962             symp->sym.chr = 0x09;
963             break;
964           case 'v':
965             symp->type = _URE_CHAR;
966             symp->sym.chr = 0x0b;
967             break;
968           case 'x':
969           case 'X':
970           case 'u':
971           case 'U':
972             /*
973              * Collect between 1 and 4 digits representing a UCS2 code.  Fall
974              * through to the next case.
975              */
976             if (sp < ep &&
977                 ((*sp >= '0' && *sp <= '9') ||
978                  (*sp >= 'A' && *sp <= 'F') ||
979                  (*sp >= 'a' && *sp <= 'f')))
980               sp += _ure_hex(sp, ep - sp, &c);
981             /* FALLTHROUGH */
982           default:
983             /*
984              * Simply add an escaped character here.
985              */
986             symp->type = _URE_CHAR;
987             symp->sym.chr = c;
988         }
989     } else if (c == '^' || c == '$')
990       /*
991        * Handle the BOL and EOL anchors.  This actually consists simply of
992        * setting a flag that indicates that the user supplied anchor match
993        * function should be called.  This needs to be done instead of simply
994        * matching line/paragraph separators because beginning-of-text and
995        * end-of-text tests are needed as well.
996        */
997       symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR;
998     else if (c == '[')
999       /*
1000        * Construct a character class.
1001        */
1002       sp += _ure_cclass(sp, ep - sp, symp, b);
1003     else if (c == '.')
1004       symp->type = _URE_ANY_CHAR;
1005     else {
1006         symp->type = _URE_CHAR;
1007         symp->sym.chr = c;
1008     }
1009 
1010     /*
1011      * If the symbol type happens to be a character and is a high surrogate,
1012      * then probe forward to see if it is followed by a low surrogate that
1013      * needs to be added.
1014      */
1015     if (sp < ep && symp->type == _URE_CHAR &&
1016         0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) {
1017 
1018         if (0xdc00 <= *sp && *sp <= 0xdfff) {
1019             symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1020                                        (*sp & 0x03ff));
1021             sp++;
1022         } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' ||
1023                                  *(sp + 1) == 'u' || *(sp + 1) == 'U')) {
1024             sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c);
1025             if (0xdc00 <= c && c <= 0xdfff) {
1026                 /*
1027                  * Take into account the \[xu] in front of the hex code.
1028                  */
1029                 sp += 2;
1030                 symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) |
1031                                            (c & 0x03ff));
1032             }
1033         }
1034     }
1035 
1036     /*
1037      * Last, make sure any _URE_CHAR type symbols are changed to lower case if
1038      * the `casefold' flag is set.
1039      */
1040     if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR)
1041       symp->sym.chr = _ure_tolower(symp->sym.chr);
1042 
1043     /*
1044      * If the symbol constructed is anything other than one of the anchors,
1045      * make sure the _URE_DFA_BLANKLINE flag is removed.
1046      */
1047     if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR)
1048       b->flags &= ~_URE_DFA_BLANKLINE;
1049 
1050     /*
1051      * Return the number of characters consumed.
1052      */
1053     return sp - sym;
1054 }
1055 
1056 static int
1057 _ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b)
1058 {
1059     if (a->type != b->type || a->mods != b->mods || a->props != b->props)
1060       return 1;
1061 
1062     if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) {
1063         if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used)
1064           return 1;
1065         if (a->sym.ccl.ranges_used > 0 &&
1066             memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges,
1067                    sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0)
1068           return 1;
1069     } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr)
1070       return 1;
1071     return 0;
1072 }
1073 
1074 /*
1075  * Construct a symbol, but only keep unique symbols.
1076  */
1077 static ucs2_t
1078 _ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed,
1079                  _ure_buffer_t *b)
1080 {
1081     ucs2_t i;
1082     _ure_symtab_t *sp, symbol;
1083 
1084     /*
1085      * Build the next symbol so we can test to see if it is already in the
1086      * symbol table.
1087      */
1088     (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t));
1089     *consumed = _ure_compile_symbol(sym, limit, &symbol, b);
1090 
1091     /*
1092      * Check to see if the symbol exists.
1093      */
1094     for (i = 0, sp = b->symtab;
1095          i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ;
1096 
1097     if (i < b->symtab_used) {
1098         /*
1099          * Free up any ranges used for the symbol.
1100          */
1101         if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) &&
1102             symbol.sym.ccl.ranges_size > 0)
1103           free((char *) symbol.sym.ccl.ranges);
1104 
1105         return b->symtab[i].id;
1106     }
1107 
1108     /*
1109      * Need to add the new symbol.
1110      */
1111     if (b->symtab_used == b->symtab_size) {
1112         if (b->symtab_size == 0)
1113           b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3);
1114         else
1115           b->symtab = (_ure_symtab_t *)
1116               realloc((char *) b->symtab,
1117                       sizeof(_ure_symtab_t) * (b->symtab_size + 8));
1118         sp = b->symtab + b->symtab_size;
1119         (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3);
1120         b->symtab_size += 8;
1121     }
1122 
1123     symbol.id = b->symtab_used++;
1124     (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol,
1125                   sizeof(_ure_symtab_t));
1126 
1127     return symbol.id;
1128 }
1129 
1130 /*************************************************************************
1131  *
1132  * End symbol parse functions.
1133  *
1134  *************************************************************************/
1135 
1136 static ucs2_t
1137 _ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b)
1138 {
1139     ucs2_t i;
1140 
1141     if (b == 0)
1142       return _URE_NOOP;
1143 
1144     /*
1145      * Determine if the expression already exists or not.
1146      */
1147     for (i = 0; i < b->expr_used; i++) {
1148         if (b->expr[i].type == type && b->expr[i].lhs == lhs &&
1149             b->expr[i].rhs == rhs)
1150           break;
1151     }
1152     if (i < b->expr_used)
1153       return i;
1154 
1155     /*
1156      * Need to add a new expression.
1157      */
1158     if (b->expr_used == b->expr_size) {
1159         if (b->expr_size == 0)
1160           b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3);
1161         else
1162           b->expr = (_ure_elt_t *)
1163               realloc((char *) b->expr,
1164                       sizeof(_ure_elt_t) * (b->expr_size + 8));
1165         b->expr_size += 8;
1166     }
1167 
1168     b->expr[b->expr_used].onstack = 0;
1169     b->expr[b->expr_used].type = type;
1170     b->expr[b->expr_used].lhs = lhs;
1171     b->expr[b->expr_used].rhs = rhs;
1172 
1173     return b->expr_used++;
1174 }
1175 
1176 static unsigned char spmap[] = {
1177     0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
1178     0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1179     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1180 };
1181 
1182 #define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \
1183                             (spmap[(cc) >> 3] & (1 << ((cc) & 7))))
1184 
1185 /*
1186  * Convert the regular expression into an NFA in a form that will be easy to
1187  * reduce to a DFA.  The starting state for the reduction will be returned.
1188  */
1189 static ucs2_t
1190 _ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b)
1191 {
1192     ucs2_t c, state, top, sym, *sp, *ep;
1193     unsigned long used;
1194 
1195     state = _URE_NOOP;
1196 
1197     sp = re;
1198     ep = sp + relen;
1199     while (b->error == _URE_OK && sp < ep) {
1200         c = *sp++;
1201         switch (c) {
1202           case '(':
1203             _ure_push(_URE_PAREN, b);
1204             break;
1205           case ')':
1206             /*
1207              * Check for the case of too many close parentheses.
1208              */
1209             if (_ure_peek(b) == _URE_NOOP) {
1210                 b->error = _URE_UNBALANCED_GROUP;
1211                 break;
1212             }
1213 
1214             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1215               /*
1216                * Make an expression with the AND or OR operator and its right
1217                * hand side.
1218                */
1219               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1220 
1221             /*
1222              * Remove the _URE_PAREN off the stack.
1223              */
1224             (void) _ure_pop(b);
1225             break;
1226           case '*':
1227             state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b);
1228             break;
1229           case '+':
1230             state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b);
1231             break;
1232           case '?':
1233             state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b);
1234             break;
1235           case '|':
1236             while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1237               /*
1238                * Make an expression with the AND or OR operator and its right
1239                * hand side.
1240                */
1241               state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1242 
1243             _ure_push(state, b);
1244             _ure_push(_URE_OR, b);
1245             break;
1246           default:
1247             sp--;
1248             sym = _ure_make_symbol(sp, ep - sp, &used, b);
1249             sp += used;
1250             state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b);
1251             break;
1252         }
1253 
1254         if (c != '(' && c != '|' && sp < ep &&
1255             (!_ure_isspecial(*sp) || *sp == '(')) {
1256             _ure_push(state, b);
1257             _ure_push(_URE_AND, b);
1258         }
1259     }
1260     while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR)
1261       /*
1262        * Make an expression with the AND or OR operator and its right
1263        * hand side.
1264        */
1265       state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b);
1266 
1267     if (b->stack.slist_used > 0)
1268       b->error = _URE_UNBALANCED_GROUP;
1269 
1270     return (b->error == _URE_OK) ? state : _URE_NOOP;
1271 }
1272 
1273 static void
1274 _ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b)
1275 {
1276     ucs2_t i, *stp;
1277     _ure_symtab_t *sp;
1278 
1279     /*
1280      * Locate the symbol in the symbol table so the state can be added.
1281      * If the symbol doesn't exist, then a real problem exists.
1282      */
1283     for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
1284          i++, sp++) ;
1285 
1286     /*
1287      * Now find out if the state exists in the symbol's state list.
1288      */
1289     for (i = 0, stp = sp->states.slist;
1290          i < sp->states.slist_used && state > *stp; i++, stp++) ;
1291 
1292     if (i == sp->states.slist_used || state < *stp) {
1293         /*
1294          * Need to add the state in order.
1295          */
1296         if (sp->states.slist_used == sp->states.slist_size) {
1297             if (sp->states.slist_size == 0)
1298               sp->states.slist = (ucs2_t *) malloc(sizeof(ucs2_t) << 3);
1299             else
1300               sp->states.slist = (ucs2_t *)
1301                   realloc((char *) sp->states.slist,
1302                           sizeof(ucs2_t) * (sp->states.slist_size + 8));
1303             sp->states.slist_size += 8;
1304         }
1305         if (i < sp->states.slist_used)
1306           (void) _ure_memmove((char *) (sp->states.slist + i + 1),
1307                               (char *) (sp->states.slist + i),
1308                               sizeof(ucs2_t) * (sp->states.slist_used - i));
1309         sp->states.slist[i] = state;
1310         sp->states.slist_used++;
1311     }
1312 }
1313 
1314 static ucs2_t
1315 _ure_add_state(ucs2_t nstates, ucs2_t *states, _ure_buffer_t *b)
1316 {
1317     ucs2_t i;
1318     _ure_state_t *sp;
1319 
1320     for (i = 0, sp = b->states.states; i < b->states.states_used; i++, sp++) {
1321         if (sp->st.slist_used == nstates &&
1322             memcmp((char *) states, (char *) sp->st.slist,
1323                    sizeof(ucs2_t) * nstates) == 0)
1324           break;
1325     }
1326 
1327     if (i == b->states.states_used) {
1328         /*
1329          * Need to add a new DFA state (set of NFA states).
1330          */
1331         if (b->states.states_used == b->states.states_size) {
1332             if (b->states.states_size == 0)
1333               b->states.states = (_ure_state_t *)
1334                   malloc(sizeof(_ure_state_t) << 3);
1335             else
1336               b->states.states = (_ure_state_t *)
1337                   realloc((char *) b->states.states,
1338                           sizeof(_ure_state_t) * (b->states.states_size + 8));
1339             sp = b->states.states + b->states.states_size;
1340             (void) memset((char *) sp, '\0', sizeof(_ure_state_t) << 3);
1341             b->states.states_size += 8;
1342         }
1343 
1344         sp = b->states.states + b->states.states_used++;
1345         sp->id = i;
1346 
1347         if (sp->st.slist_used + nstates > sp->st.slist_size) {
1348             if (sp->st.slist_size == 0)
1349               sp->st.slist = (ucs2_t *)
1350                   malloc(sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1351             else
1352               sp->st.slist = (ucs2_t *)
1353                   realloc((char *) sp->st.slist,
1354                           sizeof(ucs2_t) * (sp->st.slist_used + nstates));
1355             sp->st.slist_size = sp->st.slist_used + nstates;
1356         }
1357         sp->st.slist_used = nstates;
1358         (void) AC_MEMCPY((char *) sp->st.slist, (char *) states,
1359                       sizeof(ucs2_t) * nstates);
1360     }
1361 
1362     /*
1363      * Return the ID of the DFA state representing a group of NFA states.
1364      */
1365     return i;
1366 }
1367 
1368 static void
1369 _ure_reduce(ucs2_t start, _ure_buffer_t *b)
1370 {
1371     ucs2_t i, j, state, eval, syms, rhs;
1372     ucs2_t s1, s2, ns1, ns2;
1373     _ure_state_t *sp;
1374     _ure_symtab_t *smp;
1375 
1376     b->reducing = 1;
1377 
1378     /*
1379      * Add the starting state for the reduction.
1380      */
1381     _ure_add_state(1, &start, b);
1382 
1383     /*
1384      * Process each set of NFA states that get created.
1385      */
1386     for (i = 0; i < b->states.states_used; i++) {
1387         sp = b->states.states + i;
1388 
1389         /*
1390          * Push the current states on the stack.
1391          */
1392         for (j = 0; j < sp->st.slist_used; j++)
1393           _ure_push(sp->st.slist[j], b);
1394 
1395         /*
1396          * Reduce the NFA states.
1397          */
1398         for (j = sp->accepting = syms = 0; j < b->stack.slist_used; j++) {
1399             state = b->stack.slist[j];
1400             eval = 1;
1401 
1402             /*
1403              * This inner loop is the iterative equivalent of recursively
1404              * reducing subexpressions generated as a result of a reduction.
1405              */
1406             while (eval) {
1407                 switch (b->expr[state].type) {
1408                   case _URE_SYMBOL:
1409                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1410                     _ure_add_symstate(b->expr[state].lhs, ns1, b);
1411                     syms++;
1412                     eval = 0;
1413                     break;
1414                   case _URE_ONE:
1415                     sp->accepting = 1;
1416                     eval = 0;
1417                     break;
1418                   case _URE_QUEST:
1419                     s1 = b->expr[state].lhs;
1420                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1421                     state = _ure_make_expr(_URE_OR, ns1, s1, b);
1422                     break;
1423                   case _URE_PLUS:
1424                     s1 = b->expr[state].lhs;
1425                     ns1 = _ure_make_expr(_URE_STAR, s1, _URE_NOOP, b);
1426                     state = _ure_make_expr(_URE_AND, s1, ns1, b);
1427                     break;
1428                   case _URE_STAR:
1429                     s1 = b->expr[state].lhs;
1430                     ns1 = _ure_make_expr(_URE_ONE, _URE_NOOP, _URE_NOOP, b);
1431                     ns2 = _ure_make_expr(_URE_PLUS, s1, _URE_NOOP, b);
1432                     state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1433                     break;
1434                   case _URE_OR:
1435                     s1 = b->expr[state].lhs;
1436                     s2 = b->expr[state].rhs;
1437                     _ure_push(s1, b);
1438                     _ure_push(s2, b);
1439                     eval = 0;
1440                     break;
1441                   case _URE_AND:
1442                     s1 = b->expr[state].lhs;
1443                     s2 = b->expr[state].rhs;
1444                     switch (b->expr[s1].type) {
1445                       case _URE_SYMBOL:
1446                         _ure_add_symstate(b->expr[s1].lhs, s2, b);
1447                         syms++;
1448                         eval = 0;
1449                         break;
1450                       case _URE_ONE:
1451                         state = s2;
1452                         break;
1453                       case _URE_QUEST:
1454                         ns1 = b->expr[s1].lhs;
1455                         ns2 = _ure_make_expr(_URE_AND, ns1, s2, b);
1456                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
1457                         break;
1458                       case _URE_PLUS:
1459                         ns1 = b->expr[s1].lhs;
1460                         ns2 = _ure_make_expr(_URE_OR, s2, state, b);
1461                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1462                         break;
1463                       case _URE_STAR:
1464                         ns1 = b->expr[s1].lhs;
1465                         ns2 = _ure_make_expr(_URE_AND, ns1, state, b);
1466                         state = _ure_make_expr(_URE_OR, s2, ns2, b);
1467                         break;
1468                       case _URE_OR:
1469                         ns1 = b->expr[s1].lhs;
1470                         ns2 = b->expr[s1].rhs;
1471                         ns1 = _ure_make_expr(_URE_AND, ns1, s2, b);
1472                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1473                         state = _ure_make_expr(_URE_OR, ns1, ns2, b);
1474                         break;
1475                       case _URE_AND:
1476                         ns1 = b->expr[s1].lhs;
1477                         ns2 = b->expr[s1].rhs;
1478                         ns2 = _ure_make_expr(_URE_AND, ns2, s2, b);
1479                         state = _ure_make_expr(_URE_AND, ns1, ns2, b);
1480                         break;
1481                     }
1482                 }
1483             }
1484         }
1485 
1486         /*
1487          * Clear the state stack.
1488          */
1489         while (_ure_pop(b) != _URE_NOOP) ;
1490 
1491         /*
1492          * Reset the state pointer because the reduction may have moved it
1493          * during a reallocation.
1494          */
1495         sp = b->states.states + i;
1496 
1497         /*
1498          * Generate the DFA states for the symbols collected during the
1499          * current reduction.
1500          */
1501         if (sp->trans_used + syms > sp->trans_size) {
1502             if (sp->trans_size == 0)
1503               sp->trans = (_ure_elt_t *)
1504                   malloc(sizeof(_ure_elt_t) * (sp->trans_used + syms));
1505             else
1506               sp->trans = (_ure_elt_t *)
1507                   realloc((char *) sp->trans,
1508                           sizeof(_ure_elt_t) * (sp->trans_used + syms));
1509             sp->trans_size = sp->trans_used + syms;
1510         }
1511 
1512         /*
1513          * Go through the symbol table and generate the DFA state transitions
1514          * for each symbol that has collected NFA states.
1515          */
1516         for (j = syms = 0, smp = b->symtab; j < b->symtab_used; j++, smp++) {
1517             sp = b->states.states + i;
1518 
1519             if (smp->states.slist_used > 0) {
1520                 sp->trans[syms].lhs = smp->id;
1521                 rhs = _ure_add_state(smp->states.slist_used,
1522                                      smp->states.slist, b);
1523                 /*
1524                  * Reset the state pointer in case the reallocation moves it
1525                  * in memory.
1526                  */
1527                 sp = b->states.states + i;
1528                 sp->trans[syms].rhs = rhs;
1529 
1530                 smp->states.slist_used = 0;
1531                 syms++;
1532             }
1533         }
1534 
1535         /*
1536          * Set the number of transitions actually used.
1537          */
1538         sp->trans_used = syms;
1539     }
1540     b->reducing = 0;
1541 }
1542 
1543 static void
1544 _ure_add_equiv(ucs2_t l, ucs2_t r, _ure_buffer_t *b)
1545 {
1546     ucs2_t tmp;
1547 
1548     l = b->states.states[l].id;
1549     r = b->states.states[r].id;
1550 
1551     if (l == r)
1552       return;
1553 
1554     if (l > r) {
1555         tmp = l;
1556         l = r;
1557         r = tmp;
1558     }
1559 
1560     /*
1561      * Check to see if the equivalence pair already exists.
1562      */
1563     for (tmp = 0; tmp < b->equiv_used &&
1564              (b->equiv[tmp].l != l || b->equiv[tmp].r != r);
1565          tmp++) ;
1566 
1567     if (tmp < b->equiv_used)
1568       return;
1569 
1570     if (b->equiv_used == b->equiv_size) {
1571         if (b->equiv_size == 0)
1572           b->equiv = (_ure_equiv_t *) malloc(sizeof(_ure_equiv_t) << 3);
1573         else
1574           b->equiv = (_ure_equiv_t *) realloc((char *) b->equiv,
1575                                               sizeof(_ure_equiv_t) *
1576                                               (b->equiv_size + 8));
1577         b->equiv_size += 8;
1578     }
1579     b->equiv[b->equiv_used].l = l;
1580     b->equiv[b->equiv_used].r = r;
1581     b->equiv_used++;
1582 }
1583 
1584 /*
1585  * Merge the DFA states that are equivalent.
1586  */
1587 static void
1588 _ure_merge_equiv(_ure_buffer_t *b)
1589 {
1590     ucs2_t i, j, k, eq, done;
1591     _ure_state_t *sp1, *sp2, *ls, *rs;
1592 
1593     for (i = 0; i < b->states.states_used; i++) {
1594         sp1 = b->states.states + i;
1595         if (sp1->id != i)
1596           continue;
1597         for (j = 0; j < i; j++) {
1598             sp2 = b->states.states + j;
1599             if (sp2->id != j)
1600               continue;
1601             b->equiv_used = 0;
1602             _ure_add_equiv(i, j, b);
1603             for (eq = 0, done = 0; eq < b->equiv_used; eq++) {
1604                 ls = b->states.states + b->equiv[eq].l;
1605                 rs = b->states.states + b->equiv[eq].r;
1606                 if (ls->accepting != rs->accepting ||
1607                     ls->trans_used != rs->trans_used) {
1608                     done = 1;
1609                     break;
1610                 }
1611                 for (k = 0; k < ls->trans_used &&
1612                          ls->trans[k].lhs == rs->trans[k].lhs; k++) ;
1613                 if (k < ls->trans_used) {
1614                     done = 1;
1615                     break;
1616                 }
1617 
1618                 for (k = 0; k < ls->trans_used; k++)
1619                   _ure_add_equiv(ls->trans[k].rhs, rs->trans[k].rhs, b);
1620             }
1621             if (done == 0)
1622               break;
1623         }
1624         for (eq = 0; j < i && eq < b->equiv_used; eq++)
1625           b->states.states[b->equiv[eq].r].id =
1626               b->states.states[b->equiv[eq].l].id;
1627     }
1628 
1629     /*
1630      * Renumber the states appropriately.
1631      */
1632     for (i = eq = 0, sp1 = b->states.states; i < b->states.states_used;
1633          sp1++, i++)
1634       sp1->id = (sp1->id == i) ? eq++ : b->states.states[sp1->id].id;
1635 }
1636 
1637 /*************************************************************************
1638  *
1639  * API.
1640  *
1641  *************************************************************************/
1642 
1643 ure_buffer_t
1644 ure_buffer_create(void)
1645 {
1646     ure_buffer_t b;
1647 
1648     b = (ure_buffer_t) calloc(1, sizeof(_ure_buffer_t));
1649 
1650     return b;
1651 }
1652 
1653 void
1654 ure_buffer_free(ure_buffer_t buf)
1655 {
1656     unsigned long i;
1657 
1658     if (buf == 0)
1659       return;
1660 
1661     if (buf->stack.slist_size > 0)
1662       free((char *) buf->stack.slist);
1663 
1664     if (buf->expr_size > 0)
1665       free((char *) buf->expr);
1666 
1667     for (i = 0; i < buf->symtab_size; i++) {
1668         if (buf->symtab[i].states.slist_size > 0)
1669           free((char *) buf->symtab[i].states.slist);
1670     }
1671 
1672     if (buf->symtab_size > 0)
1673       free((char *) buf->symtab);
1674 
1675     for (i = 0; i < buf->states.states_size; i++) {
1676         if (buf->states.states[i].trans_size > 0)
1677           free((char *) buf->states.states[i].trans);
1678         if (buf->states.states[i].st.slist_size > 0)
1679           free((char *) buf->states.states[i].st.slist);
1680     }
1681 
1682     if (buf->states.states_size > 0)
1683       free((char *) buf->states.states);
1684 
1685     if (buf->equiv_size > 0)
1686       free((char *) buf->equiv);
1687 
1688     free((char *) buf);
1689 }
1690 
1691 ure_dfa_t
1692 ure_compile(ucs2_t *re, unsigned long relen, int casefold, ure_buffer_t buf)
1693 {
1694     ucs2_t i, j, state;
1695     _ure_state_t *sp;
1696     _ure_dstate_t *dsp;
1697     _ure_trans_t *tp;
1698     ure_dfa_t dfa;
1699 
1700     if (re == 0 || *re == 0 || relen == 0 || buf == 0)
1701       return 0;
1702 
1703     /*
1704      * Reset the various fields of the compilation buffer.  Default the flags
1705      * to indicate the presense of the "^$" pattern.  If any other pattern
1706      * occurs, then this flag will be removed.  This is done to catch this
1707      * special pattern and handle it specially when matching.
1708      */
1709     buf->flags = _URE_DFA_BLANKLINE | ((casefold) ? _URE_DFA_CASEFOLD : 0);
1710     buf->reducing = 0;
1711     buf->stack.slist_used = 0;
1712     buf->expr_used = 0;
1713 
1714     for (i = 0; i < buf->symtab_used; i++)
1715       buf->symtab[i].states.slist_used = 0;
1716     buf->symtab_used = 0;
1717 
1718     for (i = 0; i < buf->states.states_used; i++) {
1719         buf->states.states[i].st.slist_used = 0;
1720         buf->states.states[i].trans_used = 0;
1721     }
1722     buf->states.states_used = 0;
1723 
1724     /*
1725      * Construct the NFA.  If this stage returns a 0, then an error occured or
1726      * an empty expression was passed.
1727      */
1728     if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP)
1729       return 0;
1730 
1731     /*
1732      * Do the expression reduction to get the initial DFA.
1733      */
1734     _ure_reduce(state, buf);
1735 
1736     /*
1737      * Merge all the equivalent DFA states.
1738      */
1739     _ure_merge_equiv(buf);
1740 
1741     /*
1742      * Construct the minimal DFA.
1743      */
1744     dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t));
1745     (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t));
1746 
1747     dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE);
1748 
1749     /*
1750      * Free up the NFA state groups and transfer the symbols from the buffer
1751      * to the DFA.
1752      */
1753     for (i = 0; i < buf->symtab_size; i++) {
1754         if (buf->symtab[i].states.slist_size > 0)
1755           free((char *) buf->symtab[i].states.slist);
1756     }
1757     dfa->syms = buf->symtab;
1758     dfa->nsyms = buf->symtab_used;
1759 
1760     buf->symtab_used = buf->symtab_size = 0;
1761 
1762     /*
1763      * Collect the total number of states and transitions needed for the DFA.
1764      */
1765     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1766          i++, sp++) {
1767         if (sp->id == state) {
1768             dfa->nstates++;
1769             dfa->ntrans += sp->trans_used;
1770             state++;
1771         }
1772     }
1773 
1774     /*
1775      * Allocate enough space for the states and transitions.
1776      */
1777     dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) *
1778                                            dfa->nstates);
1779     dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans);
1780 
1781     /*
1782      * Actually transfer the DFA states from the buffer.
1783      */
1784     dsp = dfa->states;
1785     tp = dfa->trans;
1786     for (i = state = 0, sp = buf->states.states; i < buf->states.states_used;
1787          i++, sp++) {
1788         if (sp->id == state) {
1789             dsp->trans = tp;
1790             dsp->ntrans = sp->trans_used;
1791             dsp->accepting = sp->accepting;
1792 
1793             /*
1794              * Add the transitions for the state.
1795              */
1796             for (j = 0; j < dsp->ntrans; j++, tp++) {
1797                 tp->symbol = sp->trans[j].lhs;
1798                 tp->next_state = buf->states.states[sp->trans[j].rhs].id;
1799             }
1800 
1801             dsp++;
1802             state++;
1803         }
1804     }
1805 
1806     return dfa;
1807 }
1808 
1809 void
1810 ure_dfa_free(ure_dfa_t dfa)
1811 {
1812     ucs2_t i;
1813 
1814     if (dfa == 0)
1815       return;
1816 
1817     for (i = 0; i < dfa->nsyms; i++) {
1818         if ((dfa->syms[i].type == _URE_CCLASS ||
1819              dfa->syms[i].type == _URE_NCCLASS) &&
1820             dfa->syms[i].sym.ccl.ranges_size > 0)
1821           free((char *) dfa->syms[i].sym.ccl.ranges);
1822     }
1823     if (dfa->nsyms > 0)
1824       free((char *) dfa->syms);
1825 
1826     if (dfa->nstates > 0)
1827       free((char *) dfa->states);
1828     if (dfa->ntrans > 0)
1829       free((char *) dfa->trans);
1830     free((char *) dfa);
1831 }
1832 
1833 void
1834 ure_write_dfa(ure_dfa_t dfa, FILE *out)
1835 {
1836     ucs2_t i, j, k, h, l;
1837     _ure_dstate_t *sp;
1838     _ure_symtab_t *sym;
1839     _ure_range_t *rp;
1840 
1841     if (dfa == 0 || out == 0)
1842       return;
1843 
1844     /*
1845      * Write all the different character classes.
1846      */
1847     for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) {
1848         if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) {
1849             fprintf(out, "C%hd = ", sym->id);
1850             if (sym->sym.ccl.ranges_used > 0) {
1851                 putc('[', out);
1852                 if (sym->type == _URE_NCCLASS)
1853                   putc('^', out);
1854             }
1855             if (sym->props != 0) {
1856                 if (sym->type == _URE_NCCLASS)
1857                   fprintf(out, "\\P");
1858                 else
1859                   fprintf(out, "\\p");
1860                 for (k = h = 0; k < 32; k++) {
1861                     if (sym->props & (1 << k)) {
1862                         if (h != 0)
1863                           putc(',', out);
1864                         fprintf(out, "%hd", k + 1);
1865                         h = 1;
1866                     }
1867                 }
1868             }
1869             /*
1870              * Dump the ranges.
1871              */
1872             for (k = 0, rp = sym->sym.ccl.ranges;
1873                  k < sym->sym.ccl.ranges_used; k++, rp++) {
1874                 /*
1875                  * Check for UTF16 characters.
1876                  */
1877                 if (0x10000 <= rp->min_code &&
1878                     rp->min_code <= 0x10ffff) {
1879                     h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800);
1880                     l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00);
1881                     fprintf(out, "\\x%04hX\\x%04hX", h, l);
1882                 } else
1883                   fprintf(out, "\\x%04lX", rp->min_code & 0xffff);
1884                 if (rp->max_code != rp->min_code) {
1885                     putc('-', out);
1886                     if (rp->max_code >= 0x10000 &&
1887                         rp->max_code <= 0x10ffff) {
1888                         h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800);
1889                         l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00);
1890                         fprintf(out, "\\x%04hX\\x%04hX", h, l);
1891                     } else
1892                       fprintf(out, "\\x%04lX", rp->max_code & 0xffff);
1893                 }
1894             }
1895             if (sym->sym.ccl.ranges_used > 0)
1896               putc(']', out);
1897             putc('\n', out);
1898         }
1899     }
1900 
1901     for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) {
1902         fprintf(out, "S%hd = ", i);
1903         if (sp->accepting) {
1904             fprintf(out, "1 ");
1905             if (sp->ntrans)
1906               fprintf(out, "| ");
1907         }
1908         for (j = 0; j < sp->ntrans; j++) {
1909             if (j > 0)
1910               fprintf(out, "| ");
1911 
1912             sym = dfa->syms + sp->trans[j].symbol;
1913             switch (sym->type) {
1914               case _URE_CHAR:
1915                 if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) {
1916                     /*
1917                      * Take care of UTF16 characters.
1918                      */
1919                     h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800);
1920                     l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00);
1921                     fprintf(out, "\\x%04hX\\x%04hX ", h, l);
1922                 } else
1923                   fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff);
1924                 break;
1925               case _URE_ANY_CHAR:
1926                 fprintf(out, "<any> ");
1927                 break;
1928               case _URE_BOL_ANCHOR:
1929                 fprintf(out, "<bol-anchor> ");
1930                 break;
1931               case _URE_EOL_ANCHOR:
1932                 fprintf(out, "<eol-anchor> ");
1933                 break;
1934               case _URE_CCLASS:
1935               case _URE_NCCLASS:
1936                 fprintf(out, "[C%hd] ", sym->id);
1937                 break;
1938             }
1939             fprintf(out, "S%hd", sp->trans[j].next_state);
1940             if (j + 1 < sp->ntrans)
1941               putc(' ', out);
1942         }
1943         putc('\n', out);
1944     }
1945 }
1946 
1947 #define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\
1948                         (cc) == 0x2029)
1949 
1950 int
1951 ure_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen,
1952          unsigned long *match_start, unsigned long *match_end)
1953 {
1954     int i, j, matched, found, skip;
1955     unsigned long ms, me;
1956     ucs4_t c;
1957     ucs2_t *sp, *ep, *lp;
1958     _ure_dstate_t *stp;
1959     _ure_symtab_t *sym;
1960     _ure_range_t *rp;
1961 
1962     if (dfa == 0 || text == 0)
1963       return 0;
1964 
1965     /*
1966      * Handle the special case of an empty string matching the "^$" pattern.
1967      */
1968     if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) {
1969         *match_start = *match_end = 0;
1970         return 1;
1971     }
1972 
1973     sp = text;
1974     ep = sp + textlen;
1975 
1976     ms = me = ~0;
1977 
1978     stp = dfa->states;
1979 
1980     for (found = skip = 0; found == 0 && sp < ep; ) {
1981         lp = sp;
1982         c = *sp++;
1983 
1984         /*
1985          * Check to see if this is a high surrogate that should be
1986          * combined with a following low surrogate.
1987          */
1988         if (sp < ep && 0xd800 <= c && c <= 0xdbff &&
1989             0xdc00 <= *sp && *sp <= 0xdfff)
1990           c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff));
1991 
1992         /*
1993          * Determine if the character is non-spacing and should be skipped.
1994          */
1995         if (_ure_matches_properties(_URE_NONSPACING, c) &&
1996             (flags & URE_IGNORE_NONSPACING)) {
1997             sp++;
1998             continue;
1999         }
2000 
2001         if (dfa->flags & _URE_DFA_CASEFOLD)
2002           c = _ure_tolower(c);
2003 
2004         /*
2005          * See if one of the transitions matches.
2006          */
2007         for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) {
2008             sym = dfa->syms + stp->trans[i].symbol;
2009             switch (sym->type) {
2010               case _URE_ANY_CHAR:
2011                 if ((flags & URE_DOT_MATCHES_SEPARATORS) ||
2012                     !_ure_issep(c))
2013                   matched = 1;
2014                 break;
2015               case _URE_CHAR:
2016                 if (c == sym->sym.chr)
2017                   matched = 1;
2018                 break;
2019               case _URE_BOL_ANCHOR:
2020                 if (lp == text) {
2021                     sp = lp;
2022                     matched = 1;
2023                 } else if (_ure_issep(c)) {
2024                     if (c == '\r' && sp < ep && *sp == '\n')
2025                       sp++;
2026                     lp = sp;
2027                     matched = 1;
2028                 }
2029                 break;
2030               case _URE_EOL_ANCHOR:
2031                 if (_ure_issep(c)) {
2032                     /*
2033                      * Put the pointer back before the separator so the match
2034                      * end position will be correct.  This case will also
2035                      * cause the `sp' pointer to be advanced over the current
2036                      * separator once the match end point has been recorded.
2037                      */
2038                     sp = lp;
2039                     matched = 1;
2040                 }
2041                 break;
2042               case _URE_CCLASS:
2043               case _URE_NCCLASS:
2044                 if (sym->props != 0)
2045                   matched = _ure_matches_properties(sym->props, c);
2046                 for (j = 0, rp = sym->sym.ccl.ranges;
2047                      j < sym->sym.ccl.ranges_used; j++, rp++) {
2048                     if (rp->min_code <= c && c <= rp->max_code)
2049                       matched = 1;
2050                 }
2051                 if (sym->type == _URE_NCCLASS)
2052                   matched = !matched;
2053                 break;
2054             }
2055 
2056             if (matched) {
2057                 if (ms == ~0UL)
2058                   ms = lp - text;
2059                 else
2060                   me = sp - text;
2061                 stp = dfa->states + stp->trans[i].next_state;
2062 
2063                 /*
2064                  * If the match was an EOL anchor, adjust the pointer past the
2065                  * separator that caused the match.  The correct match
2066                  * position has been recorded already.
2067                  */
2068                 if (sym->type == _URE_EOL_ANCHOR) {
2069                     /*
2070                      * Skip the character that caused the match.
2071                      */
2072                     sp++;
2073 
2074                     /*
2075                      * Handle the infamous CRLF situation.
2076                      */
2077                     if (sp < ep && c == '\r' && *sp == '\n')
2078                       sp++;
2079                 }
2080             }
2081         }
2082 
2083         if (matched == 0) {
2084             if (stp->accepting == 0) {
2085                 /*
2086                  * If the last state was not accepting, then reset
2087                  * and start over.
2088                  */
2089                 stp = dfa->states;
2090                 ms = me = ~0;
2091             } else
2092               /*
2093                * The last state was accepting, so terminate the matching
2094                * loop to avoid more work.
2095                */
2096               found = 1;
2097         } else if (sp == ep) {
2098             if (!stp->accepting) {
2099                 /*
2100                  * This ugly hack is to make sure the end-of-line anchors
2101                  * match when the source text hits the end.  This is only done
2102                  * if the last subexpression matches.
2103                  */
2104                 for (i = 0; found == 0 && i < stp->ntrans; i++) {
2105                     sym = dfa->syms + stp->trans[i].symbol;
2106                     if (sym->type ==_URE_EOL_ANCHOR) {
2107                         stp = dfa->states + stp->trans[i].next_state;
2108                         if (stp->accepting) {
2109                             me = sp - text;
2110                             found = 1;
2111                         } else
2112                           break;
2113                     }
2114                 }
2115             } else {
2116                 /*
2117                  * Make sure any conditions that match all the way to the end
2118                  * of the string match.
2119                  */
2120                 found = 1;
2121                 me = sp - text;
2122             }
2123         }
2124     }
2125 
2126     if (found == 0)
2127       ms = me = ~0;
2128 
2129     *match_start = ms;
2130     *match_end = me;
2131 
2132     return (ms != ~0UL) ? 1 : 0;
2133 }
2134