xref: /openbsd/gnu/usr.bin/perl/regcomp_trie.c (revision 5486feef)
1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4 
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_TRIE_C
9 #include "perl.h"
10 
11 #ifdef PERL_IN_XSUB_RE
12 #  include "re_comp.h"
13 #else
14 #  include "regcomp.h"
15 #endif
16 
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20 
21 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
22 #define TRIE_LIST_CUR(state)  ( TRIE_LIST_ITEM( state, 0 ).forid )
23 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
24 #define TRIE_LIST_USED(idx)  ( trie->states[state].trans.list         \
25                                ? (TRIE_LIST_CUR( idx ) - 1)           \
26                                : 0 )
27 
28 
29 #ifdef DEBUGGING
30 /*
31    dump_trie(trie,widecharmap,revcharmap)
32    dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
33    dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
34 
35    These routines dump out a trie in a somewhat readable format.
36    The _interim_ variants are used for debugging the interim
37    tables that are used to generate the final compressed
38    representation which is what dump_trie expects.
39 
40    Part of the reason for their existence is to provide a form
41    of documentation as to how the different representations function.
42 
43 */
44 
45 /*
46   Dumps the final compressed table form of the trie to Perl_debug_log.
47   Used for debugging make_trie().
48 */
49 
50 STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 depth)51 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
52             AV *revcharmap, U32 depth)
53 {
54     U32 state;
55     SV *sv=sv_newmortal();
56     int colwidth= widecharmap ? 6 : 4;
57     U16 word;
58     DECLARE_AND_GET_RE_DEBUG_FLAGS;
59 
60     PERL_ARGS_ASSERT_DUMP_TRIE;
61 
62     Perl_re_indentf( aTHX_  "Char : %-6s%-6s%-4s ",
63         depth+1, "Match","Base","Ofs" );
64 
65     for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
66         SV ** const tmp = av_fetch_simple( revcharmap, state, 0);
67         if ( tmp ) {
68             Perl_re_printf( aTHX_  "%*s",
69                 colwidth,
70                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
71                             PL_colors[0], PL_colors[1],
72                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
73                             PERL_PV_ESCAPE_FIRSTCHAR
74                 )
75             );
76         }
77     }
78     Perl_re_printf( aTHX_  "\n");
79     Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
80 
81     for( state = 0 ; state < trie->uniquecharcount ; state++ )
82         Perl_re_printf( aTHX_  "%.*s", colwidth, "--------");
83     Perl_re_printf( aTHX_  "\n");
84 
85     for( state = 1 ; state < trie->statecount ; state++ ) {
86         const U32 base = trie->states[ state ].trans.base;
87 
88         Perl_re_indentf( aTHX_  "#%4" UVXf "|", depth+1, (UV)state);
89 
90         if ( trie->states[ state ].wordnum ) {
91             Perl_re_printf( aTHX_  " W%4X", trie->states[ state ].wordnum );
92         } else {
93             Perl_re_printf( aTHX_  "%6s", "" );
94         }
95 
96         Perl_re_printf( aTHX_  " @%4" UVXf " ", (UV)base );
97 
98         if ( base ) {
99             U32 ofs = 0;
100 
101             while( ( base + ofs  < trie->uniquecharcount ) ||
102                    ( base + ofs - trie->uniquecharcount < trie->lasttrans
103                      && trie->trans[ base + ofs - trie->uniquecharcount ].check
104                                                                     != state))
105                     ofs++;
106 
107             Perl_re_printf( aTHX_  "+%2" UVXf "[ ", (UV)ofs);
108 
109             for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
110                 if ( ( base + ofs >= trie->uniquecharcount )
111                         && ( base + ofs - trie->uniquecharcount
112                                                         < trie->lasttrans )
113                         && trie->trans[ base + ofs
114                                     - trie->uniquecharcount ].check == state )
115                 {
116                    Perl_re_printf( aTHX_  "%*" UVXf, colwidth,
117                     (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
118                    );
119                 } else {
120                     Perl_re_printf( aTHX_  "%*s", colwidth,"   ." );
121                 }
122             }
123 
124             Perl_re_printf( aTHX_  "]");
125 
126         }
127         Perl_re_printf( aTHX_  "\n" );
128     }
129     Perl_re_indentf( aTHX_  "word_info N:(prev,len)=",
130                                 depth);
131     for (word=1; word <= trie->wordcount; word++) {
132         Perl_re_printf( aTHX_  " %d:(%d,%d)",
133             (int)word, (int)(trie->wordinfo[word].prev),
134             (int)(trie->wordinfo[word].len));
135     }
136     Perl_re_printf( aTHX_  "\n" );
137 }
138 /*
139   Dumps a fully constructed but uncompressed trie in list form.
140   List tries normally only are used for construction when the number of
141   possible chars (trie->uniquecharcount) is very high.
142   Used for debugging make_trie().
143 */
144 STATIC void
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)145 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
146                          HV *widecharmap, AV *revcharmap, U32 next_alloc,
147                          U32 depth)
148 {
149     U32 state;
150     SV *sv=sv_newmortal();
151     int colwidth= widecharmap ? 6 : 4;
152     DECLARE_AND_GET_RE_DEBUG_FLAGS;
153 
154     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
155 
156     /* print out the table precompression.  */
157     Perl_re_indentf( aTHX_  "State :Word | Transition Data\n",
158             depth+1 );
159     Perl_re_indentf( aTHX_  "%s",
160             depth+1, "------:-----+-----------------\n" );
161 
162     for( state=1 ; state < next_alloc ; state ++ ) {
163         U16 charid;
164 
165         Perl_re_indentf( aTHX_  " %4" UVXf " :",
166             depth+1, (UV)state  );
167         if ( ! trie->states[ state ].wordnum ) {
168             Perl_re_printf( aTHX_  "%5s| ","");
169         } else {
170             Perl_re_printf( aTHX_  "W%4x| ",
171                 trie->states[ state ].wordnum
172             );
173         }
174         for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
175             SV ** const tmp = av_fetch_simple( revcharmap,
176                                         TRIE_LIST_ITEM(state, charid).forid, 0);
177             if ( tmp ) {
178                 Perl_re_printf( aTHX_  "%*s:%3X=%4" UVXf " | ",
179                     colwidth,
180                     pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
181                               colwidth,
182                               PL_colors[0], PL_colors[1],
183                               (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
184                               | PERL_PV_ESCAPE_FIRSTCHAR
185                     ) ,
186                     TRIE_LIST_ITEM(state, charid).forid,
187                     (UV)TRIE_LIST_ITEM(state, charid).newstate
188                 );
189                 if (!(charid % 10))
190                     Perl_re_printf( aTHX_  "\n%*s| ",
191                         (int)((depth * 2) + 14), "");
192             }
193         }
194         Perl_re_printf( aTHX_  "\n");
195     }
196 }
197 
198 /*
199   Dumps a fully constructed but uncompressed trie in table form.
200   This is the normal DFA style state transition table, with a few
201   twists to facilitate compression later.
202   Used for debugging make_trie().
203 */
204 STATIC void
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
206                           HV *widecharmap, AV *revcharmap, U32 next_alloc,
207                           U32 depth)
208 {
209     U32 state;
210     U16 charid;
211     SV *sv=sv_newmortal();
212     int colwidth= widecharmap ? 6 : 4;
213     DECLARE_AND_GET_RE_DEBUG_FLAGS;
214 
215     PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
216 
217     /*
218        print out the table precompression so that we can do a visual check
219        that they are identical.
220      */
221 
222     Perl_re_indentf( aTHX_  "Char : ", depth+1 );
223 
224     for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
225         SV ** const tmp = av_fetch_simple( revcharmap, charid, 0);
226         if ( tmp ) {
227             Perl_re_printf( aTHX_  "%*s",
228                 colwidth,
229                 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
230                             PL_colors[0], PL_colors[1],
231                             (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
232                             PERL_PV_ESCAPE_FIRSTCHAR
233                 )
234             );
235         }
236     }
237 
238     Perl_re_printf( aTHX_ "\n");
239     Perl_re_indentf( aTHX_  "State+-", depth+1 );
240 
241     for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
242         Perl_re_printf( aTHX_  "%.*s", colwidth,"--------");
243     }
244 
245     Perl_re_printf( aTHX_  "\n" );
246 
247     for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
248 
249         Perl_re_indentf( aTHX_  "%4" UVXf " : ",
250             depth+1,
251             (UV)TRIE_NODENUM( state ) );
252 
253         for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
254             UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
255             if (v)
256                 Perl_re_printf( aTHX_  "%*" UVXf, colwidth, v );
257             else
258                 Perl_re_printf( aTHX_  "%*s", colwidth, "." );
259         }
260         if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
261             Perl_re_printf( aTHX_  " (%4" UVXf ")\n",
262                                             (UV)trie->trans[ state ].check );
263         } else {
264             Perl_re_printf( aTHX_  " (%4" UVXf ") W%4X\n",
265                                             (UV)trie->trans[ state ].check,
266             trie->states[ TRIE_NODENUM( state ) ].wordnum );
267         }
268     }
269 }
270 
271 #endif
272 
273 
274 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
275   startbranch: the first branch in the whole branch sequence
276   first      : start branch of sequence of branch-exact nodes.
277                May be the same as startbranch
278   last       : Thing following the last branch.
279                May be the same as tail.
280   tail       : item following the branch sequence
281   count      : words in the sequence
282   flags      : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
283   depth      : indent depth
284 
285 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
286 
287 A trie is an N'ary tree where the branches are determined by digital
288 decomposition of the key. IE, at the root node you look up the 1st character and
289 follow that branch repeat until you find the end of the branches. Nodes can be
290 marked as "accepting" meaning they represent a complete word. Eg:
291 
292   /he|she|his|hers/
293 
294 would convert into the following structure. Numbers represent states, letters
295 following numbers represent valid transitions on the letter from that state, if
296 the number is in square brackets it represents an accepting state, otherwise it
297 will be in parenthesis.
298 
299       +-h->+-e->[3]-+-r->(8)-+-s->[9]
300       |    |
301       |   (2)
302       |    |
303      (1)   +-i->(6)-+-s->[7]
304       |
305       +-s->(3)-+-h->(4)-+-e->[5]
306 
307       Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
308 
309 This shows that when matching against the string 'hers' we will begin at state 1
310 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
311 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
312 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
313 single traverse. We store a mapping from accepting to state to which word was
314 matched, and then when we have multiple possibilities we try to complete the
315 rest of the regex in the order in which they occurred in the alternation.
316 
317 The only prior NFA like behaviour that would be changed by the TRIE support is
318 the silent ignoring of duplicate alternations which are of the form:
319 
320  / (DUPE|DUPE) X? (?{ ... }) Y /x
321 
322 Thus EVAL blocks following a trie may be called a different number of times with
323 and without the optimisation. With the optimisations dupes will be silently
324 ignored. This inconsistent behaviour of EVAL type nodes is well established as
325 the following demonstrates:
326 
327  'words'=~/(word|word|word)(?{ print $1 })[xyz]/
328 
329 which prints out 'word' three times, but
330 
331  'words'=~/(word|word|word)(?{ print $1 })S/
332 
333 which doesnt print it out at all. This is due to other optimisations kicking in.
334 
335 Example of what happens on a structural level:
336 
337 The regexp /(ac|ad|ab)+/ will produce the following debug output:
338 
339    1: CURLYM[1] {1,32767}(18)
340    5:   BRANCH(8)
341    6:     EXACT <ac>(16)
342    8:   BRANCH(11)
343    9:     EXACT <ad>(16)
344   11:   BRANCH(14)
345   12:     EXACT <ab>(16)
346   16:   SUCCEED(0)
347   17:   NOTHING(18)
348   18: END(0)
349 
350 This would be optimizable with startbranch=5, first=5, last=16, tail=16
351 and should turn into:
352 
353    1: CURLYM[1] {1,32767}(18)
354    5:   TRIE(16)
355         [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
356           <ac>
357           <ad>
358           <ab>
359   16:   SUCCEED(0)
360   17:   NOTHING(18)
361   18: END(0)
362 
363 Cases where tail != last would be like /(?foo|bar)baz/:
364 
365    1: BRANCH(4)
366    2:   EXACT <foo>(8)
367    4: BRANCH(7)
368    5:   EXACT <bar>(8)
369    7: TAIL(8)
370    8: EXACT <baz>(10)
371   10: END(0)
372 
373 which would be optimizable with startbranch=1, first=1, last=7, tail=8
374 and would end up looking like:
375 
376     1: TRIE(8)
377       [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
378         <foo>
379         <bar>
380    7: TAIL(8)
381    8: EXACT <baz>(10)
382   10: END(0)
383 
384     d = uvchr_to_utf8_flags(d, uv, 0);
385 
386 is the recommended Unicode-aware way of saying
387 
388     *(d++) = uv;
389 */
390 
391 #define TRIE_STORE_REVCHAR(val)                                            \
392     STMT_START {                                                           \
393         if (UTF) {                                                         \
394             SV *zlopp = newSV(UTF8_MAXBYTES);                              \
395             unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
396             unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val);     \
397             *kapow = '\0';                                                 \
398             SvCUR_set(zlopp, kapow - flrbbbbb);                            \
399             SvPOK_on(zlopp);                                               \
400             SvUTF8_on(zlopp);                                              \
401             av_push_simple(revcharmap, zlopp);                                     \
402         } else {                                                           \
403             char ooooff = (char)val;                                           \
404             av_push_simple(revcharmap, newSVpvn(&ooooff, 1));                      \
405         }                                                                  \
406         } STMT_END
407 
408 /* This gets the next character from the input, folding it if not already
409  * folded. */
410 #define TRIE_READ_CHAR STMT_START {                                           \
411     wordlen++;                                                                \
412     if ( UTF ) {                                                              \
413         /* if it is UTF then it is either already folded, or does not need    \
414          * folding */                                                         \
415         uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
416     }                                                                         \
417     else if (folder == PL_fold_latin1) {                                      \
418         /* This folder implies Unicode rules, which in the range expressible  \
419          *  by not UTF is the lower case, with the two exceptions, one of     \
420          *  which should have been taken care of before calling this */       \
421         assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
422         uvc = toLOWER_L1(*uc);                                                \
423         if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
424         len = 1;                                                              \
425     } else {                                                                  \
426         /* raw data, will be folded later if needed */                        \
427         uvc = (U32)*uc;                                                       \
428         len = 1;                                                              \
429     }                                                                         \
430 } STMT_END
431 
432 
433 
434 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START {               \
435     if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) {    \
436         U32 ging = TRIE_LIST_LEN( state ) * 2;                  \
437         Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
438         TRIE_LIST_LEN( state ) = ging;                          \
439     }                                                           \
440     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid;     \
441     TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns;   \
442     TRIE_LIST_CUR( state )++;                                   \
443 } STMT_END
444 
445 #define TRIE_LIST_NEW(state) STMT_START {                       \
446     Newx( trie->states[ state ].trans.list,                     \
447         4, reg_trie_trans_le );                                 \
448      TRIE_LIST_CUR( state ) = 1;                                \
449      TRIE_LIST_LEN( state ) = 4;                                \
450 } STMT_END
451 
452 #define TRIE_HANDLE_WORD(state) STMT_START {                    \
453     U16 dupe= trie->states[ state ].wordnum;                    \
454     regnode * const noper_next = regnext( noper );              \
455                                                                 \
456     DEBUG_r({                                                   \
457         /* store the word for dumping */                        \
458         SV* tmp;                                                \
459         if (OP(noper) != NOTHING)                               \
460             tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF);    \
461         else                                                    \
462             tmp = newSVpvn_utf8( "", 0, UTF );                  \
463         av_push_simple( trie_words, tmp );                             \
464     });                                                         \
465                                                                 \
466     curword++;                                                  \
467     trie->wordinfo[curword].prev   = 0;                         \
468     trie->wordinfo[curword].len    = wordlen;                   \
469     trie->wordinfo[curword].accept = state;                     \
470                                                                 \
471     if ( noper_next < tail ) {                                  \
472         if (!trie->jump) {                                      \
473             trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
474                                                  sizeof(U16) ); \
475             trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
476                                                  sizeof(U16) ); \
477             trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
478                                                  sizeof(U16) ); \
479         }                                                       \
480         trie->jump[curword] = (U16)(noper_next - convert);      \
481         U16 set_before_paren;                                   \
482         U16 set_after_paren;                                    \
483         if (OP(cur) == BRANCH) {                                \
484             set_before_paren = ARG1a(cur);                       \
485             set_after_paren = ARG1b(cur);                        \
486         } else {                                                \
487             set_before_paren = ARG2a(cur);                     \
488             set_after_paren = ARG2b(cur);                      \
489         }                                                       \
490         trie->j_before_paren[curword] = set_before_paren;       \
491         trie->j_after_paren[curword] = set_after_paren;         \
492         if (!jumper)                                            \
493             jumper = noper_next;                                \
494         if (!nextbranch)                                        \
495             nextbranch= regnext(cur);                           \
496     }                                                           \
497                                                                 \
498     if ( dupe ) {                                               \
499         /* It's a dupe. Pre-insert into the wordinfo[].prev   */\
500         /* chain, so that when the bits of chain are later    */\
501         /* linked together, the dups appear in the chain      */\
502         trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
503         trie->wordinfo[dupe].prev = curword;                    \
504     } else {                                                    \
505         /* we haven't inserted this word yet.                */ \
506         trie->states[ state ].wordnum = curword;                \
507     }                                                           \
508 } STMT_END
509 
510 
511 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special)          \
512      ( ( base + charid >=  ucharcount                                   \
513          && base + charid < ubound                                      \
514          && state == trie->trans[ base - ucharcount + charid ].check    \
515          && trie->trans[ base - ucharcount + charid ].next )            \
516            ? trie->trans[ base - ucharcount + charid ].next             \
517            : ( state==1 ? special : 0 )                                 \
518       )
519 
520 /* Helper function for make_trie(): set a trie bit for both the character
521  * and its folded variant, and for the first byte of a variant codepoint,
522  * if any */
523 
524 STATIC void
S_trie_bitmap_set_folded(pTHX_ RExC_state_t * pRExC_state,reg_trie_data * trie,U8 ch,const U8 * folder)525 S_trie_bitmap_set_folded(pTHX_ RExC_state_t *pRExC_state,
526     reg_trie_data *trie, U8 ch, const U8 * folder)
527 {
528     TRIE_BITMAP_SET(trie, ch);
529     /* store the folded codepoint */
530     if ( folder )
531         TRIE_BITMAP_SET(trie, folder[ch]);
532 
533     if ( !UTF ) {
534         /* store first byte of utf8 representation of */
535         /* variant codepoints */
536         if (! UVCHR_IS_INVARIANT(ch)) {
537             U8 hi = UTF8_TWO_BYTE_HI(ch);
538             /* Note that hi will be either 0xc2 or 0xc3 (apart from EBCDIC
539              * systems), and TRIE_BITMAP_SET() will do >>3 to get the byte
540              * offset within the bit table, which is constant, and
541              * Coverity complained about this (CID 488118). */
542             TRIE_BITMAP_SET(trie, hi);
543         }
544     }
545 }
546 
547 
548 I32
Perl_make_trie(pTHX_ RExC_state_t * pRExC_state,regnode * startbranch,regnode * first,regnode * last,regnode * tail,U32 word_count,U32 flags,U32 depth)549 Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
550                   regnode *first, regnode *last, regnode *tail,
551                   U32 word_count, U32 flags, U32 depth)
552 {
553     /* first pass, loop through and scan words */
554     reg_trie_data *trie;
555     HV *widecharmap = NULL;
556     AV *revcharmap = newAV();
557     regnode *cur;
558     STRLEN len = 0;
559     UV uvc = 0;
560     U16 curword = 0;
561     U32 next_alloc = 0;
562     regnode *jumper = NULL;
563     regnode *nextbranch = NULL;
564     regnode *lastbranch = NULL;
565     regnode *convert = NULL;
566     U32 *prev_states; /* temp array mapping each state to previous one */
567     /* we just use folder as a flag in utf8 */
568     const U8 * folder = NULL;
569 
570     /* in the below reg_add_data call we are storing either 'tu' or 'tuaa'
571      * which stands for one trie structure, one hash, optionally followed
572      * by two arrays */
573 #ifdef DEBUGGING
574     const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa"));
575     AV *trie_words = NULL;
576     /* along with revcharmap, this only used during construction but both are
577      * useful during debugging so we store them in the struct when debugging.
578      */
579 #else
580     const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu"));
581     STRLEN trie_charcount=0;
582 #endif
583     SV *re_trie_maxbuff;
584     DECLARE_AND_GET_RE_DEBUG_FLAGS;
585 
586     PERL_ARGS_ASSERT_MAKE_TRIE;
587 #ifndef DEBUGGING
588     PERL_UNUSED_ARG(depth);
589 #endif
590 
591     switch (flags) {
592         case EXACT: case EXACT_REQ8: case EXACTL: break;
593         case EXACTFAA:
594         case EXACTFUP:
595         case EXACTFU:
596         case EXACTFLU8: folder = PL_fold_latin1; break;
597         case EXACTF:  folder = PL_fold; break;
598         default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
599     }
600 
601     /* create the trie struct, all zeroed */
602     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
603     trie->refcount = 1;
604     trie->startstate = 1;
605     trie->wordcount = word_count;
606     RExC_rxi->data->data[ data_slot ] = (void*)trie;
607     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
608     if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
609         trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
610     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
611                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
612 
613     DEBUG_r({
614         trie_words = newAV();
615     });
616 
617     re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
618     assert(re_trie_maxbuff);
619     if (!SvIOK(re_trie_maxbuff)) {
620         sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
621     }
622     DEBUG_TRIE_COMPILE_r({
623         Perl_re_indentf( aTHX_
624           "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
625           depth+1,
626           REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
627           REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
628     });
629 
630    /* Find the node we are going to overwrite */
631     if ( first == startbranch && OP( last ) != BRANCH ) {
632         /* whole branch chain */
633         convert = first;
634     } else {
635         /* branch sub-chain */
636         convert = REGNODE_AFTER( first );
637     }
638 
639     /*  -- First loop and Setup --
640 
641        We first traverse the branches and scan each word to determine if it
642        contains widechars, and how many unique chars there are, this is
643        important as we have to build a table with at least as many columns as we
644        have unique chars.
645 
646        We use an array of integers to represent the character codes 0..255
647        (trie->charmap) and we use a an HV* to store Unicode characters. We use
648        the native representation of the character value as the key and IV's for
649        the coded index.
650 
651        *TODO* If we keep track of how many times each character is used we can
652        remap the columns so that the table compression later on is more
653        efficient in terms of memory by ensuring the most common value is in the
654        middle and the least common are on the outside.  IMO this would be better
655        than a most to least common mapping as theres a decent chance the most
656        common letter will share a node with the least common, meaning the node
657        will not be compressible. With a middle is most common approach the worst
658        case is when we have the least common nodes twice.
659 
660      */
661 
662     for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
663         regnode *noper = REGNODE_AFTER( cur );
664         const U8 *uc;
665         const U8 *e;
666         int foldlen = 0;
667         U32 wordlen      = 0;         /* required init */
668         STRLEN minchars = 0;
669         STRLEN maxchars = 0;
670         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
671                                                bitmap?*/
672 
673         /* wordlen is needed for the TRIE_READ_CHAR() macro, but we don't use its
674            value in this scope, we only modify it.  clang 17 warns about this.
675            The later definitions of wordlen in this function do have their values
676            used.
677         */
678         PERL_UNUSED_VAR(wordlen);
679 
680         lastbranch = cur;
681 
682         if (OP(noper) == NOTHING) {
683             /* skip past a NOTHING at the start of an alternation
684              * eg, /(?:)a|(?:b)/ should be the same as /a|b/
685              *
686              * If the next node is not something we are supposed to process
687              * we will just ignore it due to the condition guarding the
688              * next block.
689              */
690 
691             regnode *noper_next= regnext(noper);
692             if (noper_next < tail)
693                 noper= noper_next;
694         }
695 
696         if (    noper < tail
697             && (    OP(noper) == flags
698                 || (flags == EXACT && OP(noper) == EXACT_REQ8)
699                 || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
700                                          || OP(noper) == EXACTFUP))))
701         {
702             uc= (U8*)STRING(noper);
703             e= uc + STR_LEN(noper);
704         } else {
705             trie->minlen= 0;
706             continue;
707         }
708 
709 
710         if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
711             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
712                                           regardless of encoding */
713             if (OP( noper ) == EXACTFUP) {
714                 /* false positives are ok, so just set this */
715                 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
716             }
717         }
718 
719         for ( ; uc < e ; uc += len ) {  /* Look at each char in the current
720                                            branch */
721             TRIE_CHARCOUNT(trie)++;
722             TRIE_READ_CHAR;
723 
724             /* TRIE_READ_CHAR returns the current character, or its fold if /i
725              * is in effect.  Under /i, this character can match itself, or
726              * anything that folds to it.  If not under /i, it can match just
727              * itself.  Most folds are 1-1, for example k, K, and KELVIN SIGN
728              * all fold to k, and all are single characters.   But some folds
729              * expand to more than one character, so for example LATIN SMALL
730              * LIGATURE FFI folds to the three character sequence 'ffi'.  If
731              * the string beginning at 'uc' is 'ffi', it could be matched by
732              * three characters, or just by the one ligature character. (It
733              * could also be matched by two characters: LATIN SMALL LIGATURE FF
734              * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
735              * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
736              * match.)  The trie needs to know the minimum and maximum number
737              * of characters that could match so that it can use size alone to
738              * quickly reject many match attempts.  The max is simple: it is
739              * the number of folded characters in this branch (since a fold is
740              * never shorter than what folds to it. */
741 
742             maxchars++;
743 
744             /* And the min is equal to the max if not under /i (indicated by
745              * 'folder' being NULL), or there are no multi-character folds.  If
746              * there is a multi-character fold, the min is incremented just
747              * once, for the character that folds to the sequence.  Each
748              * character in the sequence needs to be added to the list below of
749              * characters in the trie, but we count only the first towards the
750              * min number of characters needed.  This is done through the
751              * variable 'foldlen', which is returned by the macros that look
752              * for these sequences as the number of bytes the sequence
753              * occupies.  Each time through the loop, we decrement 'foldlen' by
754              * how many bytes the current char occupies.  Only when it reaches
755              * 0 do we increment 'minchars' or look for another multi-character
756              * sequence. */
757             if (folder == NULL) {
758                 minchars++;
759             }
760             else if (foldlen > 0) {
761                 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
762             }
763             else {
764                 minchars++;
765 
766                 /* See if *uc is the beginning of a multi-character fold.  If
767                  * so, we decrement the length remaining to look at, to account
768                  * for the current character this iteration.  (We can use 'uc'
769                  * instead of the fold returned by TRIE_READ_CHAR because the
770                  * macro is smart enough to account for any unfolded
771                  * characters. */
772                 if (UTF) {
773                     if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
774                         foldlen -= UTF8SKIP(uc);
775                     }
776                 }
777                 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
778                     foldlen--;
779                 }
780             }
781 
782             /* The current character (and any potential folds) should be added
783              * to the possible matching characters for this position in this
784              * branch */
785             if ( uvc < 256 ) {
786                 if ( folder ) {
787                     U8 folded= folder[ (U8) uvc ];
788                     if ( !trie->charmap[ folded ] ) {
789                         trie->charmap[ folded ]=( ++trie->uniquecharcount );
790                         TRIE_STORE_REVCHAR( folded );
791                     }
792                 }
793                 if ( !trie->charmap[ uvc ] ) {
794                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
795                     TRIE_STORE_REVCHAR( uvc );
796                 }
797                 if ( set_bit ) {
798                     /* store the codepoint in the bitmap, and its folded
799                      * equivalent. */
800                     S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, uvc, folder);
801                     set_bit = 0; /* We've done our bit :-) */
802                 }
803             } else {
804 
805                 /* XXX We could come up with the list of code points that fold
806                  * to this using PL_utf8_foldclosures, except not for
807                  * multi-char folds, as there may be multiple combinations
808                  * there that could work, which needs to wait until runtime to
809                  * resolve (The comment about LIGATURE FFI above is such an
810                  * example */
811 
812                 SV** svpp;
813                 if ( !widecharmap )
814                     widecharmap = newHV();
815 
816                 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
817 
818                 if ( !svpp )
819                     Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
820 
821                 if ( !SvTRUE( *svpp ) ) {
822                     sv_setiv( *svpp, ++trie->uniquecharcount );
823                     TRIE_STORE_REVCHAR(uvc);
824                 }
825             }
826         } /* end loop through characters in this branch of the trie */
827 
828         /* We take the min and max for this branch and combine to find the min
829          * and max for all branches processed so far */
830         if( cur == first ) {
831             trie->minlen = minchars;
832             trie->maxlen = maxchars;
833         } else if (minchars < trie->minlen) {
834             trie->minlen = minchars;
835         } else if (maxchars > trie->maxlen) {
836             trie->maxlen = maxchars;
837         }
838     } /* end first pass */
839     trie->before_paren = OP(first) == BRANCH
840                  ? ARG1a(first)
841                  : ARG2a(first); /* BRANCHJ */
842 
843     trie->after_paren = OP(lastbranch) == BRANCH
844                  ? ARG1b(lastbranch)
845                  : ARG2b(lastbranch); /* BRANCHJ */
846     DEBUG_TRIE_COMPILE_r(
847         Perl_re_indentf( aTHX_
848                 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
849                 depth+1,
850                 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
851                 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
852                 (int)trie->minlen, (int)trie->maxlen )
853     );
854 
855     /*
856         We now know what we are dealing with in terms of unique chars and
857         string sizes so we can calculate how much memory a naive
858         representation using a flat table  will take. If it's over a reasonable
859         limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
860         conservative but potentially much slower representation using an array
861         of lists.
862 
863         At the end we convert both representations into the same compressed
864         form that will be used in regexec.c for matching with. The latter
865         is a form that cannot be used to construct with but has memory
866         properties similar to the list form and access properties similar
867         to the table form making it both suitable for fast searches and
868         small enough that its feasable to store for the duration of a program.
869 
870         See the comment in the code where the compressed table is produced
871         inplace from the flat tabe representation for an explanation of how
872         the compression works.
873 
874     */
875 
876 
877     Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
878     prev_states[1] = 0;
879 
880     if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
881                                                     > SvIV(re_trie_maxbuff) )
882     {
883         /*
884             Second Pass -- Array Of Lists Representation
885 
886             Each state will be represented by a list of charid:state records
887             (reg_trie_trans_le) the first such element holds the CUR and LEN
888             points of the allocated array. (See defines above).
889 
890             We build the initial structure using the lists, and then convert
891             it into the compressed table form which allows faster lookups
892             (but cant be modified once converted).
893         */
894 
895         STRLEN transcount = 1;
896 
897         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using list compiler\n",
898             depth+1));
899 
900         trie->states = (reg_trie_state *)
901             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
902                                   sizeof(reg_trie_state) );
903         TRIE_LIST_NEW(1);
904         next_alloc = 2;
905 
906         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
907 
908             regnode *noper   = REGNODE_AFTER( cur );
909             U32 state        = 1;         /* required init */
910             U16 charid       = 0;         /* sanity init */
911             U32 wordlen      = 0;         /* required init */
912 
913             if (OP(noper) == NOTHING) {
914                 regnode *noper_next= regnext(noper);
915                 if (noper_next < tail)
916                     noper= noper_next;
917                 /* we will undo this assignment if noper does not
918                  * point at a trieable type in the else clause of
919                  * the following statement. */
920             }
921 
922             if (    noper < tail
923                 && (    OP(noper) == flags
924                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
925                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
926                                              || OP(noper) == EXACTFUP))))
927             {
928                 const U8 *uc= (U8*)STRING(noper);
929                 const U8 *e= uc + STR_LEN(noper);
930 
931                 for ( ; uc < e ; uc += len ) {
932 
933                     TRIE_READ_CHAR;
934 
935                     if ( uvc < 256 ) {
936                         charid = trie->charmap[ uvc ];
937                     } else {
938                         SV** const svpp = hv_fetch( widecharmap,
939                                                     (char*)&uvc,
940                                                     sizeof( UV ),
941                                                     0);
942                         if ( !svpp ) {
943                             charid = 0;
944                         } else {
945                             charid=(U16)SvIV( *svpp );
946                         }
947                     }
948                     /* charid is now 0 if we dont know the char read, or
949                      * nonzero if we do */
950                     if ( charid ) {
951 
952                         U16 check;
953                         U32 newstate = 0;
954 
955                         charid--;
956                         if ( !trie->states[ state ].trans.list ) {
957                             TRIE_LIST_NEW( state );
958                         }
959                         for ( check = 1;
960                               check <= TRIE_LIST_USED( state );
961                               check++ )
962                         {
963                             if ( TRIE_LIST_ITEM( state, check ).forid
964                                                                     == charid )
965                             {
966                                 newstate = TRIE_LIST_ITEM( state, check ).newstate;
967                                 break;
968                             }
969                         }
970                         if ( ! newstate ) {
971                             newstate = next_alloc++;
972                             prev_states[newstate] = state;
973                             TRIE_LIST_PUSH( state, charid, newstate );
974                             transcount++;
975                         }
976                         state = newstate;
977                     } else {
978                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
979                     }
980                 }
981             } else {
982                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
983                  * on a trieable type. So we need to reset noper back to point at the first regop
984                  * in the branch before we call TRIE_HANDLE_WORD()
985                 */
986                 noper= REGNODE_AFTER(cur);
987             }
988             TRIE_HANDLE_WORD(state);
989 
990         } /* end second pass */
991 
992         /* next alloc is the NEXT state to be allocated */
993         trie->statecount = next_alloc;
994         trie->states = (reg_trie_state *)
995             PerlMemShared_realloc( trie->states,
996                                    next_alloc
997                                    * sizeof(reg_trie_state) );
998 
999         /* and now dump it out before we compress it */
1000         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1001                                                          revcharmap, next_alloc,
1002                                                          depth+1)
1003         );
1004 
1005         trie->trans = (reg_trie_trans *)
1006             PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1007         {
1008             U32 state;
1009             U32 tp = 0;
1010             U32 zp = 0;
1011 
1012 
1013             for( state=1 ; state < next_alloc ; state ++ ) {
1014                 U32 base=0;
1015 
1016                 /*
1017                 DEBUG_TRIE_COMPILE_MORE_r(
1018                     Perl_re_printf( aTHX_  "tp: %d zp: %d ",tp,zp)
1019                 );
1020                 */
1021 
1022                 if (trie->states[state].trans.list) {
1023                     U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1024                     U16 maxid=minid;
1025                     U16 idx;
1026 
1027                     for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1028                         const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1029                         if ( forid < minid ) {
1030                             minid=forid;
1031                         } else if ( forid > maxid ) {
1032                             maxid=forid;
1033                         }
1034                     }
1035                     if ( transcount < tp + maxid - minid + 1) {
1036                         transcount *= 2;
1037                         trie->trans = (reg_trie_trans *)
1038                             PerlMemShared_realloc( trie->trans,
1039                                                      transcount
1040                                                      * sizeof(reg_trie_trans) );
1041                         Zero( trie->trans + (transcount / 2),
1042                               transcount / 2,
1043                               reg_trie_trans );
1044                     }
1045                     base = trie->uniquecharcount + tp - minid;
1046                     if ( maxid == minid ) {
1047                         U32 set = 0;
1048                         for ( ; zp < tp ; zp++ ) {
1049                             if ( ! trie->trans[ zp ].next ) {
1050                                 base = trie->uniquecharcount + zp - minid;
1051                                 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
1052                                                                    1).newstate;
1053                                 trie->trans[ zp ].check = state;
1054                                 set = 1;
1055                                 break;
1056                             }
1057                         }
1058                         if ( !set ) {
1059                             trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
1060                                                                    1).newstate;
1061                             trie->trans[ tp ].check = state;
1062                             tp++;
1063                             zp = tp;
1064                         }
1065                     } else {
1066                         for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1067                             const U32 tid = base
1068                                            - trie->uniquecharcount
1069                                            + TRIE_LIST_ITEM( state, idx ).forid;
1070                             trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
1071                                                                 idx ).newstate;
1072                             trie->trans[ tid ].check = state;
1073                         }
1074                         tp += ( maxid - minid + 1 );
1075                     }
1076                     Safefree(trie->states[ state ].trans.list);
1077                 }
1078                 /*
1079                 DEBUG_TRIE_COMPILE_MORE_r(
1080                     Perl_re_printf( aTHX_  " base: %d\n",base);
1081                 );
1082                 */
1083                 trie->states[ state ].trans.base=base;
1084             }
1085             trie->lasttrans = tp + 1;
1086         }
1087     } else {
1088         /*
1089            Second Pass -- Flat Table Representation.
1090 
1091            we dont use the 0 slot of either trans[] or states[] so we add 1 to
1092            each.  We know that we will need Charcount+1 trans at most to store
1093            the data (one row per char at worst case) So we preallocate both
1094            structures assuming worst case.
1095 
1096            We then construct the trie using only the .next slots of the entry
1097            structs.
1098 
1099            We use the .check field of the first entry of the node temporarily
1100            to make compression both faster and easier by keeping track of how
1101            many non zero fields are in the node.
1102 
1103            Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1104            transition.
1105 
1106            There are two terms at use here: state as a TRIE_NODEIDX() which is
1107            a number representing the first entry of the node, and state as a
1108            TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
1109            and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
1110            if there are 2 entrys per node. eg:
1111 
1112              A B       A B
1113           1. 2 4    1. 3 7
1114           2. 0 3    3. 0 5
1115           3. 0 0    5. 0 0
1116           4. 0 0    7. 0 0
1117 
1118            The table is internally in the right hand, idx form. However as we
1119            also have to deal with the states array which is indexed by nodenum
1120            we have to use TRIE_NODENUM() to convert.
1121 
1122         */
1123         DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_  "Compiling trie using table compiler\n",
1124             depth+1));
1125 
1126         trie->trans = (reg_trie_trans *)
1127             PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1128                                   * trie->uniquecharcount + 1,
1129                                   sizeof(reg_trie_trans) );
1130         trie->states = (reg_trie_state *)
1131             PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1132                                   sizeof(reg_trie_state) );
1133         next_alloc = trie->uniquecharcount + 1;
1134 
1135 
1136         for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1137 
1138             regnode *noper   = REGNODE_AFTER( cur );
1139 
1140             U32 state        = 1;         /* required init */
1141 
1142             U16 charid       = 0;         /* sanity init */
1143             U32 accept_state = 0;         /* sanity init */
1144 
1145             U32 wordlen      = 0;         /* required init */
1146 
1147             if (OP(noper) == NOTHING) {
1148                 regnode *noper_next= regnext(noper);
1149                 if (noper_next < tail)
1150                     noper= noper_next;
1151                 /* we will undo this assignment if noper does not
1152                  * point at a trieable type in the else clause of
1153                  * the following statement. */
1154             }
1155 
1156             if (    noper < tail
1157                 && (    OP(noper) == flags
1158                     || (flags == EXACT && OP(noper) == EXACT_REQ8)
1159                     || (flags == EXACTFU && (   OP(noper) == EXACTFU_REQ8
1160                                              || OP(noper) == EXACTFUP))))
1161             {
1162                 const U8 *uc= (U8*)STRING(noper);
1163                 const U8 *e= uc + STR_LEN(noper);
1164 
1165                 for ( ; uc < e ; uc += len ) {
1166 
1167                     TRIE_READ_CHAR;
1168 
1169                     if ( uvc < 256 ) {
1170                         charid = trie->charmap[ uvc ];
1171                     } else {
1172                         SV* const * const svpp = hv_fetch( widecharmap,
1173                                                            (char*)&uvc,
1174                                                            sizeof( UV ),
1175                                                            0);
1176                         charid = svpp ? (U16)SvIV(*svpp) : 0;
1177                     }
1178                     if ( charid ) {
1179                         charid--;
1180                         if ( !trie->trans[ state + charid ].next ) {
1181                             trie->trans[ state + charid ].next = next_alloc;
1182                             trie->trans[ state ].check++;
1183                             prev_states[TRIE_NODENUM(next_alloc)]
1184                                     = TRIE_NODENUM(state);
1185                             next_alloc += trie->uniquecharcount;
1186                         }
1187                         state = trie->trans[ state + charid ].next;
1188                     } else {
1189                         Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
1190                     }
1191                     /* charid is now 0 if we dont know the char read, or
1192                      * nonzero if we do */
1193                 }
1194             } else {
1195                 /* If we end up here it is because we skipped past a NOTHING, but did not end up
1196                  * on a trieable type. So we need to reset noper back to point at the first regop
1197                  * in the branch before we call TRIE_HANDLE_WORD().
1198                 */
1199                 noper= REGNODE_AFTER(cur);
1200             }
1201             accept_state = TRIE_NODENUM( state );
1202             TRIE_HANDLE_WORD(accept_state);
1203 
1204         } /* end second pass */
1205 
1206         /* and now dump it out before we compress it */
1207         DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1208                                                           revcharmap,
1209                                                           next_alloc, depth+1));
1210 
1211         {
1212         /*
1213            * Inplace compress the table.*
1214 
1215            For sparse data sets the table constructed by the trie algorithm will
1216            be mostly 0/FAIL transitions or to put it another way mostly empty.
1217            (Note that leaf nodes will not contain any transitions.)
1218 
1219            This algorithm compresses the tables by eliminating most such
1220            transitions, at the cost of a modest bit of extra work during lookup:
1221 
1222            - Each states[] entry contains a .base field which indicates the
1223            index in the state[] array wheres its transition data is stored.
1224 
1225            - If .base is 0 there are no valid transitions from that node.
1226 
1227            - If .base is nonzero then charid is added to it to find an entry in
1228            the trans array.
1229 
1230            -If trans[states[state].base+charid].check!=state then the
1231            transition is taken to be a 0/Fail transition. Thus if there are fail
1232            transitions at the front of the node then the .base offset will point
1233            somewhere inside the previous nodes data (or maybe even into a node
1234            even earlier), but the .check field determines if the transition is
1235            valid.
1236 
1237            XXX - wrong maybe?
1238            The following process inplace converts the table to the compressed
1239            table: We first do not compress the root node 1,and mark all its
1240            .check pointers as 1 and set its .base pointer as 1 as well. This
1241            allows us to do a DFA construction from the compressed table later,
1242            and ensures that any .base pointers we calculate later are greater
1243            than 0.
1244 
1245            - We set 'pos' to indicate the first entry of the second node.
1246 
1247            - We then iterate over the columns of the node, finding the first and
1248            last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1249            and set the .check pointers accordingly, and advance pos
1250            appropriately and repreat for the next node. Note that when we copy
1251            the next pointers we have to convert them from the original
1252            NODEIDX form to NODENUM form as the former is not valid post
1253            compression.
1254 
1255            - If a node has no transitions used we mark its base as 0 and do not
1256            advance the pos pointer.
1257 
1258            - If a node only has one transition we use a second pointer into the
1259            structure to fill in allocated fail transitions from other states.
1260            This pointer is independent of the main pointer and scans forward
1261            looking for null transitions that are allocated to a state. When it
1262            finds one it writes the single transition into the "hole".  If the
1263            pointer doesnt find one the single transition is appended as normal.
1264 
1265            - Once compressed we can Renew/realloc the structures to release the
1266            excess space.
1267 
1268            See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1269            specifically Fig 3.47 and the associated pseudocode.
1270 
1271            demq
1272         */
1273         const U32 laststate = TRIE_NODENUM( next_alloc );
1274         U32 state, charid;
1275         U32 pos = 0, zp=0;
1276         trie->statecount = laststate;
1277 
1278         for ( state = 1 ; state < laststate ; state++ ) {
1279             U8 flag = 0;
1280             const U32 stateidx = TRIE_NODEIDX( state );
1281             const U32 o_used = trie->trans[ stateidx ].check;
1282             U32 used = trie->trans[ stateidx ].check;
1283             trie->trans[ stateidx ].check = 0;
1284 
1285             for ( charid = 0;
1286                   used && charid < trie->uniquecharcount;
1287                   charid++ )
1288             {
1289                 if ( flag || trie->trans[ stateidx + charid ].next ) {
1290                     if ( trie->trans[ stateidx + charid ].next ) {
1291                         if (o_used == 1) {
1292                             for ( ; zp < pos ; zp++ ) {
1293                                 if ( ! trie->trans[ zp ].next ) {
1294                                     break;
1295                                 }
1296                             }
1297                             trie->states[ state ].trans.base
1298                                                     = zp
1299                                                       + trie->uniquecharcount
1300                                                       - charid ;
1301                             trie->trans[ zp ].next
1302                                 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
1303                                                              + charid ].next );
1304                             trie->trans[ zp ].check = state;
1305                             if ( ++zp > pos ) pos = zp;
1306                             break;
1307                         }
1308                         used--;
1309                     }
1310                     if ( !flag ) {
1311                         flag = 1;
1312                         trie->states[ state ].trans.base
1313                                        = pos + trie->uniquecharcount - charid ;
1314                     }
1315                     trie->trans[ pos ].next
1316                         = SAFE_TRIE_NODENUM(
1317                                        trie->trans[ stateidx + charid ].next );
1318                     trie->trans[ pos ].check = state;
1319                     pos++;
1320                 }
1321             }
1322         }
1323         trie->lasttrans = pos + 1;
1324         trie->states = (reg_trie_state *)
1325             PerlMemShared_realloc( trie->states, laststate
1326                                    * sizeof(reg_trie_state) );
1327         DEBUG_TRIE_COMPILE_MORE_r(
1328             Perl_re_indentf( aTHX_  "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
1329                 depth+1,
1330                 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
1331                        + 1 ),
1332                 (IV)next_alloc,
1333                 (IV)pos,
1334                 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1335             );
1336 
1337         } /* end table compress */
1338     }
1339     DEBUG_TRIE_COMPILE_MORE_r(
1340             Perl_re_indentf( aTHX_  "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
1341                 depth+1,
1342                 (UV)trie->statecount,
1343                 (UV)trie->lasttrans)
1344     );
1345     /* resize the trans array to remove unused space */
1346     trie->trans = (reg_trie_trans *)
1347         PerlMemShared_realloc( trie->trans, trie->lasttrans
1348                                * sizeof(reg_trie_trans) );
1349 
1350     {   /* Modify the program and insert the new TRIE node */
1351         U8 nodetype =(U8) flags;
1352         char *str=NULL;
1353 
1354 #ifdef DEBUGGING
1355         regnode *optimize = NULL;
1356 #endif /* DEBUGGING */
1357         /* make sure we have enough room to inject the TRIE op */
1358         assert((!trie->jump) || !trie->jump[1] ||
1359                 (trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode))));
1360         /*
1361            This means we convert either the first branch or the first Exact,
1362            depending on whether the thing following (in 'last') is a branch
1363            or not and whther first is the startbranch (ie is it a sub part of
1364            the alternation or is it the whole thing.)
1365            Assuming its a sub part we convert the EXACT otherwise we convert
1366            the whole branch sequence, including the first.
1367          */
1368         /* Find the node we are going to overwrite */
1369         if ( first != startbranch || OP( last ) == BRANCH ) {
1370             /* branch sub-chain */
1371             NEXT_OFF( first ) = (U16)(last - first);
1372             /* whole branch chain */
1373         }
1374         /* But first we check to see if there is a common prefix we can
1375            split out as an EXACT and put in front of the TRIE node.  */
1376         trie->startstate= 1;
1377         if ( trie->bitmap && !widecharmap && !trie->jump  ) {
1378             /* we want to find the first state that has more than
1379              * one transition, if that state is not the first state
1380              * then we have a common prefix which we can remove.
1381              */
1382             U32 state;
1383             for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1384                 U32 ofs = 0;
1385                 I32 first_ofs = -1; /* keeps track of the ofs of the first
1386                                        transition, -1 means none */
1387                 U32 count = 0;
1388                 const U32 base = trie->states[ state ].trans.base;
1389 
1390                 /* does this state terminate an alternation? */
1391                 if ( trie->states[state].wordnum )
1392                         count = 1;
1393 
1394                 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1395                     if ( ( base + ofs >= trie->uniquecharcount ) &&
1396                          ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1397                          trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1398                     {
1399                         if ( ++count > 1 ) {
1400                             /* we have more than one transition */
1401                             SV **tmp;
1402                             U8 *ch;
1403                             /* if this is the first state there is no common prefix
1404                              * to extract, so we can exit */
1405                             if ( state == 1 ) break;
1406                             tmp = av_fetch_simple( revcharmap, ofs, 0);
1407                             ch = (U8*)SvPV_nolen_const( *tmp );
1408 
1409                             /* if we are on count 2 then we need to initialize the
1410                              * bitmap, and store the previous char if there was one
1411                              * in it*/
1412                             if ( count == 2 ) {
1413                                 /* clear the bitmap */
1414                                 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1415                                 DEBUG_OPTIMISE_r(
1416                                     Perl_re_indentf( aTHX_  "New Start State=%" UVuf " Class: [",
1417                                         depth+1,
1418                                         (UV)state));
1419                                 if (first_ofs >= 0) {
1420                                     SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1421                                     const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1422 
1423                                     S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
1424                                     DEBUG_OPTIMISE_r(
1425                                         Perl_re_printf( aTHX_  "%s", (char*)ch)
1426                                     );
1427                                 }
1428                             }
1429                             /* store the current firstchar in the bitmap */
1430                             S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
1431                             DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
1432                         }
1433                         first_ofs = ofs;
1434                     }
1435                 }
1436                 if ( count == 1 ) {
1437                     /* This state has only one transition, its transition is part
1438                      * of a common prefix - we need to concatenate the char it
1439                      * represents to what we have so far. */
1440                     SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1441                     STRLEN len;
1442                     char *ch = SvPV( *tmp, len );
1443                     DEBUG_OPTIMISE_r({
1444                         SV *sv=sv_newmortal();
1445                         Perl_re_indentf( aTHX_  "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
1446                             depth+1,
1447                             (UV)state, (UV)first_ofs,
1448                             pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1449                                 PL_colors[0], PL_colors[1],
1450                                 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1451                                 PERL_PV_ESCAPE_FIRSTCHAR
1452                             )
1453                         );
1454                     });
1455                     if ( state==1 ) {
1456                         OP( convert ) = nodetype;
1457                         str=STRING(convert);
1458                         setSTR_LEN(convert, 0);
1459                     }
1460                     assert( ( STR_LEN(convert) + len ) < 256 );
1461                     setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
1462                     while (len--)
1463                         *str++ = *ch++;
1464                 } else {
1465 #ifdef DEBUGGING
1466                     if (state>1)
1467                         DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
1468 #endif
1469                     break;
1470                 }
1471             }
1472             trie->prefixlen = (state-1);
1473             if (str) {
1474                 regnode *n = REGNODE_AFTER(convert);
1475                 assert( n - convert <= U16_MAX );
1476                 NEXT_OFF(convert) = n - convert;
1477                 trie->startstate = state;
1478                 trie->minlen -= (state - 1);
1479                 trie->maxlen -= (state - 1);
1480 #ifdef DEBUGGING
1481                /* At least the UNICOS C compiler choked on this
1482                 * being argument to DEBUG_r(), so let's just have
1483                 * it right here. */
1484                if (
1485 #ifdef PERL_EXT_RE_BUILD
1486                    1
1487 #else
1488                    DEBUG_r_TEST
1489 #endif
1490                    ) {
1491                    U32 word = trie->wordcount;
1492                    while (word--) {
1493                        SV ** const tmp = av_fetch_simple( trie_words, word, 0 );
1494                        if (tmp) {
1495                            if ( STR_LEN(convert) <= SvCUR(*tmp) )
1496                                sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1497                            else
1498                                sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1499                        }
1500                    }
1501                }
1502 #endif
1503                 if (trie->maxlen) {
1504                     convert = n;
1505                 } else {
1506                     NEXT_OFF(convert) = (U16)(tail - convert);
1507                     DEBUG_r(optimize= n);
1508                 }
1509             }
1510         }
1511         if (!jumper)
1512             jumper = last;
1513         if ( trie->maxlen ) {
1514             NEXT_OFF( convert ) = (U16)(tail - convert);
1515             ARG1u_SET( convert, data_slot );
1516             /* Store the offset to the first unabsorbed branch in
1517                jump[0], which is otherwise unused by the jump logic.
1518                We use this when dumping a trie and during optimisation. */
1519             if (trie->jump)
1520                 trie->jump[0] = (U16)(nextbranch - convert);
1521 
1522             /* If the start state is not accepting (meaning there is no empty string/NOTHING)
1523              *   and there is a bitmap
1524              *   and the first "jump target" node we found leaves enough room
1525              * then convert the TRIE node into a TRIEC node, with the bitmap
1526              * embedded inline in the opcode - this is hypothetically faster.
1527              */
1528             if ( !trie->states[trie->startstate].wordnum
1529                  && trie->bitmap
1530                  && ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) )
1531             {
1532                 OP( convert ) = TRIEC;
1533                 Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1534                 PerlMemShared_free(trie->bitmap);
1535                 trie->bitmap= NULL;
1536             } else
1537                 OP( convert ) = TRIE;
1538 
1539             /* store the type in the flags */
1540             FLAGS(convert) = nodetype;
1541             DEBUG_r({
1542             optimize = convert
1543                       + NODE_STEP_REGNODE
1544                       + REGNODE_ARG_LEN( OP( convert ) );
1545             });
1546             /* XXX We really should free up the resource in trie now,
1547                    as we won't use them - (which resources?) dmq */
1548         }
1549         /* needed for dumping*/
1550         DEBUG_r(if (optimize) {
1551             /*
1552                 Try to clean up some of the debris left after the
1553                 optimisation.
1554              */
1555             while( optimize < jumper ) {
1556                 OP( optimize ) = OPTIMIZED;
1557                 optimize++;
1558             }
1559         });
1560     } /* end node insert */
1561 
1562     /*  Finish populating the prev field of the wordinfo array.  Walk back
1563      *  from each accept state until we find another accept state, and if
1564      *  so, point the first word's .prev field at the second word. If the
1565      *  second already has a .prev field set, stop now. This will be the
1566      *  case either if we've already processed that word's accept state,
1567      *  or that state had multiple words, and the overspill words were
1568      *  already linked up earlier.
1569      */
1570     {
1571         U16 word;
1572         U32 state;
1573         U16 prev;
1574 
1575         for (word=1; word <= trie->wordcount; word++) {
1576             prev = 0;
1577             if (trie->wordinfo[word].prev)
1578                 continue;
1579             state = trie->wordinfo[word].accept;
1580             while (state) {
1581                 state = prev_states[state];
1582                 if (!state)
1583                     break;
1584                 prev = trie->states[state].wordnum;
1585                 if (prev)
1586                     break;
1587             }
1588             trie->wordinfo[word].prev = prev;
1589         }
1590         Safefree(prev_states);
1591     }
1592 
1593 
1594     /* and now dump out the compressed format */
1595     DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1596 
1597     RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
1598 #ifdef DEBUGGING
1599     RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
1600     RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
1601 #else
1602     SvREFCNT_dec_NN(revcharmap);
1603 #endif
1604     return trie->jump
1605            ? MADE_JUMP_TRIE
1606            : trie->startstate>1
1607              ? MADE_EXACT_TRIE
1608              : MADE_TRIE;
1609 }
1610 
1611 regnode *
Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t * pRExC_state,regnode * source,U32 depth)1612 Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
1613 {
1614 /* The Trie is constructed and compressed now so we can build a fail array if
1615  * it's needed
1616 
1617    This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
1618    3.32 in the
1619    "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
1620    Ullman 1985/88
1621    ISBN 0-201-10088-6
1622 
1623    We find the fail state for each state in the trie, this state is the longest
1624    proper suffix of the current state's 'word' that is also a proper prefix of
1625    another word in our trie. State 1 represents the word '' and is thus the
1626    default fail state. This allows the DFA not to have to restart after its
1627    tried and failed a word at a given point, it simply continues as though it
1628    had been matching the other word in the first place.
1629    Consider
1630       'abcdgu'=~/abcdefg|cdgu/
1631    When we get to 'd' we are still matching the first word, we would encounter
1632    'g' which would fail, which would bring us to the state representing 'd' in
1633    the second word where we would try 'g' and succeed, proceeding to match
1634    'cdgu'.
1635  */
1636  /* add a fail transition */
1637     const U32 trie_offset = ARG1u(source);
1638     reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
1639     U32 *q;
1640     const U32 ucharcount = trie->uniquecharcount;
1641     const U32 numstates = trie->statecount;
1642     const U32 ubound = trie->lasttrans + ucharcount;
1643     U32 q_read = 0;
1644     U32 q_write = 0;
1645     U32 charid;
1646     U32 base = trie->states[ 1 ].trans.base;
1647     U32 *fail;
1648     reg_ac_data *aho;
1649     const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T"));
1650     regnode *stclass;
1651     DECLARE_AND_GET_RE_DEBUG_FLAGS;
1652 
1653     PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
1654     PERL_UNUSED_CONTEXT;
1655 #ifndef DEBUGGING
1656     PERL_UNUSED_ARG(depth);
1657 #endif
1658 
1659     if ( OP(source) == TRIE ) {
1660         tregnode_TRIE *op = (tregnode_TRIE *)
1661             PerlMemShared_calloc(1, sizeof(tregnode_TRIE));
1662         StructCopy(source, op, tregnode_TRIE);
1663         stclass = (regnode *)op;
1664     } else {
1665         tregnode_TRIEC *op = (tregnode_TRIEC *)
1666             PerlMemShared_calloc(1, sizeof(tregnode_TRIEC));
1667         StructCopy(source, op, tregnode_TRIEC);
1668         stclass = (regnode *)op;
1669     }
1670     OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
1671 
1672     ARG1u_SET( stclass, data_slot );
1673     aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
1674     RExC_rxi->data->data[ data_slot ] = (void*)aho;
1675     aho->trie=trie_offset;
1676     aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
1677     Copy( trie->states, aho->states, numstates, reg_trie_state );
1678     Newx( q, numstates, U32);
1679     aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
1680     aho->refcount = 1;
1681     fail = aho->fail;
1682     /* initialize fail[0..1] to be 1 so that we always have
1683        a valid final fail state */
1684     fail[ 0 ] = fail[ 1 ] = 1;
1685 
1686     for ( charid = 0; charid < ucharcount ; charid++ ) {
1687         const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1688         if ( newstate ) {
1689             q[ q_write ] = newstate;
1690             /* set to point at the root */
1691             fail[ q[ q_write++ ] ]=1;
1692         }
1693     }
1694     while ( q_read < q_write) {
1695         const U32 cur = q[ q_read++ % numstates ];
1696         base = trie->states[ cur ].trans.base;
1697 
1698         for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1699             const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1700             if (ch_state) {
1701                 U32 fail_state = cur;
1702                 U32 fail_base;
1703                 do {
1704                     fail_state = fail[ fail_state ];
1705                     fail_base = aho->states[ fail_state ].trans.base;
1706                 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1707 
1708                 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1709                 fail[ ch_state ] = fail_state;
1710                 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1711                 {
1712                         aho->states[ ch_state ].wordnum =  aho->states[ fail_state ].wordnum;
1713                 }
1714                 q[ q_write++ % numstates] = ch_state;
1715             }
1716         }
1717     }
1718     /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1719        when we fail in state 1, this allows us to use the
1720        charclass scan to find a valid start char. This is based on the principle
1721        that theres a good chance the string being searched contains lots of stuff
1722        that cant be a start char.
1723      */
1724     fail[ 0 ] = fail[ 1 ] = 0;
1725     DEBUG_TRIE_COMPILE_r({
1726         Perl_re_indentf( aTHX_  "Stclass Failtable (%" UVuf " states): 0",
1727                       depth, (UV)numstates
1728         );
1729         for( q_read=1; q_read<numstates; q_read++ ) {
1730             Perl_re_printf( aTHX_  ", %" UVuf, (UV)fail[q_read]);
1731         }
1732         Perl_re_printf( aTHX_  "\n");
1733     });
1734     Safefree(q);
1735     /*RExC_seen |= REG_TRIEDFA_SEEN;*/
1736     return stclass;
1737 }
1738