1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_TRIE_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
22 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
23 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
24 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
25 ? (TRIE_LIST_CUR( idx ) - 1) \
26 : 0 )
27
28
29 #ifdef DEBUGGING
30 /*
31 dump_trie(trie,widecharmap,revcharmap)
32 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
33 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
34
35 These routines dump out a trie in a somewhat readable format.
36 The _interim_ variants are used for debugging the interim
37 tables that are used to generate the final compressed
38 representation which is what dump_trie expects.
39
40 Part of the reason for their existence is to provide a form
41 of documentation as to how the different representations function.
42
43 */
44
45 /*
46 Dumps the final compressed table form of the trie to Perl_debug_log.
47 Used for debugging make_trie().
48 */
49
50 STATIC void
S_dump_trie(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 depth)51 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
52 AV *revcharmap, U32 depth)
53 {
54 U32 state;
55 SV *sv=sv_newmortal();
56 int colwidth= widecharmap ? 6 : 4;
57 U16 word;
58 DECLARE_AND_GET_RE_DEBUG_FLAGS;
59
60 PERL_ARGS_ASSERT_DUMP_TRIE;
61
62 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
63 depth+1, "Match","Base","Ofs" );
64
65 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
66 SV ** const tmp = av_fetch_simple( revcharmap, state, 0);
67 if ( tmp ) {
68 Perl_re_printf( aTHX_ "%*s",
69 colwidth,
70 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
71 PL_colors[0], PL_colors[1],
72 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
73 PERL_PV_ESCAPE_FIRSTCHAR
74 )
75 );
76 }
77 }
78 Perl_re_printf( aTHX_ "\n");
79 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
80
81 for( state = 0 ; state < trie->uniquecharcount ; state++ )
82 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
83 Perl_re_printf( aTHX_ "\n");
84
85 for( state = 1 ; state < trie->statecount ; state++ ) {
86 const U32 base = trie->states[ state ].trans.base;
87
88 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
89
90 if ( trie->states[ state ].wordnum ) {
91 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
92 } else {
93 Perl_re_printf( aTHX_ "%6s", "" );
94 }
95
96 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
97
98 if ( base ) {
99 U32 ofs = 0;
100
101 while( ( base + ofs < trie->uniquecharcount ) ||
102 ( base + ofs - trie->uniquecharcount < trie->lasttrans
103 && trie->trans[ base + ofs - trie->uniquecharcount ].check
104 != state))
105 ofs++;
106
107 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
108
109 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
110 if ( ( base + ofs >= trie->uniquecharcount )
111 && ( base + ofs - trie->uniquecharcount
112 < trie->lasttrans )
113 && trie->trans[ base + ofs
114 - trie->uniquecharcount ].check == state )
115 {
116 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
117 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
118 );
119 } else {
120 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
121 }
122 }
123
124 Perl_re_printf( aTHX_ "]");
125
126 }
127 Perl_re_printf( aTHX_ "\n" );
128 }
129 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
130 depth);
131 for (word=1; word <= trie->wordcount; word++) {
132 Perl_re_printf( aTHX_ " %d:(%d,%d)",
133 (int)word, (int)(trie->wordinfo[word].prev),
134 (int)(trie->wordinfo[word].len));
135 }
136 Perl_re_printf( aTHX_ "\n" );
137 }
138 /*
139 Dumps a fully constructed but uncompressed trie in list form.
140 List tries normally only are used for construction when the number of
141 possible chars (trie->uniquecharcount) is very high.
142 Used for debugging make_trie().
143 */
144 STATIC void
S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)145 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
146 HV *widecharmap, AV *revcharmap, U32 next_alloc,
147 U32 depth)
148 {
149 U32 state;
150 SV *sv=sv_newmortal();
151 int colwidth= widecharmap ? 6 : 4;
152 DECLARE_AND_GET_RE_DEBUG_FLAGS;
153
154 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
155
156 /* print out the table precompression. */
157 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
158 depth+1 );
159 Perl_re_indentf( aTHX_ "%s",
160 depth+1, "------:-----+-----------------\n" );
161
162 for( state=1 ; state < next_alloc ; state ++ ) {
163 U16 charid;
164
165 Perl_re_indentf( aTHX_ " %4" UVXf " :",
166 depth+1, (UV)state );
167 if ( ! trie->states[ state ].wordnum ) {
168 Perl_re_printf( aTHX_ "%5s| ","");
169 } else {
170 Perl_re_printf( aTHX_ "W%4x| ",
171 trie->states[ state ].wordnum
172 );
173 }
174 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
175 SV ** const tmp = av_fetch_simple( revcharmap,
176 TRIE_LIST_ITEM(state, charid).forid, 0);
177 if ( tmp ) {
178 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
179 colwidth,
180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
181 colwidth,
182 PL_colors[0], PL_colors[1],
183 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
184 | PERL_PV_ESCAPE_FIRSTCHAR
185 ) ,
186 TRIE_LIST_ITEM(state, charid).forid,
187 (UV)TRIE_LIST_ITEM(state, charid).newstate
188 );
189 if (!(charid % 10))
190 Perl_re_printf( aTHX_ "\n%*s| ",
191 (int)((depth * 2) + 14), "");
192 }
193 }
194 Perl_re_printf( aTHX_ "\n");
195 }
196 }
197
198 /*
199 Dumps a fully constructed but uncompressed trie in table form.
200 This is the normal DFA style state transition table, with a few
201 twists to facilitate compression later.
202 Used for debugging make_trie().
203 */
204 STATIC void
S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data * trie,HV * widecharmap,AV * revcharmap,U32 next_alloc,U32 depth)205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
206 HV *widecharmap, AV *revcharmap, U32 next_alloc,
207 U32 depth)
208 {
209 U32 state;
210 U16 charid;
211 SV *sv=sv_newmortal();
212 int colwidth= widecharmap ? 6 : 4;
213 DECLARE_AND_GET_RE_DEBUG_FLAGS;
214
215 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
216
217 /*
218 print out the table precompression so that we can do a visual check
219 that they are identical.
220 */
221
222 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
223
224 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
225 SV ** const tmp = av_fetch_simple( revcharmap, charid, 0);
226 if ( tmp ) {
227 Perl_re_printf( aTHX_ "%*s",
228 colwidth,
229 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
230 PL_colors[0], PL_colors[1],
231 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
232 PERL_PV_ESCAPE_FIRSTCHAR
233 )
234 );
235 }
236 }
237
238 Perl_re_printf( aTHX_ "\n");
239 Perl_re_indentf( aTHX_ "State+-", depth+1 );
240
241 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
242 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
243 }
244
245 Perl_re_printf( aTHX_ "\n" );
246
247 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
248
249 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
250 depth+1,
251 (UV)TRIE_NODENUM( state ) );
252
253 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
254 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
255 if (v)
256 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
257 else
258 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
259 }
260 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
261 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
262 (UV)trie->trans[ state ].check );
263 } else {
264 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
265 (UV)trie->trans[ state ].check,
266 trie->states[ TRIE_NODENUM( state ) ].wordnum );
267 }
268 }
269 }
270
271 #endif
272
273
274 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
275 startbranch: the first branch in the whole branch sequence
276 first : start branch of sequence of branch-exact nodes.
277 May be the same as startbranch
278 last : Thing following the last branch.
279 May be the same as tail.
280 tail : item following the branch sequence
281 count : words in the sequence
282 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
283 depth : indent depth
284
285 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
286
287 A trie is an N'ary tree where the branches are determined by digital
288 decomposition of the key. IE, at the root node you look up the 1st character and
289 follow that branch repeat until you find the end of the branches. Nodes can be
290 marked as "accepting" meaning they represent a complete word. Eg:
291
292 /he|she|his|hers/
293
294 would convert into the following structure. Numbers represent states, letters
295 following numbers represent valid transitions on the letter from that state, if
296 the number is in square brackets it represents an accepting state, otherwise it
297 will be in parenthesis.
298
299 +-h->+-e->[3]-+-r->(8)-+-s->[9]
300 | |
301 | (2)
302 | |
303 (1) +-i->(6)-+-s->[7]
304 |
305 +-s->(3)-+-h->(4)-+-e->[5]
306
307 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
308
309 This shows that when matching against the string 'hers' we will begin at state 1
310 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
311 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
312 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
313 single traverse. We store a mapping from accepting to state to which word was
314 matched, and then when we have multiple possibilities we try to complete the
315 rest of the regex in the order in which they occurred in the alternation.
316
317 The only prior NFA like behaviour that would be changed by the TRIE support is
318 the silent ignoring of duplicate alternations which are of the form:
319
320 / (DUPE|DUPE) X? (?{ ... }) Y /x
321
322 Thus EVAL blocks following a trie may be called a different number of times with
323 and without the optimisation. With the optimisations dupes will be silently
324 ignored. This inconsistent behaviour of EVAL type nodes is well established as
325 the following demonstrates:
326
327 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
328
329 which prints out 'word' three times, but
330
331 'words'=~/(word|word|word)(?{ print $1 })S/
332
333 which doesnt print it out at all. This is due to other optimisations kicking in.
334
335 Example of what happens on a structural level:
336
337 The regexp /(ac|ad|ab)+/ will produce the following debug output:
338
339 1: CURLYM[1] {1,32767}(18)
340 5: BRANCH(8)
341 6: EXACT <ac>(16)
342 8: BRANCH(11)
343 9: EXACT <ad>(16)
344 11: BRANCH(14)
345 12: EXACT <ab>(16)
346 16: SUCCEED(0)
347 17: NOTHING(18)
348 18: END(0)
349
350 This would be optimizable with startbranch=5, first=5, last=16, tail=16
351 and should turn into:
352
353 1: CURLYM[1] {1,32767}(18)
354 5: TRIE(16)
355 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
356 <ac>
357 <ad>
358 <ab>
359 16: SUCCEED(0)
360 17: NOTHING(18)
361 18: END(0)
362
363 Cases where tail != last would be like /(?foo|bar)baz/:
364
365 1: BRANCH(4)
366 2: EXACT <foo>(8)
367 4: BRANCH(7)
368 5: EXACT <bar>(8)
369 7: TAIL(8)
370 8: EXACT <baz>(10)
371 10: END(0)
372
373 which would be optimizable with startbranch=1, first=1, last=7, tail=8
374 and would end up looking like:
375
376 1: TRIE(8)
377 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
378 <foo>
379 <bar>
380 7: TAIL(8)
381 8: EXACT <baz>(10)
382 10: END(0)
383
384 d = uvchr_to_utf8_flags(d, uv, 0);
385
386 is the recommended Unicode-aware way of saying
387
388 *(d++) = uv;
389 */
390
391 #define TRIE_STORE_REVCHAR(val) \
392 STMT_START { \
393 if (UTF) { \
394 SV *zlopp = newSV(UTF8_MAXBYTES); \
395 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
396 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
397 *kapow = '\0'; \
398 SvCUR_set(zlopp, kapow - flrbbbbb); \
399 SvPOK_on(zlopp); \
400 SvUTF8_on(zlopp); \
401 av_push_simple(revcharmap, zlopp); \
402 } else { \
403 char ooooff = (char)val; \
404 av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \
405 } \
406 } STMT_END
407
408 /* This gets the next character from the input, folding it if not already
409 * folded. */
410 #define TRIE_READ_CHAR STMT_START { \
411 wordlen++; \
412 if ( UTF ) { \
413 /* if it is UTF then it is either already folded, or does not need \
414 * folding */ \
415 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
416 } \
417 else if (folder == PL_fold_latin1) { \
418 /* This folder implies Unicode rules, which in the range expressible \
419 * by not UTF is the lower case, with the two exceptions, one of \
420 * which should have been taken care of before calling this */ \
421 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
422 uvc = toLOWER_L1(*uc); \
423 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
424 len = 1; \
425 } else { \
426 /* raw data, will be folded later if needed */ \
427 uvc = (U32)*uc; \
428 len = 1; \
429 } \
430 } STMT_END
431
432
433
434 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
435 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
436 U32 ging = TRIE_LIST_LEN( state ) * 2; \
437 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
438 TRIE_LIST_LEN( state ) = ging; \
439 } \
440 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
441 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
442 TRIE_LIST_CUR( state )++; \
443 } STMT_END
444
445 #define TRIE_LIST_NEW(state) STMT_START { \
446 Newx( trie->states[ state ].trans.list, \
447 4, reg_trie_trans_le ); \
448 TRIE_LIST_CUR( state ) = 1; \
449 TRIE_LIST_LEN( state ) = 4; \
450 } STMT_END
451
452 #define TRIE_HANDLE_WORD(state) STMT_START { \
453 U16 dupe= trie->states[ state ].wordnum; \
454 regnode * const noper_next = regnext( noper ); \
455 \
456 DEBUG_r({ \
457 /* store the word for dumping */ \
458 SV* tmp; \
459 if (OP(noper) != NOTHING) \
460 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
461 else \
462 tmp = newSVpvn_utf8( "", 0, UTF ); \
463 av_push_simple( trie_words, tmp ); \
464 }); \
465 \
466 curword++; \
467 trie->wordinfo[curword].prev = 0; \
468 trie->wordinfo[curword].len = wordlen; \
469 trie->wordinfo[curword].accept = state; \
470 \
471 if ( noper_next < tail ) { \
472 if (!trie->jump) { \
473 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
474 sizeof(U16) ); \
475 trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
476 sizeof(U16) ); \
477 trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \
478 sizeof(U16) ); \
479 } \
480 trie->jump[curword] = (U16)(noper_next - convert); \
481 U16 set_before_paren; \
482 U16 set_after_paren; \
483 if (OP(cur) == BRANCH) { \
484 set_before_paren = ARG1a(cur); \
485 set_after_paren = ARG1b(cur); \
486 } else { \
487 set_before_paren = ARG2a(cur); \
488 set_after_paren = ARG2b(cur); \
489 } \
490 trie->j_before_paren[curword] = set_before_paren; \
491 trie->j_after_paren[curword] = set_after_paren; \
492 if (!jumper) \
493 jumper = noper_next; \
494 if (!nextbranch) \
495 nextbranch= regnext(cur); \
496 } \
497 \
498 if ( dupe ) { \
499 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
500 /* chain, so that when the bits of chain are later */\
501 /* linked together, the dups appear in the chain */\
502 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
503 trie->wordinfo[dupe].prev = curword; \
504 } else { \
505 /* we haven't inserted this word yet. */ \
506 trie->states[ state ].wordnum = curword; \
507 } \
508 } STMT_END
509
510
511 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
512 ( ( base + charid >= ucharcount \
513 && base + charid < ubound \
514 && state == trie->trans[ base - ucharcount + charid ].check \
515 && trie->trans[ base - ucharcount + charid ].next ) \
516 ? trie->trans[ base - ucharcount + charid ].next \
517 : ( state==1 ? special : 0 ) \
518 )
519
520 /* Helper function for make_trie(): set a trie bit for both the character
521 * and its folded variant, and for the first byte of a variant codepoint,
522 * if any */
523
524 STATIC void
S_trie_bitmap_set_folded(pTHX_ RExC_state_t * pRExC_state,reg_trie_data * trie,U8 ch,const U8 * folder)525 S_trie_bitmap_set_folded(pTHX_ RExC_state_t *pRExC_state,
526 reg_trie_data *trie, U8 ch, const U8 * folder)
527 {
528 TRIE_BITMAP_SET(trie, ch);
529 /* store the folded codepoint */
530 if ( folder )
531 TRIE_BITMAP_SET(trie, folder[ch]);
532
533 if ( !UTF ) {
534 /* store first byte of utf8 representation of */
535 /* variant codepoints */
536 if (! UVCHR_IS_INVARIANT(ch)) {
537 U8 hi = UTF8_TWO_BYTE_HI(ch);
538 /* Note that hi will be either 0xc2 or 0xc3 (apart from EBCDIC
539 * systems), and TRIE_BITMAP_SET() will do >>3 to get the byte
540 * offset within the bit table, which is constant, and
541 * Coverity complained about this (CID 488118). */
542 TRIE_BITMAP_SET(trie, hi);
543 }
544 }
545 }
546
547
548 I32
Perl_make_trie(pTHX_ RExC_state_t * pRExC_state,regnode * startbranch,regnode * first,regnode * last,regnode * tail,U32 word_count,U32 flags,U32 depth)549 Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
550 regnode *first, regnode *last, regnode *tail,
551 U32 word_count, U32 flags, U32 depth)
552 {
553 /* first pass, loop through and scan words */
554 reg_trie_data *trie;
555 HV *widecharmap = NULL;
556 AV *revcharmap = newAV();
557 regnode *cur;
558 STRLEN len = 0;
559 UV uvc = 0;
560 U16 curword = 0;
561 U32 next_alloc = 0;
562 regnode *jumper = NULL;
563 regnode *nextbranch = NULL;
564 regnode *lastbranch = NULL;
565 regnode *convert = NULL;
566 U32 *prev_states; /* temp array mapping each state to previous one */
567 /* we just use folder as a flag in utf8 */
568 const U8 * folder = NULL;
569
570 /* in the below reg_add_data call we are storing either 'tu' or 'tuaa'
571 * which stands for one trie structure, one hash, optionally followed
572 * by two arrays */
573 #ifdef DEBUGGING
574 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa"));
575 AV *trie_words = NULL;
576 /* along with revcharmap, this only used during construction but both are
577 * useful during debugging so we store them in the struct when debugging.
578 */
579 #else
580 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu"));
581 STRLEN trie_charcount=0;
582 #endif
583 SV *re_trie_maxbuff;
584 DECLARE_AND_GET_RE_DEBUG_FLAGS;
585
586 PERL_ARGS_ASSERT_MAKE_TRIE;
587 #ifndef DEBUGGING
588 PERL_UNUSED_ARG(depth);
589 #endif
590
591 switch (flags) {
592 case EXACT: case EXACT_REQ8: case EXACTL: break;
593 case EXACTFAA:
594 case EXACTFUP:
595 case EXACTFU:
596 case EXACTFLU8: folder = PL_fold_latin1; break;
597 case EXACTF: folder = PL_fold; break;
598 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) );
599 }
600
601 /* create the trie struct, all zeroed */
602 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
603 trie->refcount = 1;
604 trie->startstate = 1;
605 trie->wordcount = word_count;
606 RExC_rxi->data->data[ data_slot ] = (void*)trie;
607 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
608 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
609 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
610 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
611 trie->wordcount+1, sizeof(reg_trie_wordinfo));
612
613 DEBUG_r({
614 trie_words = newAV();
615 });
616
617 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
618 assert(re_trie_maxbuff);
619 if (!SvIOK(re_trie_maxbuff)) {
620 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
621 }
622 DEBUG_TRIE_COMPILE_r({
623 Perl_re_indentf( aTHX_
624 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
625 depth+1,
626 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
627 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
628 });
629
630 /* Find the node we are going to overwrite */
631 if ( first == startbranch && OP( last ) != BRANCH ) {
632 /* whole branch chain */
633 convert = first;
634 } else {
635 /* branch sub-chain */
636 convert = REGNODE_AFTER( first );
637 }
638
639 /* -- First loop and Setup --
640
641 We first traverse the branches and scan each word to determine if it
642 contains widechars, and how many unique chars there are, this is
643 important as we have to build a table with at least as many columns as we
644 have unique chars.
645
646 We use an array of integers to represent the character codes 0..255
647 (trie->charmap) and we use a an HV* to store Unicode characters. We use
648 the native representation of the character value as the key and IV's for
649 the coded index.
650
651 *TODO* If we keep track of how many times each character is used we can
652 remap the columns so that the table compression later on is more
653 efficient in terms of memory by ensuring the most common value is in the
654 middle and the least common are on the outside. IMO this would be better
655 than a most to least common mapping as theres a decent chance the most
656 common letter will share a node with the least common, meaning the node
657 will not be compressible. With a middle is most common approach the worst
658 case is when we have the least common nodes twice.
659
660 */
661
662 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
663 regnode *noper = REGNODE_AFTER( cur );
664 const U8 *uc;
665 const U8 *e;
666 int foldlen = 0;
667 U32 wordlen = 0; /* required init */
668 STRLEN minchars = 0;
669 STRLEN maxchars = 0;
670 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
671 bitmap?*/
672
673 /* wordlen is needed for the TRIE_READ_CHAR() macro, but we don't use its
674 value in this scope, we only modify it. clang 17 warns about this.
675 The later definitions of wordlen in this function do have their values
676 used.
677 */
678 PERL_UNUSED_VAR(wordlen);
679
680 lastbranch = cur;
681
682 if (OP(noper) == NOTHING) {
683 /* skip past a NOTHING at the start of an alternation
684 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
685 *
686 * If the next node is not something we are supposed to process
687 * we will just ignore it due to the condition guarding the
688 * next block.
689 */
690
691 regnode *noper_next= regnext(noper);
692 if (noper_next < tail)
693 noper= noper_next;
694 }
695
696 if ( noper < tail
697 && ( OP(noper) == flags
698 || (flags == EXACT && OP(noper) == EXACT_REQ8)
699 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
700 || OP(noper) == EXACTFUP))))
701 {
702 uc= (U8*)STRING(noper);
703 e= uc + STR_LEN(noper);
704 } else {
705 trie->minlen= 0;
706 continue;
707 }
708
709
710 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
711 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
712 regardless of encoding */
713 if (OP( noper ) == EXACTFUP) {
714 /* false positives are ok, so just set this */
715 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
716 }
717 }
718
719 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
720 branch */
721 TRIE_CHARCOUNT(trie)++;
722 TRIE_READ_CHAR;
723
724 /* TRIE_READ_CHAR returns the current character, or its fold if /i
725 * is in effect. Under /i, this character can match itself, or
726 * anything that folds to it. If not under /i, it can match just
727 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
728 * all fold to k, and all are single characters. But some folds
729 * expand to more than one character, so for example LATIN SMALL
730 * LIGATURE FFI folds to the three character sequence 'ffi'. If
731 * the string beginning at 'uc' is 'ffi', it could be matched by
732 * three characters, or just by the one ligature character. (It
733 * could also be matched by two characters: LATIN SMALL LIGATURE FF
734 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
735 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
736 * match.) The trie needs to know the minimum and maximum number
737 * of characters that could match so that it can use size alone to
738 * quickly reject many match attempts. The max is simple: it is
739 * the number of folded characters in this branch (since a fold is
740 * never shorter than what folds to it. */
741
742 maxchars++;
743
744 /* And the min is equal to the max if not under /i (indicated by
745 * 'folder' being NULL), or there are no multi-character folds. If
746 * there is a multi-character fold, the min is incremented just
747 * once, for the character that folds to the sequence. Each
748 * character in the sequence needs to be added to the list below of
749 * characters in the trie, but we count only the first towards the
750 * min number of characters needed. This is done through the
751 * variable 'foldlen', which is returned by the macros that look
752 * for these sequences as the number of bytes the sequence
753 * occupies. Each time through the loop, we decrement 'foldlen' by
754 * how many bytes the current char occupies. Only when it reaches
755 * 0 do we increment 'minchars' or look for another multi-character
756 * sequence. */
757 if (folder == NULL) {
758 minchars++;
759 }
760 else if (foldlen > 0) {
761 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
762 }
763 else {
764 minchars++;
765
766 /* See if *uc is the beginning of a multi-character fold. If
767 * so, we decrement the length remaining to look at, to account
768 * for the current character this iteration. (We can use 'uc'
769 * instead of the fold returned by TRIE_READ_CHAR because the
770 * macro is smart enough to account for any unfolded
771 * characters. */
772 if (UTF) {
773 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
774 foldlen -= UTF8SKIP(uc);
775 }
776 }
777 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
778 foldlen--;
779 }
780 }
781
782 /* The current character (and any potential folds) should be added
783 * to the possible matching characters for this position in this
784 * branch */
785 if ( uvc < 256 ) {
786 if ( folder ) {
787 U8 folded= folder[ (U8) uvc ];
788 if ( !trie->charmap[ folded ] ) {
789 trie->charmap[ folded ]=( ++trie->uniquecharcount );
790 TRIE_STORE_REVCHAR( folded );
791 }
792 }
793 if ( !trie->charmap[ uvc ] ) {
794 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
795 TRIE_STORE_REVCHAR( uvc );
796 }
797 if ( set_bit ) {
798 /* store the codepoint in the bitmap, and its folded
799 * equivalent. */
800 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, uvc, folder);
801 set_bit = 0; /* We've done our bit :-) */
802 }
803 } else {
804
805 /* XXX We could come up with the list of code points that fold
806 * to this using PL_utf8_foldclosures, except not for
807 * multi-char folds, as there may be multiple combinations
808 * there that could work, which needs to wait until runtime to
809 * resolve (The comment about LIGATURE FFI above is such an
810 * example */
811
812 SV** svpp;
813 if ( !widecharmap )
814 widecharmap = newHV();
815
816 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
817
818 if ( !svpp )
819 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
820
821 if ( !SvTRUE( *svpp ) ) {
822 sv_setiv( *svpp, ++trie->uniquecharcount );
823 TRIE_STORE_REVCHAR(uvc);
824 }
825 }
826 } /* end loop through characters in this branch of the trie */
827
828 /* We take the min and max for this branch and combine to find the min
829 * and max for all branches processed so far */
830 if( cur == first ) {
831 trie->minlen = minchars;
832 trie->maxlen = maxchars;
833 } else if (minchars < trie->minlen) {
834 trie->minlen = minchars;
835 } else if (maxchars > trie->maxlen) {
836 trie->maxlen = maxchars;
837 }
838 } /* end first pass */
839 trie->before_paren = OP(first) == BRANCH
840 ? ARG1a(first)
841 : ARG2a(first); /* BRANCHJ */
842
843 trie->after_paren = OP(lastbranch) == BRANCH
844 ? ARG1b(lastbranch)
845 : ARG2b(lastbranch); /* BRANCHJ */
846 DEBUG_TRIE_COMPILE_r(
847 Perl_re_indentf( aTHX_
848 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
849 depth+1,
850 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
851 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
852 (int)trie->minlen, (int)trie->maxlen )
853 );
854
855 /*
856 We now know what we are dealing with in terms of unique chars and
857 string sizes so we can calculate how much memory a naive
858 representation using a flat table will take. If it's over a reasonable
859 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
860 conservative but potentially much slower representation using an array
861 of lists.
862
863 At the end we convert both representations into the same compressed
864 form that will be used in regexec.c for matching with. The latter
865 is a form that cannot be used to construct with but has memory
866 properties similar to the list form and access properties similar
867 to the table form making it both suitable for fast searches and
868 small enough that its feasable to store for the duration of a program.
869
870 See the comment in the code where the compressed table is produced
871 inplace from the flat tabe representation for an explanation of how
872 the compression works.
873
874 */
875
876
877 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
878 prev_states[1] = 0;
879
880 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
881 > SvIV(re_trie_maxbuff) )
882 {
883 /*
884 Second Pass -- Array Of Lists Representation
885
886 Each state will be represented by a list of charid:state records
887 (reg_trie_trans_le) the first such element holds the CUR and LEN
888 points of the allocated array. (See defines above).
889
890 We build the initial structure using the lists, and then convert
891 it into the compressed table form which allows faster lookups
892 (but cant be modified once converted).
893 */
894
895 STRLEN transcount = 1;
896
897 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
898 depth+1));
899
900 trie->states = (reg_trie_state *)
901 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
902 sizeof(reg_trie_state) );
903 TRIE_LIST_NEW(1);
904 next_alloc = 2;
905
906 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
907
908 regnode *noper = REGNODE_AFTER( cur );
909 U32 state = 1; /* required init */
910 U16 charid = 0; /* sanity init */
911 U32 wordlen = 0; /* required init */
912
913 if (OP(noper) == NOTHING) {
914 regnode *noper_next= regnext(noper);
915 if (noper_next < tail)
916 noper= noper_next;
917 /* we will undo this assignment if noper does not
918 * point at a trieable type in the else clause of
919 * the following statement. */
920 }
921
922 if ( noper < tail
923 && ( OP(noper) == flags
924 || (flags == EXACT && OP(noper) == EXACT_REQ8)
925 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
926 || OP(noper) == EXACTFUP))))
927 {
928 const U8 *uc= (U8*)STRING(noper);
929 const U8 *e= uc + STR_LEN(noper);
930
931 for ( ; uc < e ; uc += len ) {
932
933 TRIE_READ_CHAR;
934
935 if ( uvc < 256 ) {
936 charid = trie->charmap[ uvc ];
937 } else {
938 SV** const svpp = hv_fetch( widecharmap,
939 (char*)&uvc,
940 sizeof( UV ),
941 0);
942 if ( !svpp ) {
943 charid = 0;
944 } else {
945 charid=(U16)SvIV( *svpp );
946 }
947 }
948 /* charid is now 0 if we dont know the char read, or
949 * nonzero if we do */
950 if ( charid ) {
951
952 U16 check;
953 U32 newstate = 0;
954
955 charid--;
956 if ( !trie->states[ state ].trans.list ) {
957 TRIE_LIST_NEW( state );
958 }
959 for ( check = 1;
960 check <= TRIE_LIST_USED( state );
961 check++ )
962 {
963 if ( TRIE_LIST_ITEM( state, check ).forid
964 == charid )
965 {
966 newstate = TRIE_LIST_ITEM( state, check ).newstate;
967 break;
968 }
969 }
970 if ( ! newstate ) {
971 newstate = next_alloc++;
972 prev_states[newstate] = state;
973 TRIE_LIST_PUSH( state, charid, newstate );
974 transcount++;
975 }
976 state = newstate;
977 } else {
978 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
979 }
980 }
981 } else {
982 /* If we end up here it is because we skipped past a NOTHING, but did not end up
983 * on a trieable type. So we need to reset noper back to point at the first regop
984 * in the branch before we call TRIE_HANDLE_WORD()
985 */
986 noper= REGNODE_AFTER(cur);
987 }
988 TRIE_HANDLE_WORD(state);
989
990 } /* end second pass */
991
992 /* next alloc is the NEXT state to be allocated */
993 trie->statecount = next_alloc;
994 trie->states = (reg_trie_state *)
995 PerlMemShared_realloc( trie->states,
996 next_alloc
997 * sizeof(reg_trie_state) );
998
999 /* and now dump it out before we compress it */
1000 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1001 revcharmap, next_alloc,
1002 depth+1)
1003 );
1004
1005 trie->trans = (reg_trie_trans *)
1006 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1007 {
1008 U32 state;
1009 U32 tp = 0;
1010 U32 zp = 0;
1011
1012
1013 for( state=1 ; state < next_alloc ; state ++ ) {
1014 U32 base=0;
1015
1016 /*
1017 DEBUG_TRIE_COMPILE_MORE_r(
1018 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
1019 );
1020 */
1021
1022 if (trie->states[state].trans.list) {
1023 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1024 U16 maxid=minid;
1025 U16 idx;
1026
1027 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1028 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1029 if ( forid < minid ) {
1030 minid=forid;
1031 } else if ( forid > maxid ) {
1032 maxid=forid;
1033 }
1034 }
1035 if ( transcount < tp + maxid - minid + 1) {
1036 transcount *= 2;
1037 trie->trans = (reg_trie_trans *)
1038 PerlMemShared_realloc( trie->trans,
1039 transcount
1040 * sizeof(reg_trie_trans) );
1041 Zero( trie->trans + (transcount / 2),
1042 transcount / 2,
1043 reg_trie_trans );
1044 }
1045 base = trie->uniquecharcount + tp - minid;
1046 if ( maxid == minid ) {
1047 U32 set = 0;
1048 for ( ; zp < tp ; zp++ ) {
1049 if ( ! trie->trans[ zp ].next ) {
1050 base = trie->uniquecharcount + zp - minid;
1051 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
1052 1).newstate;
1053 trie->trans[ zp ].check = state;
1054 set = 1;
1055 break;
1056 }
1057 }
1058 if ( !set ) {
1059 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
1060 1).newstate;
1061 trie->trans[ tp ].check = state;
1062 tp++;
1063 zp = tp;
1064 }
1065 } else {
1066 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1067 const U32 tid = base
1068 - trie->uniquecharcount
1069 + TRIE_LIST_ITEM( state, idx ).forid;
1070 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
1071 idx ).newstate;
1072 trie->trans[ tid ].check = state;
1073 }
1074 tp += ( maxid - minid + 1 );
1075 }
1076 Safefree(trie->states[ state ].trans.list);
1077 }
1078 /*
1079 DEBUG_TRIE_COMPILE_MORE_r(
1080 Perl_re_printf( aTHX_ " base: %d\n",base);
1081 );
1082 */
1083 trie->states[ state ].trans.base=base;
1084 }
1085 trie->lasttrans = tp + 1;
1086 }
1087 } else {
1088 /*
1089 Second Pass -- Flat Table Representation.
1090
1091 we dont use the 0 slot of either trans[] or states[] so we add 1 to
1092 each. We know that we will need Charcount+1 trans at most to store
1093 the data (one row per char at worst case) So we preallocate both
1094 structures assuming worst case.
1095
1096 We then construct the trie using only the .next slots of the entry
1097 structs.
1098
1099 We use the .check field of the first entry of the node temporarily
1100 to make compression both faster and easier by keeping track of how
1101 many non zero fields are in the node.
1102
1103 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1104 transition.
1105
1106 There are two terms at use here: state as a TRIE_NODEIDX() which is
1107 a number representing the first entry of the node, and state as a
1108 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
1109 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
1110 if there are 2 entrys per node. eg:
1111
1112 A B A B
1113 1. 2 4 1. 3 7
1114 2. 0 3 3. 0 5
1115 3. 0 0 5. 0 0
1116 4. 0 0 7. 0 0
1117
1118 The table is internally in the right hand, idx form. However as we
1119 also have to deal with the states array which is indexed by nodenum
1120 we have to use TRIE_NODENUM() to convert.
1121
1122 */
1123 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
1124 depth+1));
1125
1126 trie->trans = (reg_trie_trans *)
1127 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1128 * trie->uniquecharcount + 1,
1129 sizeof(reg_trie_trans) );
1130 trie->states = (reg_trie_state *)
1131 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1132 sizeof(reg_trie_state) );
1133 next_alloc = trie->uniquecharcount + 1;
1134
1135
1136 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1137
1138 regnode *noper = REGNODE_AFTER( cur );
1139
1140 U32 state = 1; /* required init */
1141
1142 U16 charid = 0; /* sanity init */
1143 U32 accept_state = 0; /* sanity init */
1144
1145 U32 wordlen = 0; /* required init */
1146
1147 if (OP(noper) == NOTHING) {
1148 regnode *noper_next= regnext(noper);
1149 if (noper_next < tail)
1150 noper= noper_next;
1151 /* we will undo this assignment if noper does not
1152 * point at a trieable type in the else clause of
1153 * the following statement. */
1154 }
1155
1156 if ( noper < tail
1157 && ( OP(noper) == flags
1158 || (flags == EXACT && OP(noper) == EXACT_REQ8)
1159 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
1160 || OP(noper) == EXACTFUP))))
1161 {
1162 const U8 *uc= (U8*)STRING(noper);
1163 const U8 *e= uc + STR_LEN(noper);
1164
1165 for ( ; uc < e ; uc += len ) {
1166
1167 TRIE_READ_CHAR;
1168
1169 if ( uvc < 256 ) {
1170 charid = trie->charmap[ uvc ];
1171 } else {
1172 SV* const * const svpp = hv_fetch( widecharmap,
1173 (char*)&uvc,
1174 sizeof( UV ),
1175 0);
1176 charid = svpp ? (U16)SvIV(*svpp) : 0;
1177 }
1178 if ( charid ) {
1179 charid--;
1180 if ( !trie->trans[ state + charid ].next ) {
1181 trie->trans[ state + charid ].next = next_alloc;
1182 trie->trans[ state ].check++;
1183 prev_states[TRIE_NODENUM(next_alloc)]
1184 = TRIE_NODENUM(state);
1185 next_alloc += trie->uniquecharcount;
1186 }
1187 state = trie->trans[ state + charid ].next;
1188 } else {
1189 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
1190 }
1191 /* charid is now 0 if we dont know the char read, or
1192 * nonzero if we do */
1193 }
1194 } else {
1195 /* If we end up here it is because we skipped past a NOTHING, but did not end up
1196 * on a trieable type. So we need to reset noper back to point at the first regop
1197 * in the branch before we call TRIE_HANDLE_WORD().
1198 */
1199 noper= REGNODE_AFTER(cur);
1200 }
1201 accept_state = TRIE_NODENUM( state );
1202 TRIE_HANDLE_WORD(accept_state);
1203
1204 } /* end second pass */
1205
1206 /* and now dump it out before we compress it */
1207 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1208 revcharmap,
1209 next_alloc, depth+1));
1210
1211 {
1212 /*
1213 * Inplace compress the table.*
1214
1215 For sparse data sets the table constructed by the trie algorithm will
1216 be mostly 0/FAIL transitions or to put it another way mostly empty.
1217 (Note that leaf nodes will not contain any transitions.)
1218
1219 This algorithm compresses the tables by eliminating most such
1220 transitions, at the cost of a modest bit of extra work during lookup:
1221
1222 - Each states[] entry contains a .base field which indicates the
1223 index in the state[] array wheres its transition data is stored.
1224
1225 - If .base is 0 there are no valid transitions from that node.
1226
1227 - If .base is nonzero then charid is added to it to find an entry in
1228 the trans array.
1229
1230 -If trans[states[state].base+charid].check!=state then the
1231 transition is taken to be a 0/Fail transition. Thus if there are fail
1232 transitions at the front of the node then the .base offset will point
1233 somewhere inside the previous nodes data (or maybe even into a node
1234 even earlier), but the .check field determines if the transition is
1235 valid.
1236
1237 XXX - wrong maybe?
1238 The following process inplace converts the table to the compressed
1239 table: We first do not compress the root node 1,and mark all its
1240 .check pointers as 1 and set its .base pointer as 1 as well. This
1241 allows us to do a DFA construction from the compressed table later,
1242 and ensures that any .base pointers we calculate later are greater
1243 than 0.
1244
1245 - We set 'pos' to indicate the first entry of the second node.
1246
1247 - We then iterate over the columns of the node, finding the first and
1248 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1249 and set the .check pointers accordingly, and advance pos
1250 appropriately and repreat for the next node. Note that when we copy
1251 the next pointers we have to convert them from the original
1252 NODEIDX form to NODENUM form as the former is not valid post
1253 compression.
1254
1255 - If a node has no transitions used we mark its base as 0 and do not
1256 advance the pos pointer.
1257
1258 - If a node only has one transition we use a second pointer into the
1259 structure to fill in allocated fail transitions from other states.
1260 This pointer is independent of the main pointer and scans forward
1261 looking for null transitions that are allocated to a state. When it
1262 finds one it writes the single transition into the "hole". If the
1263 pointer doesnt find one the single transition is appended as normal.
1264
1265 - Once compressed we can Renew/realloc the structures to release the
1266 excess space.
1267
1268 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1269 specifically Fig 3.47 and the associated pseudocode.
1270
1271 demq
1272 */
1273 const U32 laststate = TRIE_NODENUM( next_alloc );
1274 U32 state, charid;
1275 U32 pos = 0, zp=0;
1276 trie->statecount = laststate;
1277
1278 for ( state = 1 ; state < laststate ; state++ ) {
1279 U8 flag = 0;
1280 const U32 stateidx = TRIE_NODEIDX( state );
1281 const U32 o_used = trie->trans[ stateidx ].check;
1282 U32 used = trie->trans[ stateidx ].check;
1283 trie->trans[ stateidx ].check = 0;
1284
1285 for ( charid = 0;
1286 used && charid < trie->uniquecharcount;
1287 charid++ )
1288 {
1289 if ( flag || trie->trans[ stateidx + charid ].next ) {
1290 if ( trie->trans[ stateidx + charid ].next ) {
1291 if (o_used == 1) {
1292 for ( ; zp < pos ; zp++ ) {
1293 if ( ! trie->trans[ zp ].next ) {
1294 break;
1295 }
1296 }
1297 trie->states[ state ].trans.base
1298 = zp
1299 + trie->uniquecharcount
1300 - charid ;
1301 trie->trans[ zp ].next
1302 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
1303 + charid ].next );
1304 trie->trans[ zp ].check = state;
1305 if ( ++zp > pos ) pos = zp;
1306 break;
1307 }
1308 used--;
1309 }
1310 if ( !flag ) {
1311 flag = 1;
1312 trie->states[ state ].trans.base
1313 = pos + trie->uniquecharcount - charid ;
1314 }
1315 trie->trans[ pos ].next
1316 = SAFE_TRIE_NODENUM(
1317 trie->trans[ stateidx + charid ].next );
1318 trie->trans[ pos ].check = state;
1319 pos++;
1320 }
1321 }
1322 }
1323 trie->lasttrans = pos + 1;
1324 trie->states = (reg_trie_state *)
1325 PerlMemShared_realloc( trie->states, laststate
1326 * sizeof(reg_trie_state) );
1327 DEBUG_TRIE_COMPILE_MORE_r(
1328 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
1329 depth+1,
1330 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
1331 + 1 ),
1332 (IV)next_alloc,
1333 (IV)pos,
1334 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1335 );
1336
1337 } /* end table compress */
1338 }
1339 DEBUG_TRIE_COMPILE_MORE_r(
1340 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
1341 depth+1,
1342 (UV)trie->statecount,
1343 (UV)trie->lasttrans)
1344 );
1345 /* resize the trans array to remove unused space */
1346 trie->trans = (reg_trie_trans *)
1347 PerlMemShared_realloc( trie->trans, trie->lasttrans
1348 * sizeof(reg_trie_trans) );
1349
1350 { /* Modify the program and insert the new TRIE node */
1351 U8 nodetype =(U8) flags;
1352 char *str=NULL;
1353
1354 #ifdef DEBUGGING
1355 regnode *optimize = NULL;
1356 #endif /* DEBUGGING */
1357 /* make sure we have enough room to inject the TRIE op */
1358 assert((!trie->jump) || !trie->jump[1] ||
1359 (trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode))));
1360 /*
1361 This means we convert either the first branch or the first Exact,
1362 depending on whether the thing following (in 'last') is a branch
1363 or not and whther first is the startbranch (ie is it a sub part of
1364 the alternation or is it the whole thing.)
1365 Assuming its a sub part we convert the EXACT otherwise we convert
1366 the whole branch sequence, including the first.
1367 */
1368 /* Find the node we are going to overwrite */
1369 if ( first != startbranch || OP( last ) == BRANCH ) {
1370 /* branch sub-chain */
1371 NEXT_OFF( first ) = (U16)(last - first);
1372 /* whole branch chain */
1373 }
1374 /* But first we check to see if there is a common prefix we can
1375 split out as an EXACT and put in front of the TRIE node. */
1376 trie->startstate= 1;
1377 if ( trie->bitmap && !widecharmap && !trie->jump ) {
1378 /* we want to find the first state that has more than
1379 * one transition, if that state is not the first state
1380 * then we have a common prefix which we can remove.
1381 */
1382 U32 state;
1383 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
1384 U32 ofs = 0;
1385 I32 first_ofs = -1; /* keeps track of the ofs of the first
1386 transition, -1 means none */
1387 U32 count = 0;
1388 const U32 base = trie->states[ state ].trans.base;
1389
1390 /* does this state terminate an alternation? */
1391 if ( trie->states[state].wordnum )
1392 count = 1;
1393
1394 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1395 if ( ( base + ofs >= trie->uniquecharcount ) &&
1396 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1397 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1398 {
1399 if ( ++count > 1 ) {
1400 /* we have more than one transition */
1401 SV **tmp;
1402 U8 *ch;
1403 /* if this is the first state there is no common prefix
1404 * to extract, so we can exit */
1405 if ( state == 1 ) break;
1406 tmp = av_fetch_simple( revcharmap, ofs, 0);
1407 ch = (U8*)SvPV_nolen_const( *tmp );
1408
1409 /* if we are on count 2 then we need to initialize the
1410 * bitmap, and store the previous char if there was one
1411 * in it*/
1412 if ( count == 2 ) {
1413 /* clear the bitmap */
1414 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1415 DEBUG_OPTIMISE_r(
1416 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
1417 depth+1,
1418 (UV)state));
1419 if (first_ofs >= 0) {
1420 SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1421 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
1422
1423 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
1424 DEBUG_OPTIMISE_r(
1425 Perl_re_printf( aTHX_ "%s", (char*)ch)
1426 );
1427 }
1428 }
1429 /* store the current firstchar in the bitmap */
1430 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder);
1431 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
1432 }
1433 first_ofs = ofs;
1434 }
1435 }
1436 if ( count == 1 ) {
1437 /* This state has only one transition, its transition is part
1438 * of a common prefix - we need to concatenate the char it
1439 * represents to what we have so far. */
1440 SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0);
1441 STRLEN len;
1442 char *ch = SvPV( *tmp, len );
1443 DEBUG_OPTIMISE_r({
1444 SV *sv=sv_newmortal();
1445 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
1446 depth+1,
1447 (UV)state, (UV)first_ofs,
1448 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1449 PL_colors[0], PL_colors[1],
1450 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1451 PERL_PV_ESCAPE_FIRSTCHAR
1452 )
1453 );
1454 });
1455 if ( state==1 ) {
1456 OP( convert ) = nodetype;
1457 str=STRING(convert);
1458 setSTR_LEN(convert, 0);
1459 }
1460 assert( ( STR_LEN(convert) + len ) < 256 );
1461 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len));
1462 while (len--)
1463 *str++ = *ch++;
1464 } else {
1465 #ifdef DEBUGGING
1466 if (state>1)
1467 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
1468 #endif
1469 break;
1470 }
1471 }
1472 trie->prefixlen = (state-1);
1473 if (str) {
1474 regnode *n = REGNODE_AFTER(convert);
1475 assert( n - convert <= U16_MAX );
1476 NEXT_OFF(convert) = n - convert;
1477 trie->startstate = state;
1478 trie->minlen -= (state - 1);
1479 trie->maxlen -= (state - 1);
1480 #ifdef DEBUGGING
1481 /* At least the UNICOS C compiler choked on this
1482 * being argument to DEBUG_r(), so let's just have
1483 * it right here. */
1484 if (
1485 #ifdef PERL_EXT_RE_BUILD
1486 1
1487 #else
1488 DEBUG_r_TEST
1489 #endif
1490 ) {
1491 U32 word = trie->wordcount;
1492 while (word--) {
1493 SV ** const tmp = av_fetch_simple( trie_words, word, 0 );
1494 if (tmp) {
1495 if ( STR_LEN(convert) <= SvCUR(*tmp) )
1496 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
1497 else
1498 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
1499 }
1500 }
1501 }
1502 #endif
1503 if (trie->maxlen) {
1504 convert = n;
1505 } else {
1506 NEXT_OFF(convert) = (U16)(tail - convert);
1507 DEBUG_r(optimize= n);
1508 }
1509 }
1510 }
1511 if (!jumper)
1512 jumper = last;
1513 if ( trie->maxlen ) {
1514 NEXT_OFF( convert ) = (U16)(tail - convert);
1515 ARG1u_SET( convert, data_slot );
1516 /* Store the offset to the first unabsorbed branch in
1517 jump[0], which is otherwise unused by the jump logic.
1518 We use this when dumping a trie and during optimisation. */
1519 if (trie->jump)
1520 trie->jump[0] = (U16)(nextbranch - convert);
1521
1522 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
1523 * and there is a bitmap
1524 * and the first "jump target" node we found leaves enough room
1525 * then convert the TRIE node into a TRIEC node, with the bitmap
1526 * embedded inline in the opcode - this is hypothetically faster.
1527 */
1528 if ( !trie->states[trie->startstate].wordnum
1529 && trie->bitmap
1530 && ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) )
1531 {
1532 OP( convert ) = TRIEC;
1533 Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
1534 PerlMemShared_free(trie->bitmap);
1535 trie->bitmap= NULL;
1536 } else
1537 OP( convert ) = TRIE;
1538
1539 /* store the type in the flags */
1540 FLAGS(convert) = nodetype;
1541 DEBUG_r({
1542 optimize = convert
1543 + NODE_STEP_REGNODE
1544 + REGNODE_ARG_LEN( OP( convert ) );
1545 });
1546 /* XXX We really should free up the resource in trie now,
1547 as we won't use them - (which resources?) dmq */
1548 }
1549 /* needed for dumping*/
1550 DEBUG_r(if (optimize) {
1551 /*
1552 Try to clean up some of the debris left after the
1553 optimisation.
1554 */
1555 while( optimize < jumper ) {
1556 OP( optimize ) = OPTIMIZED;
1557 optimize++;
1558 }
1559 });
1560 } /* end node insert */
1561
1562 /* Finish populating the prev field of the wordinfo array. Walk back
1563 * from each accept state until we find another accept state, and if
1564 * so, point the first word's .prev field at the second word. If the
1565 * second already has a .prev field set, stop now. This will be the
1566 * case either if we've already processed that word's accept state,
1567 * or that state had multiple words, and the overspill words were
1568 * already linked up earlier.
1569 */
1570 {
1571 U16 word;
1572 U32 state;
1573 U16 prev;
1574
1575 for (word=1; word <= trie->wordcount; word++) {
1576 prev = 0;
1577 if (trie->wordinfo[word].prev)
1578 continue;
1579 state = trie->wordinfo[word].accept;
1580 while (state) {
1581 state = prev_states[state];
1582 if (!state)
1583 break;
1584 prev = trie->states[state].wordnum;
1585 if (prev)
1586 break;
1587 }
1588 trie->wordinfo[word].prev = prev;
1589 }
1590 Safefree(prev_states);
1591 }
1592
1593
1594 /* and now dump out the compressed format */
1595 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
1596
1597 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
1598 #ifdef DEBUGGING
1599 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
1600 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
1601 #else
1602 SvREFCNT_dec_NN(revcharmap);
1603 #endif
1604 return trie->jump
1605 ? MADE_JUMP_TRIE
1606 : trie->startstate>1
1607 ? MADE_EXACT_TRIE
1608 : MADE_TRIE;
1609 }
1610
1611 regnode *
Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t * pRExC_state,regnode * source,U32 depth)1612 Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
1613 {
1614 /* The Trie is constructed and compressed now so we can build a fail array if
1615 * it's needed
1616
1617 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
1618 3.32 in the
1619 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
1620 Ullman 1985/88
1621 ISBN 0-201-10088-6
1622
1623 We find the fail state for each state in the trie, this state is the longest
1624 proper suffix of the current state's 'word' that is also a proper prefix of
1625 another word in our trie. State 1 represents the word '' and is thus the
1626 default fail state. This allows the DFA not to have to restart after its
1627 tried and failed a word at a given point, it simply continues as though it
1628 had been matching the other word in the first place.
1629 Consider
1630 'abcdgu'=~/abcdefg|cdgu/
1631 When we get to 'd' we are still matching the first word, we would encounter
1632 'g' which would fail, which would bring us to the state representing 'd' in
1633 the second word where we would try 'g' and succeed, proceeding to match
1634 'cdgu'.
1635 */
1636 /* add a fail transition */
1637 const U32 trie_offset = ARG1u(source);
1638 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
1639 U32 *q;
1640 const U32 ucharcount = trie->uniquecharcount;
1641 const U32 numstates = trie->statecount;
1642 const U32 ubound = trie->lasttrans + ucharcount;
1643 U32 q_read = 0;
1644 U32 q_write = 0;
1645 U32 charid;
1646 U32 base = trie->states[ 1 ].trans.base;
1647 U32 *fail;
1648 reg_ac_data *aho;
1649 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T"));
1650 regnode *stclass;
1651 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1652
1653 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
1654 PERL_UNUSED_CONTEXT;
1655 #ifndef DEBUGGING
1656 PERL_UNUSED_ARG(depth);
1657 #endif
1658
1659 if ( OP(source) == TRIE ) {
1660 tregnode_TRIE *op = (tregnode_TRIE *)
1661 PerlMemShared_calloc(1, sizeof(tregnode_TRIE));
1662 StructCopy(source, op, tregnode_TRIE);
1663 stclass = (regnode *)op;
1664 } else {
1665 tregnode_TRIEC *op = (tregnode_TRIEC *)
1666 PerlMemShared_calloc(1, sizeof(tregnode_TRIEC));
1667 StructCopy(source, op, tregnode_TRIEC);
1668 stclass = (regnode *)op;
1669 }
1670 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
1671
1672 ARG1u_SET( stclass, data_slot );
1673 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
1674 RExC_rxi->data->data[ data_slot ] = (void*)aho;
1675 aho->trie=trie_offset;
1676 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
1677 Copy( trie->states, aho->states, numstates, reg_trie_state );
1678 Newx( q, numstates, U32);
1679 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
1680 aho->refcount = 1;
1681 fail = aho->fail;
1682 /* initialize fail[0..1] to be 1 so that we always have
1683 a valid final fail state */
1684 fail[ 0 ] = fail[ 1 ] = 1;
1685
1686 for ( charid = 0; charid < ucharcount ; charid++ ) {
1687 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
1688 if ( newstate ) {
1689 q[ q_write ] = newstate;
1690 /* set to point at the root */
1691 fail[ q[ q_write++ ] ]=1;
1692 }
1693 }
1694 while ( q_read < q_write) {
1695 const U32 cur = q[ q_read++ % numstates ];
1696 base = trie->states[ cur ].trans.base;
1697
1698 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
1699 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
1700 if (ch_state) {
1701 U32 fail_state = cur;
1702 U32 fail_base;
1703 do {
1704 fail_state = fail[ fail_state ];
1705 fail_base = aho->states[ fail_state ].trans.base;
1706 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
1707
1708 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
1709 fail[ ch_state ] = fail_state;
1710 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
1711 {
1712 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
1713 }
1714 q[ q_write++ % numstates] = ch_state;
1715 }
1716 }
1717 }
1718 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
1719 when we fail in state 1, this allows us to use the
1720 charclass scan to find a valid start char. This is based on the principle
1721 that theres a good chance the string being searched contains lots of stuff
1722 that cant be a start char.
1723 */
1724 fail[ 0 ] = fail[ 1 ] = 0;
1725 DEBUG_TRIE_COMPILE_r({
1726 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0",
1727 depth, (UV)numstates
1728 );
1729 for( q_read=1; q_read<numstates; q_read++ ) {
1730 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]);
1731 }
1732 Perl_re_printf( aTHX_ "\n");
1733 });
1734 Safefree(q);
1735 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
1736 return stclass;
1737 }
1738