1#!/usr/bin/perl -w
2#
3#
4# Regenerate (overwriting only if changed):
5#
6#    pod/perldebguts.pod
7#    regnodes.h
8#
9# from information stored in
10#
11#    regcomp.sym
12#    regexp.h
13#
14# pod/perldebguts.pod is not completely regenerated.  Only the table of
15# regexp nodes is replaced; other parts remain unchanged.
16#
17# Accepts the standard regen_lib -q and -v args.
18#
19# This script is normally invoked from regen.pl.
20
21BEGIN {
22    # Get function prototypes
23    require './regen/regen_lib.pl';
24}
25use strict;
26
27# NOTE I don't think anyone actually knows what all of these properties mean,
28# and I suspect some of them are outright unused. This is a first attempt to
29# clean up the generation so maybe one day we can move to something more self
30# documenting. (One might argue that an array of hashes of properties would
31# be easier to use.)
32#
33# Why we use the term regnode and nodes, and not say, opcodes, I am not sure.
34
35# General thoughts:
36# 1. We use a single continuum to represent both opcodes and states,
37#    and in regexec.c we switch on the combined set.
38# 2. Opcodes have more information associated to them, states are simpler,
39#    basically just an identifier/number that can be used to switch within
40#    the state machine.
41# 3. Some opcode are order dependent.
42# 4. Output files often use "tricks" to reduce diff effects. Some of what
43#    we do below is more clumsy looking than it could be because of this.
44
45# Op/state properties:
46#
47# Property      In      Descr
48# ----------------------------------------------------------------------------
49# name          Both    Name of op/state
50# id            Both    integer value for this opcode/state
51# optype        Both    Either 'op' or 'state'
52# line_num          Both    line_num number of the input file for this item.
53# type          Op      Type of node (aka regkind)
54# code          Op      what code is associated with this node (???)
55# args          Op      what type of args the node has (which regnode struct)
56# flags         Op      (???)
57# longj         Op      Boolean as to if this node is a longjump
58# comment       Both    Comment about node, if any
59# pod_comment   Both    Special comments for pod output (preceding lines in def)
60
61# Global State
62my @all;    # all opcodes/state
63my %all;    # hash of all opcode/state names
64
65my @ops;    # array of just opcodes
66my @states; # array of just states
67
68my $longest_name_length= 0; # track lengths of names for nicer reports
69my (%type_alias);           # map the type (??)
70
71# register a newly constructed node into our state tables.
72# ensures that we have no name collisions (on name anyway),
73# and issues the "id" for the node.
74sub register_node {
75    my ($node)= @_;
76
77    if ( $all{ $node->{name} } ) {
78        die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} "
79            . "previously defined on line $all{ $node->{name} }{line_num}\n";
80    } elsif (!$node->{optype}) {
81        die "must have an optype in node ", Dumper($node);
82    } elsif ($node->{optype} eq "op") {
83        push @ops, $node;
84    } elsif ($node->{optype} eq "state") {
85        push @states, $node;
86    } else {
87        die "Uknown optype '$node->{optype}' in ", Dumper($node);
88    }
89    $node->{id}= 0 + @all;
90    push @all, $node;
91    $all{ $node->{name} }= $node;
92
93    if ($node->{longj} && $node->{longj} != 1) {
94        die "longj field must be in [01] if present in ", Dumper($node);
95    }
96
97}
98
99# Parse and add an opcode definition to the global state.
100# An opcode definition looks like this:
101#
102#                             +- args
103#                             | +- flags
104#                             | | +- longjmp
105# Name        Type       code | | | ; comment
106# --------------------------------------------------------------------------
107# IFMATCH     BRANCHJ,    off 1 . 2 ; Succeeds if the following matches.
108# UNLESSM     BRANCHJ,    off 1 . 2 ; Fails if the following matches.
109# SUSPEND     BRANCHJ,    off 1 V 1 ; "Independent" sub-RE.
110# IFTHEN      BRANCHJ,    off 1 V 1 ; Switch, should be preceded by switcher.
111# GROUPP      GROUPP,     num 1     ; Whether the group matched.
112#
113# Not every opcode definition has all of these. We should maybe make this
114# nicer/easier to read in the future. Also note that the above is tab
115# sensitive.
116
117sub parse_opcode_def {
118    my ( $text, $line_num, $pod_comment )= @_;
119    my $node= {
120        line_num    => $line_num,
121        pod_comment => $pod_comment,
122        optype      => "op",
123    };
124
125    # first split the line into three, the initial NAME, a middle part
126    # that we call "desc" which contains various (not well documented) things,
127    # and a comment section.
128    @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/
129        or die "Failed to match $_";
130
131    # the content of the "desc" field from the first step is extracted here:
132    @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc};
133
134    defined $node->{$_} or $node->{$_} = ""
135        for qw(type code args flags longj);
136
137    register_node($node); # has to be before the type_alias code below
138
139    if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) {
140
141        #warn "Regop type '$node->{type}' from regcomp.sym line $line_num"
142        #     ." is not an existing regop, and will be aliased to $node->{name}\n"
143        #    if -t STDERR;
144        $type_alias{ $node->{type} }= $node->{name};
145    }
146
147    $longest_name_length= length $node->{name}
148        if length $node->{name} > $longest_name_length;
149}
150
151# parse out a state definition and add the resulting data
152# into the global state. may create multiple new states from
153# a single definition (this is part of the point).
154# Format for states:
155# REGOP \t typelist [ \t typelist]
156# typelist= namelist
157#         = namelist:FAIL
158#         = name:count
159# Eg:
160# WHILEM          A_pre,A_min,A_max,B_min,B_max:FAIL
161# BRANCH          next:FAIL
162# CURLYM          A,B:FAIL
163#
164# The CURLYM definition would create the states:
165# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail
166sub parse_state_def {
167    my ( $text, $line_num, $pod_comment )= @_;
168    my ( $type, @lists )= split /\s+/, $text;
169    die "No list? $type" if !@lists;
170    foreach my $list (@lists) {
171        my ( $names, $special )= split /:/, $list, 2;
172        $special ||= "";
173        foreach my $name ( split /,/, $names ) {
174            my $real=
175                $name eq 'resume'
176                ? "resume_$type"
177                : "${type}_$name";
178            my @suffix;
179            if ( !$special ) {
180                @suffix= ("");
181            }
182            elsif ( $special =~ /\d/ ) {
183                @suffix= ( 1 .. $special );
184            }
185            elsif ( $special eq 'FAIL' ) {
186                @suffix= ( "", "_fail" );
187            }
188            else {
189                die "unknown :type ':$special'";
190            }
191            foreach my $suffix (@suffix) {
192                my $node= {
193                    name        => "$real$suffix",
194                    optype      => "state",
195                    type        => $type || "",
196                    comment     => "state for $type",
197                    line_num    => $line_num,
198                };
199                register_node($node);
200            }
201        }
202    }
203}
204
205sub process_flags {
206    my ( $flag, $varname, $comment )= @_;
207    $comment= '' unless defined $comment;
208
209    my @selected;
210    my $bitmap= '';
211    for my $node (@ops) {
212        my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0;
213
214        # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic
215        # ops in the C code.
216        my $current= do {
217            no warnings;
218            ord substr $bitmap, ( $node->{id} >> 3 );
219        };
220        substr( $bitmap, ( $node->{id} >> 3 ), 1 )=
221            chr( $current | ( $set << ( $node->{id} & 7 ) ) );
222
223        push @selected, $node->{name} if $set;
224    }
225    my $out_string= join ', ', @selected, 0;
226    $out_string =~ s/(.{1,70},) /$1\n    /g;
227
228    my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap;
229
230    return $comment . <<"EOP";
231#define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7)))
232
233#ifndef DOINIT
234EXTCONST U8 PL_${varname}\[] __attribute__deprecated__;
235#else
236EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = {
237    $out_string
238};
239#endif /* DOINIT */
240
241#ifndef DOINIT
242EXTCONST U8 PL_${varname}_bitmask[];
243#else
244EXTCONST U8 PL_${varname}_bitmask[] = {
245    $out_mask
246};
247#endif /* DOINIT */
248EOP
249}
250
251sub read_definition {
252    my ( $file )= @_;
253    my ( $seen_sep, $pod_comment )= "";
254    open my $in_fh, "<", $file
255        or die "Failed to open '$file' for reading: $!";
256    while (<$in_fh>) {
257
258        # Special pod comments
259        if (/^#\* ?/) { $pod_comment .= "# $'"; }
260
261        # Truly blank lines possibly surrounding pod comments
262        elsif (/^\s*$/) { $pod_comment .= "\n" }
263
264        next if /\A\s*#/ || /\A\s*\z/;
265
266        s/\s*\z//;
267        if (/^-+\s*$/) {
268            $seen_sep= 1;
269            next;
270        }
271
272        if ($seen_sep) {
273            parse_state_def( $_, $., $pod_comment );
274        }
275        else {
276            parse_opcode_def( $_, $., $pod_comment );
277        }
278        $pod_comment= "";
279    }
280    close $in_fh;
281    die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all,
282        " in file!"
283        if @all > 256;
284}
285
286# use fixed width to keep the diffs between regcomp.pl recompiles
287# as small as possible.
288my ( $width, $rwidth, $twidth )= ( 22, 12, 9 );
289
290sub print_state_defs {
291    my ($out)= @_;
292    printf $out <<EOP,
293/* Regops and State definitions */
294
295#define %*s\t%d
296#define %*s\t%d
297
298EOP
299        -$width,
300        REGNODE_MAX => $#ops,
301        -$width, REGMATCH_STATE_MAX => $#all;
302
303    my %rev_type_alias= reverse %type_alias;
304    for my $node (@ops) {
305        printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
306            -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment};
307        if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) {
308            printf $out "#define\t%*s\t%d\t/* %#04x %s */\n",
309                -$width, $alias, $node->{id}, $node->{id}, "type alias";
310        }
311    }
312
313    print $out "\t/* ------------ States ------------- */\n";
314    for my $node (@states) {
315        printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n",
316            -$width, $node->{name}, $node->{id} - $#ops, $node->{comment};
317    }
318}
319
320sub print_regkind {
321    my ($out)= @_;
322    print $out <<EOP;
323
324/* PL_regkind[] What type of regop or state is this. */
325
326#ifndef DOINIT
327EXTCONST U8 PL_regkind[];
328#else
329EXTCONST U8 PL_regkind[] = {
330EOP
331    use Data::Dumper;
332    foreach my $node (@all) {
333        print Dumper($node) if !defined $node->{type} or !defined( $node->{name} );
334        printf $out "\t%*s\t/* %*s */\n",
335            -1 - $twidth, "$node->{type},", -$width, $node->{name};
336        print $out "\t/* ------------ States ------------- */\n"
337            if $node->{id} == $#ops and $node->{id} != $#all;
338    }
339
340    print $out <<EOP;
341};
342#endif
343EOP
344}
345
346sub wrap_ifdef_print {
347    my $out= shift;
348    my $token= shift;
349    print $out <<EOP;
350
351#ifdef $token
352EOP
353    $_->($out) for @_;
354    print $out <<EOP;
355#endif /* $token */
356
357EOP
358}
359
360sub print_regarglen {
361    my ($out)= @_;
362    print $out <<EOP;
363
364/* regarglen[] - How large is the argument part of the node (in regnodes) */
365
366static const U8 regarglen[] = {
367EOP
368
369    foreach my $node (@ops) {
370        my $size= 0;
371        $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args};
372
373        printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name};
374    }
375
376    print $out <<EOP;
377};
378EOP
379}
380
381sub print_reg_off_by_arg {
382    my ($out)= @_;
383    print $out <<EOP;
384
385/* reg_off_by_arg[] - Which argument holds the offset to the next node */
386
387static const char reg_off_by_arg[] = {
388EOP
389
390    foreach my $node (@ops) {
391        my $size= $node->{longj} || 0;
392
393        printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name};
394    }
395
396    print $out <<EOP;
397};
398
399EOP
400}
401
402sub print_reg_name {
403    my ($out)= @_;
404    print $out <<EOP;
405
406/* reg_name[] - Opcode/state names in string form, for debugging */
407
408#ifndef DOINIT
409EXTCONST char * PL_reg_name[];
410#else
411EXTCONST char * const PL_reg_name[] = {
412EOP
413
414    my $ofs= 0;
415    my $sym= "";
416    foreach my $node (@all) {
417        my $size= $node->{longj} || 0;
418
419        printf $out "\t%*s\t/* $sym%#04x */\n",
420            -3 - $width, qq("$node->{name}",), $node->{id} - $ofs;
421        if ( $node->{id} == $#ops and @ops != @all ) {
422            print $out "\t/* ------------ States ------------- */\n";
423            $ofs= $#ops;
424            $sym= 'REGNODE_MAX +';
425        }
426    }
427
428    print $out <<EOP;
429};
430#endif /* DOINIT */
431
432EOP
433}
434
435sub print_reg_extflags_name {
436    my ($out)= @_;
437    print $out <<EOP;
438/* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */
439
440#ifndef DOINIT
441EXTCONST char * PL_reg_extflags_name[];
442#else
443EXTCONST char * const PL_reg_extflags_name[] = {
444EOP
445
446    my %rxfv;
447    my %definitions;    # Remember what the symbol definitions are
448    my $val= 0;
449    my %reverse;
450    my $REG_EXTFLAGS_NAME_SIZE= 0;
451    foreach my $file ( "op_reg_common.h", "regexp.h" ) {
452        open my $in_fh, "<", $file or die "Can't read '$file': $!";
453        while (<$in_fh>) {
454
455            # optional leading '_'.  Return symbol in $1, and strip it from
456            # comment of line
457            if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) {
458                chomp;
459                my $define= $1;
460                my $orig= $_;
461                s{ /\* .*? \*/ }{ }x;    # Replace comments by a blank
462
463                # Replace any prior defined symbols by their values
464                foreach my $key ( keys %definitions ) {
465                    s/\b$key\b/$definitions{$key}/g;
466                }
467
468                # Remove the U suffix from unsigned int literals
469                s/\b([0-9]+)U\b/$1/g;
470
471                my $newval= eval $_;     # Get numeric definition
472
473                $definitions{$define}= $newval;
474
475                next unless $_ =~ /<</;    # Bit defines use left shift
476                if ( $val & $newval ) {
477                    my @names= ( $define, $reverse{$newval} );
478                    s/PMf_// for @names;
479                    if ( $names[0] ne $names[1] ) {
480                        die sprintf
481                            "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)",
482                            $newval, $orig, $_;
483                    }
484                    next;
485                }
486                $val |= $newval;
487                $rxfv{$define}= $newval;
488                $reverse{$newval}= $define;
489            }
490        }
491    }
492    my %vrxf= reverse %rxfv;
493    printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N',
494        $val;
495    my %multibits;
496    for ( 0 .. 31 ) {
497        my $power_of_2= 2**$_;
498        my $n= $vrxf{$power_of_2};
499        my $extra= "";
500        if ( !$n ) {
501
502            # Here, there was no name that matched exactly the bit.  It could be
503            # either that it is unused, or the name matches multiple bits.
504            if ( !( $val & $power_of_2 ) ) {
505                $n= "UNUSED_BIT_$_";
506            }
507            else {
508
509                # Here, must be because it matches multiple bits.  Look through
510                # all possibilities until find one that matches this one.  Use
511                # that name, and all the bits it matches
512                foreach my $name ( keys %rxfv ) {
513                    if ( $rxfv{$name} & $power_of_2 ) {
514                        $n= $name . ( $multibits{$name}++ );
515                        $extra= sprintf qq{ : "%s" - 0x%08x}, $name,
516                            $rxfv{$name}
517                            if $power_of_2 != $rxfv{$name};
518                        last;
519                    }
520                }
521            }
522        }
523        s/\bRXf_(PMf_)?// for $n, $extra;
524        printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra;
525        $REG_EXTFLAGS_NAME_SIZE++;
526    }
527
528    print $out <<EOP;
529};
530#endif /* DOINIT */
531
532#ifdef DEBUGGING
533#  define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE
534#endif
535EOP
536
537}
538
539sub print_reg_intflags_name {
540    my ($out)= @_;
541    print $out <<EOP;
542
543/* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */
544
545#ifndef DOINIT
546EXTCONST char * PL_reg_intflags_name[];
547#else
548EXTCONST char * const PL_reg_intflags_name[] = {
549EOP
550
551    my %rxfv;
552    my %definitions;    # Remember what the symbol definitions are
553    my $val= 0;
554    my %reverse;
555    my $REG_INTFLAGS_NAME_SIZE= 0;
556    foreach my $file ("regcomp.h") {
557        open my $fh, "<", $file or die "Can't read $file: $!";
558        while (<$fh>) {
559
560            # optional leading '_'.  Return symbol in $1, and strip it from
561            # comment of line
562            if (
563                m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi
564                )
565            {
566                chomp;
567                my $define= $1;
568                my $abbr= $2;
569                my $hex= $3;
570                my $comment= $4;
571                my $val= hex($hex);
572                $comment= $comment ? " - $comment" : "";
573
574                printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",),
575                    $val, $define, $comment;
576                $REG_INTFLAGS_NAME_SIZE++;
577            }
578        }
579    }
580
581    print $out <<EOP;
582};
583#endif /* DOINIT */
584
585EOP
586    print $out <<EOQ;
587#ifdef DEBUGGING
588#  define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE
589#endif
590
591EOQ
592}
593
594sub print_process_flags {
595    my ($out)= @_;
596
597    print $out process_flags( 'V', 'varies', <<'EOC');
598/* The following have no fixed length. U8 so we can do strchr() on it. */
599EOC
600
601    print $out process_flags( 'S', 'simple', <<'EOC');
602
603/* The following always have a length of 1. U8 we can do strchr() on it. */
604/* (Note that length 1 means "one character" under UTF8, not "one octet".) */
605EOC
606
607}
608
609sub do_perldebguts {
610    my $guts= open_new( 'pod/perldebguts.pod', '>' );
611
612    my $node;
613    my $code;
614    my $name_fmt= '<' x  ( $longest_name_length - 1 );
615    my $descr_fmt= '<' x ( 58 - $longest_name_length );
616    eval <<EOD or die $@;
617format GuTS =
618 ^*~~
619 \$node->{pod_comment}
620 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~
621 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : ''
622.
6231;
624EOD
625
626    my $old_fh= select($guts);
627    $~= "GuTS";
628
629    open my $oldguts, '<', 'pod/perldebguts.pod'
630        or die "$0 cannot open pod/perldebguts.pod for reading: $!";
631    while (<$oldguts>) {
632        print;
633        last if /=for regcomp.pl begin/;
634    }
635
636    print <<'END_OF_DESCR';
637
638 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION
639END_OF_DESCR
640    for my $n (@ops) {
641        $node= $n;
642        $code= "$node->{code} " . ( $node->{args} || "" );
643        $code .= " $node->{longj}" if $node->{longj};
644        if ( $node->{pod_comment} ||= "" ) {
645
646            # Trim multiple blanks
647            $node->{pod_comment} =~ s/^\n\n+/\n/;
648            $node->{pod_comment} =~ s/\n\n+$/\n\n/;
649        }
650        write;
651    }
652    print "\n";
653
654    while (<$oldguts>) {
655        last if /=for regcomp.pl end/;
656    }
657    do { print } while <$oldguts>; #win32 can't unlink an open FH
658    close $oldguts or die "Error closing pod/perldebguts.pod: $!";
659    select $old_fh;
660    close_and_rename($guts);
661}
662
663read_definition("regcomp.sym");
664my $out= open_new( 'regnodes.h', '>',
665    { by => 'regen/regcomp.pl', from => 'regcomp.sym' } );
666print_state_defs($out);
667print_regkind($out);
668wrap_ifdef_print(
669    $out,
670    "REG_COMP_C",
671    \&print_regarglen,
672    \&print_reg_off_by_arg
673);
674print_reg_name($out);
675print_reg_extflags_name($out);
676print_reg_intflags_name($out);
677print_process_flags($out);
678read_only_bottom_close_and_rename($out);
679
680do_perldebguts();
681