1#!/usr/bin/perl -w 2# 3# 4# Regenerate (overwriting only if changed): 5# 6# pod/perldebguts.pod 7# regnodes.h 8# 9# from information stored in 10# 11# regcomp.sym 12# regexp.h 13# 14# pod/perldebguts.pod is not completely regenerated. Only the table of 15# regexp nodes is replaced; other parts remain unchanged. 16# 17# Accepts the standard regen_lib -q and -v args. 18# 19# This script is normally invoked from regen.pl. 20 21BEGIN { 22 # Get function prototypes 23 require './regen/regen_lib.pl'; 24} 25use strict; 26 27# NOTE I don't think anyone actually knows what all of these properties mean, 28# and I suspect some of them are outright unused. This is a first attempt to 29# clean up the generation so maybe one day we can move to something more self 30# documenting. (One might argue that an array of hashes of properties would 31# be easier to use.) 32# 33# Why we use the term regnode and nodes, and not say, opcodes, I am not sure. 34 35# General thoughts: 36# 1. We use a single continuum to represent both opcodes and states, 37# and in regexec.c we switch on the combined set. 38# 2. Opcodes have more information associated to them, states are simpler, 39# basically just an identifier/number that can be used to switch within 40# the state machine. 41# 3. Some opcode are order dependent. 42# 4. Output files often use "tricks" to reduce diff effects. Some of what 43# we do below is more clumsy looking than it could be because of this. 44 45# Op/state properties: 46# 47# Property In Descr 48# ---------------------------------------------------------------------------- 49# name Both Name of op/state 50# id Both integer value for this opcode/state 51# optype Both Either 'op' or 'state' 52# line_num Both line_num number of the input file for this item. 53# type Op Type of node (aka regkind) 54# code Op what code is associated with this node (???) 55# args Op what type of args the node has (which regnode struct) 56# flags Op (???) 57# longj Op Boolean as to if this node is a longjump 58# comment Both Comment about node, if any 59# pod_comment Both Special comments for pod output (preceding lines in def) 60 61# Global State 62my @all; # all opcodes/state 63my %all; # hash of all opcode/state names 64 65my @ops; # array of just opcodes 66my @states; # array of just states 67 68my $longest_name_length= 0; # track lengths of names for nicer reports 69my (%type_alias); # map the type (??) 70 71# register a newly constructed node into our state tables. 72# ensures that we have no name collisions (on name anyway), 73# and issues the "id" for the node. 74sub register_node { 75 my ($node)= @_; 76 77 if ( $all{ $node->{name} } ) { 78 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} " 79 . "previously defined on line $all{ $node->{name} }{line_num}\n"; 80 } elsif (!$node->{optype}) { 81 die "must have an optype in node ", Dumper($node); 82 } elsif ($node->{optype} eq "op") { 83 push @ops, $node; 84 } elsif ($node->{optype} eq "state") { 85 push @states, $node; 86 } else { 87 die "Uknown optype '$node->{optype}' in ", Dumper($node); 88 } 89 $node->{id}= 0 + @all; 90 push @all, $node; 91 $all{ $node->{name} }= $node; 92 93 if ($node->{longj} && $node->{longj} != 1) { 94 die "longj field must be in [01] if present in ", Dumper($node); 95 } 96 97} 98 99# Parse and add an opcode definition to the global state. 100# An opcode definition looks like this: 101# 102# +- args 103# | +- flags 104# | | +- longjmp 105# Name Type code | | | ; comment 106# -------------------------------------------------------------------------- 107# IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches. 108# UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches. 109# SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. 110# IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher. 111# GROUPP GROUPP, num 1 ; Whether the group matched. 112# 113# Not every opcode definition has all of these. We should maybe make this 114# nicer/easier to read in the future. Also note that the above is tab 115# sensitive. 116 117sub parse_opcode_def { 118 my ( $text, $line_num, $pod_comment )= @_; 119 my $node= { 120 line_num => $line_num, 121 pod_comment => $pod_comment, 122 optype => "op", 123 }; 124 125 # first split the line into three, the initial NAME, a middle part 126 # that we call "desc" which contains various (not well documented) things, 127 # and a comment section. 128 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/ 129 or die "Failed to match $_"; 130 131 # the content of the "desc" field from the first step is extracted here: 132 @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc}; 133 134 defined $node->{$_} or $node->{$_} = "" 135 for qw(type code args flags longj); 136 137 register_node($node); # has to be before the type_alias code below 138 139 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) { 140 141 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num" 142 # ." is not an existing regop, and will be aliased to $node->{name}\n" 143 # if -t STDERR; 144 $type_alias{ $node->{type} }= $node->{name}; 145 } 146 147 $longest_name_length= length $node->{name} 148 if length $node->{name} > $longest_name_length; 149} 150 151# parse out a state definition and add the resulting data 152# into the global state. may create multiple new states from 153# a single definition (this is part of the point). 154# Format for states: 155# REGOP \t typelist [ \t typelist] 156# typelist= namelist 157# = namelist:FAIL 158# = name:count 159# Eg: 160# WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL 161# BRANCH next:FAIL 162# CURLYM A,B:FAIL 163# 164# The CURLYM definition would create the states: 165# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail 166sub parse_state_def { 167 my ( $text, $line_num, $pod_comment )= @_; 168 my ( $type, @lists )= split /\s+/, $text; 169 die "No list? $type" if !@lists; 170 foreach my $list (@lists) { 171 my ( $names, $special )= split /:/, $list, 2; 172 $special ||= ""; 173 foreach my $name ( split /,/, $names ) { 174 my $real= 175 $name eq 'resume' 176 ? "resume_$type" 177 : "${type}_$name"; 178 my @suffix; 179 if ( !$special ) { 180 @suffix= (""); 181 } 182 elsif ( $special =~ /\d/ ) { 183 @suffix= ( 1 .. $special ); 184 } 185 elsif ( $special eq 'FAIL' ) { 186 @suffix= ( "", "_fail" ); 187 } 188 else { 189 die "unknown :type ':$special'"; 190 } 191 foreach my $suffix (@suffix) { 192 my $node= { 193 name => "$real$suffix", 194 optype => "state", 195 type => $type || "", 196 comment => "state for $type", 197 line_num => $line_num, 198 }; 199 register_node($node); 200 } 201 } 202 } 203} 204 205sub process_flags { 206 my ( $flag, $varname, $comment )= @_; 207 $comment= '' unless defined $comment; 208 209 my @selected; 210 my $bitmap= ''; 211 for my $node (@ops) { 212 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0; 213 214 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic 215 # ops in the C code. 216 my $current= do { 217 no warnings; 218 ord substr $bitmap, ( $node->{id} >> 3 ); 219 }; 220 substr( $bitmap, ( $node->{id} >> 3 ), 1 )= 221 chr( $current | ( $set << ( $node->{id} & 7 ) ) ); 222 223 push @selected, $node->{name} if $set; 224 } 225 my $out_string= join ', ', @selected, 0; 226 $out_string =~ s/(.{1,70},) /$1\n /g; 227 228 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap; 229 230 return $comment . <<"EOP"; 231#define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7))) 232 233#ifndef DOINIT 234EXTCONST U8 PL_${varname}\[] __attribute__deprecated__; 235#else 236EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = { 237 $out_string 238}; 239#endif /* DOINIT */ 240 241#ifndef DOINIT 242EXTCONST U8 PL_${varname}_bitmask[]; 243#else 244EXTCONST U8 PL_${varname}_bitmask[] = { 245 $out_mask 246}; 247#endif /* DOINIT */ 248EOP 249} 250 251sub read_definition { 252 my ( $file )= @_; 253 my ( $seen_sep, $pod_comment )= ""; 254 open my $in_fh, "<", $file 255 or die "Failed to open '$file' for reading: $!"; 256 while (<$in_fh>) { 257 258 # Special pod comments 259 if (/^#\* ?/) { $pod_comment .= "# $'"; } 260 261 # Truly blank lines possibly surrounding pod comments 262 elsif (/^\s*$/) { $pod_comment .= "\n" } 263 264 next if /\A\s*#/ || /\A\s*\z/; 265 266 s/\s*\z//; 267 if (/^-+\s*$/) { 268 $seen_sep= 1; 269 next; 270 } 271 272 if ($seen_sep) { 273 parse_state_def( $_, $., $pod_comment ); 274 } 275 else { 276 parse_opcode_def( $_, $., $pod_comment ); 277 } 278 $pod_comment= ""; 279 } 280 close $in_fh; 281 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all, 282 " in file!" 283 if @all > 256; 284} 285 286# use fixed width to keep the diffs between regcomp.pl recompiles 287# as small as possible. 288my ( $width, $rwidth, $twidth )= ( 22, 12, 9 ); 289 290sub print_state_defs { 291 my ($out)= @_; 292 printf $out <<EOP, 293/* Regops and State definitions */ 294 295#define %*s\t%d 296#define %*s\t%d 297 298EOP 299 -$width, 300 REGNODE_MAX => $#ops, 301 -$width, REGMATCH_STATE_MAX => $#all; 302 303 my %rev_type_alias= reverse %type_alias; 304 for my $node (@ops) { 305 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", 306 -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment}; 307 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) { 308 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", 309 -$width, $alias, $node->{id}, $node->{id}, "type alias"; 310 } 311 } 312 313 print $out "\t/* ------------ States ------------- */\n"; 314 for my $node (@states) { 315 printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n", 316 -$width, $node->{name}, $node->{id} - $#ops, $node->{comment}; 317 } 318} 319 320sub print_regkind { 321 my ($out)= @_; 322 print $out <<EOP; 323 324/* PL_regkind[] What type of regop or state is this. */ 325 326#ifndef DOINIT 327EXTCONST U8 PL_regkind[]; 328#else 329EXTCONST U8 PL_regkind[] = { 330EOP 331 use Data::Dumper; 332 foreach my $node (@all) { 333 print Dumper($node) if !defined $node->{type} or !defined( $node->{name} ); 334 printf $out "\t%*s\t/* %*s */\n", 335 -1 - $twidth, "$node->{type},", -$width, $node->{name}; 336 print $out "\t/* ------------ States ------------- */\n" 337 if $node->{id} == $#ops and $node->{id} != $#all; 338 } 339 340 print $out <<EOP; 341}; 342#endif 343EOP 344} 345 346sub wrap_ifdef_print { 347 my $out= shift; 348 my $token= shift; 349 print $out <<EOP; 350 351#ifdef $token 352EOP 353 $_->($out) for @_; 354 print $out <<EOP; 355#endif /* $token */ 356 357EOP 358} 359 360sub print_regarglen { 361 my ($out)= @_; 362 print $out <<EOP; 363 364/* regarglen[] - How large is the argument part of the node (in regnodes) */ 365 366static const U8 regarglen[] = { 367EOP 368 369 foreach my $node (@ops) { 370 my $size= 0; 371 $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args}; 372 373 printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name}; 374 } 375 376 print $out <<EOP; 377}; 378EOP 379} 380 381sub print_reg_off_by_arg { 382 my ($out)= @_; 383 print $out <<EOP; 384 385/* reg_off_by_arg[] - Which argument holds the offset to the next node */ 386 387static const char reg_off_by_arg[] = { 388EOP 389 390 foreach my $node (@ops) { 391 my $size= $node->{longj} || 0; 392 393 printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name}; 394 } 395 396 print $out <<EOP; 397}; 398 399EOP 400} 401 402sub print_reg_name { 403 my ($out)= @_; 404 print $out <<EOP; 405 406/* reg_name[] - Opcode/state names in string form, for debugging */ 407 408#ifndef DOINIT 409EXTCONST char * PL_reg_name[]; 410#else 411EXTCONST char * const PL_reg_name[] = { 412EOP 413 414 my $ofs= 0; 415 my $sym= ""; 416 foreach my $node (@all) { 417 my $size= $node->{longj} || 0; 418 419 printf $out "\t%*s\t/* $sym%#04x */\n", 420 -3 - $width, qq("$node->{name}",), $node->{id} - $ofs; 421 if ( $node->{id} == $#ops and @ops != @all ) { 422 print $out "\t/* ------------ States ------------- */\n"; 423 $ofs= $#ops; 424 $sym= 'REGNODE_MAX +'; 425 } 426 } 427 428 print $out <<EOP; 429}; 430#endif /* DOINIT */ 431 432EOP 433} 434 435sub print_reg_extflags_name { 436 my ($out)= @_; 437 print $out <<EOP; 438/* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */ 439 440#ifndef DOINIT 441EXTCONST char * PL_reg_extflags_name[]; 442#else 443EXTCONST char * const PL_reg_extflags_name[] = { 444EOP 445 446 my %rxfv; 447 my %definitions; # Remember what the symbol definitions are 448 my $val= 0; 449 my %reverse; 450 my $REG_EXTFLAGS_NAME_SIZE= 0; 451 foreach my $file ( "op_reg_common.h", "regexp.h" ) { 452 open my $in_fh, "<", $file or die "Can't read '$file': $!"; 453 while (<$in_fh>) { 454 455 # optional leading '_'. Return symbol in $1, and strip it from 456 # comment of line 457 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) { 458 chomp; 459 my $define= $1; 460 my $orig= $_; 461 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank 462 463 # Replace any prior defined symbols by their values 464 foreach my $key ( keys %definitions ) { 465 s/\b$key\b/$definitions{$key}/g; 466 } 467 468 # Remove the U suffix from unsigned int literals 469 s/\b([0-9]+)U\b/$1/g; 470 471 my $newval= eval $_; # Get numeric definition 472 473 $definitions{$define}= $newval; 474 475 next unless $_ =~ /<</; # Bit defines use left shift 476 if ( $val & $newval ) { 477 my @names= ( $define, $reverse{$newval} ); 478 s/PMf_// for @names; 479 if ( $names[0] ne $names[1] ) { 480 die sprintf 481 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)", 482 $newval, $orig, $_; 483 } 484 next; 485 } 486 $val |= $newval; 487 $rxfv{$define}= $newval; 488 $reverse{$newval}= $define; 489 } 490 } 491 } 492 my %vrxf= reverse %rxfv; 493 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N', 494 $val; 495 my %multibits; 496 for ( 0 .. 31 ) { 497 my $power_of_2= 2**$_; 498 my $n= $vrxf{$power_of_2}; 499 my $extra= ""; 500 if ( !$n ) { 501 502 # Here, there was no name that matched exactly the bit. It could be 503 # either that it is unused, or the name matches multiple bits. 504 if ( !( $val & $power_of_2 ) ) { 505 $n= "UNUSED_BIT_$_"; 506 } 507 else { 508 509 # Here, must be because it matches multiple bits. Look through 510 # all possibilities until find one that matches this one. Use 511 # that name, and all the bits it matches 512 foreach my $name ( keys %rxfv ) { 513 if ( $rxfv{$name} & $power_of_2 ) { 514 $n= $name . ( $multibits{$name}++ ); 515 $extra= sprintf qq{ : "%s" - 0x%08x}, $name, 516 $rxfv{$name} 517 if $power_of_2 != $rxfv{$name}; 518 last; 519 } 520 } 521 } 522 } 523 s/\bRXf_(PMf_)?// for $n, $extra; 524 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra; 525 $REG_EXTFLAGS_NAME_SIZE++; 526 } 527 528 print $out <<EOP; 529}; 530#endif /* DOINIT */ 531 532#ifdef DEBUGGING 533# define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE 534#endif 535EOP 536 537} 538 539sub print_reg_intflags_name { 540 my ($out)= @_; 541 print $out <<EOP; 542 543/* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */ 544 545#ifndef DOINIT 546EXTCONST char * PL_reg_intflags_name[]; 547#else 548EXTCONST char * const PL_reg_intflags_name[] = { 549EOP 550 551 my %rxfv; 552 my %definitions; # Remember what the symbol definitions are 553 my $val= 0; 554 my %reverse; 555 my $REG_INTFLAGS_NAME_SIZE= 0; 556 foreach my $file ("regcomp.h") { 557 open my $fh, "<", $file or die "Can't read $file: $!"; 558 while (<$fh>) { 559 560 # optional leading '_'. Return symbol in $1, and strip it from 561 # comment of line 562 if ( 563 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi 564 ) 565 { 566 chomp; 567 my $define= $1; 568 my $abbr= $2; 569 my $hex= $3; 570 my $comment= $4; 571 my $val= hex($hex); 572 $comment= $comment ? " - $comment" : ""; 573 574 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",), 575 $val, $define, $comment; 576 $REG_INTFLAGS_NAME_SIZE++; 577 } 578 } 579 } 580 581 print $out <<EOP; 582}; 583#endif /* DOINIT */ 584 585EOP 586 print $out <<EOQ; 587#ifdef DEBUGGING 588# define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE 589#endif 590 591EOQ 592} 593 594sub print_process_flags { 595 my ($out)= @_; 596 597 print $out process_flags( 'V', 'varies', <<'EOC'); 598/* The following have no fixed length. U8 so we can do strchr() on it. */ 599EOC 600 601 print $out process_flags( 'S', 'simple', <<'EOC'); 602 603/* The following always have a length of 1. U8 we can do strchr() on it. */ 604/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ 605EOC 606 607} 608 609sub do_perldebguts { 610 my $guts= open_new( 'pod/perldebguts.pod', '>' ); 611 612 my $node; 613 my $code; 614 my $name_fmt= '<' x ( $longest_name_length - 1 ); 615 my $descr_fmt= '<' x ( 58 - $longest_name_length ); 616 eval <<EOD or die $@; 617format GuTS = 618 ^*~~ 619 \$node->{pod_comment} 620 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~ 621 \$node->{name}, \$code, defined \$node->{comment} ? \$node->{comment} : '' 622. 6231; 624EOD 625 626 my $old_fh= select($guts); 627 $~= "GuTS"; 628 629 open my $oldguts, '<', 'pod/perldebguts.pod' 630 or die "$0 cannot open pod/perldebguts.pod for reading: $!"; 631 while (<$oldguts>) { 632 print; 633 last if /=for regcomp.pl begin/; 634 } 635 636 print <<'END_OF_DESCR'; 637 638 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION 639END_OF_DESCR 640 for my $n (@ops) { 641 $node= $n; 642 $code= "$node->{code} " . ( $node->{args} || "" ); 643 $code .= " $node->{longj}" if $node->{longj}; 644 if ( $node->{pod_comment} ||= "" ) { 645 646 # Trim multiple blanks 647 $node->{pod_comment} =~ s/^\n\n+/\n/; 648 $node->{pod_comment} =~ s/\n\n+$/\n\n/; 649 } 650 write; 651 } 652 print "\n"; 653 654 while (<$oldguts>) { 655 last if /=for regcomp.pl end/; 656 } 657 do { print } while <$oldguts>; #win32 can't unlink an open FH 658 close $oldguts or die "Error closing pod/perldebguts.pod: $!"; 659 select $old_fh; 660 close_and_rename($guts); 661} 662 663read_definition("regcomp.sym"); 664my $out= open_new( 'regnodes.h', '>', 665 { by => 'regen/regcomp.pl', from => 'regcomp.sym' } ); 666print_state_defs($out); 667print_regkind($out); 668wrap_ifdef_print( 669 $out, 670 "REG_COMP_C", 671 \&print_regarglen, 672 \&print_reg_off_by_arg 673); 674print_reg_name($out); 675print_reg_extflags_name($out); 676print_reg_intflags_name($out); 677print_process_flags($out); 678read_only_bottom_close_and_rename($out); 679 680do_perldebguts(); 681