1#!/usr/bin/perl -w 2# 3# 4# Regenerate (overwriting only if changed): 5# 6# pod/perldebguts.pod 7# regnodes.h 8# 9# from information stored in 10# 11# regcomp.sym 12# regexp.h 13# 14# pod/perldebguts.pod is not completely regenerated. Only the table of 15# regexp nodes is replaced; other parts remain unchanged. 16# 17# Accepts the standard regen_lib -q and -v args. 18# 19# This script is normally invoked from regen.pl. 20 21BEGIN { 22 # Get function prototypes 23 require 'regen/regen_lib.pl'; 24} 25use strict; 26 27# NOTE I don't think anyone actually knows what all of these properties mean, 28# and I suspect some of them are outright unused. This is a first attempt to 29# clean up the generation so maybe one day we can move to something more self 30# documenting. (One might argue that an array of hashes of properties would 31# be easier to use.) 32# 33# Why we use the term regnode and nodes, and not say, opcodes, I am not sure. 34 35# General thoughts: 36# 1. We use a single continuum to represent both opcodes and states, 37# and in regexec.c we switch on the combined set. 38# 2. Opcodes have more information associated to them, states are simpler, 39# basically just an identifier/number that can be used to switch within 40# the state machine. 41# 3. Some opcode are order dependent. 42# 4. Output files often use "tricks" to reduce diff effects. Some of what 43# we do below is more clumsy looking than it could be because of this. 44 45# Op/state properties: 46# 47# Property In Descr 48# ---------------------------------------------------------------------------- 49# name Both Name of op/state 50# id Both integer value for this opcode/state 51# optype Both Either 'op' or 'state' 52# line_num Both line_num number of the input file for this item. 53# type Op Type of node (aka regkind) 54# code Op what code is associated with this node (???) 55# args Op what type of args the node has (which regnode struct) 56# flags Op (???) 57# longj Op Whether this node is a longjump 58# comment Both Comment about node, if any 59# pod_comment Both Special comments for pod output (preceding lines in def) 60 61# Global State 62my @all; # all opcodes/state 63my %all; # hash of all opcode/state names 64 65my @ops; # array of just opcodes 66my @states; # array of just states 67 68my $longest_name_length= 0; # track lengths of names for nicer reports 69my (%type_alias); # map the type (??) 70 71# register a newly constructed node into our state tables. 72# ensures that we have no name collisions (on name anyway), 73# and issues the "id" for the node. 74sub register_node { 75 my ($node)= @_; 76 77 if ( $all{ $node->{name} } ) { 78 die "Duplicate item '$node->{name}' in regcomp.sym line $node->{line_num} " 79 . "previously defined on line $all{ $node->{name} }{line_num}\n"; 80 } elsif (!$node->{optype}) { 81 die "must have an optype in node ", Dumper($node); 82 } elsif ($node->{optype} eq "op") { 83 push @ops, $node; 84 } elsif ($node->{optype} eq "state") { 85 push @states, $node; 86 } else { 87 die "Uknown optype '$node->{optype}' in ", Dumper($node); 88 } 89 $node->{id}= 0 + @all; 90 push @all, $node; 91 $all{ $node->{name} }= $node; 92} 93 94# Parse and add an opcode definition to the global state. 95# An opcode definition looks like this: 96# 97# +- args 98# | +- flags 99# | | +- longjmp 100# Name Type code | | | ; comment 101# -------------------------------------------------------------------------- 102# IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches. 103# UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches. 104# SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. 105# IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceded by switcher. 106# GROUPP GROUPP, num 1 ; Whether the group matched. 107# 108# Not every opcode definition has all of these. We should maybe make this 109# nicer/easier to read in the future. Also note that the above is tab 110# sensitive. 111 112sub parse_opcode_def { 113 my ( $text, $line_num, $pod_comment )= @_; 114 my $node= { 115 line_num => $line_num, 116 pod_comment => $pod_comment, 117 optype => "op", 118 }; 119 120 # first split the line into three, the initial NAME, a middle part 121 # that we call "desc" which contains various (not well documented) things, 122 # and a comment section. 123 @{$node}{qw(name desc comment)}= /^(\S+)\s+([^\t]+?)\s*;\s*(.*)/ 124 or die "Failed to match $_"; 125 126 # the content of the "desc" field from the first step is extracted here: 127 @{$node}{qw(type code args flags longj)}= split /[,\s]\s*/, $node->{desc}; 128 129 $node->{$_} //= "" for qw(type code args flags longj); 130 131 register_node($node); # has to be before the type_alias code below 132 133 if ( !$all{ $node->{type} } and !$type_alias{ $node->{type} } ) { 134 135 #warn "Regop type '$node->{type}' from regcomp.sym line $line_num" 136 # ." is not an existing regop, and will be aliased to $node->{name}\n" 137 # if -t STDERR; 138 $type_alias{ $node->{type} }= $node->{name}; 139 } 140 141 $longest_name_length= length $node->{name} 142 if length $node->{name} > $longest_name_length; 143} 144 145# parse out a state definition and add the resulting data 146# into the global state. may create multiple new states from 147# a single definition (this is part of the point). 148# Format for states: 149# REGOP \t typelist [ \t typelist] 150# typelist= namelist 151# = namelist:FAIL 152# = name:count 153# Eg: 154# WHILEM A_pre,A_min,A_max,B_min,B_max:FAIL 155# BRANCH next:FAIL 156# CURLYM A,B:FAIL 157# 158# The CURLYM definition would create the states: 159# CURLYM_A, CURLYM_A_fail, CURLYM_B, CURLYM_B_fail 160sub parse_state_def { 161 my ( $text, $line_num, $pod_comment )= @_; 162 my ( $type, @lists )= split /\s+/, $text; 163 die "No list? $type" if !@lists; 164 foreach my $list (@lists) { 165 my ( $names, $special )= split /:/, $list, 2; 166 $special ||= ""; 167 foreach my $name ( split /,/, $names ) { 168 my $real= 169 $name eq 'resume' 170 ? "resume_$type" 171 : "${type}_$name"; 172 my @suffix; 173 if ( !$special ) { 174 @suffix= (""); 175 } 176 elsif ( $special =~ /\d/ ) { 177 @suffix= ( 1 .. $special ); 178 } 179 elsif ( $special eq 'FAIL' ) { 180 @suffix= ( "", "_fail" ); 181 } 182 else { 183 die "unknown :type ':$special'"; 184 } 185 foreach my $suffix (@suffix) { 186 my $node= { 187 name => "$real$suffix", 188 optype => "state", 189 type => $type || "", 190 comment => "state for $type", 191 line_num => $line_num, 192 }; 193 register_node($node); 194 } 195 } 196 } 197} 198 199sub process_flags { 200 my ( $flag, $varname, $comment )= @_; 201 $comment= '' unless defined $comment; 202 203 my @selected; 204 my $bitmap= ''; 205 for my $node (@ops) { 206 my $set= $node->{flags} && $node->{flags} eq $flag ? 1 : 0; 207 208 # Whilst I could do this with vec, I'd prefer to do longhand the arithmetic 209 # ops in the C code. 210 my $current= do { 211 no warnings; 212 ord substr $bitmap, ( $node->{id} >> 3 ); 213 }; 214 substr( $bitmap, ( $node->{id} >> 3 ), 1 )= 215 chr( $current | ( $set << ( $node->{id} & 7 ) ) ); 216 217 push @selected, $node->{name} if $set; 218 } 219 my $out_string= join ', ', @selected, 0; 220 $out_string =~ s/(.{1,70},) /$1\n /g; 221 222 my $out_mask= join ', ', map { sprintf "0x%02X", ord $_ } split '', $bitmap; 223 224 return $comment . <<"EOP"; 225#define REGNODE_\U$varname\E(node) (PL_${varname}_bitmask[(node) >> 3] & (1 << ((node) & 7))) 226 227#ifndef DOINIT 228EXTCONST U8 PL_${varname}\[] __attribute__deprecated__; 229#else 230EXTCONST U8 PL_${varname}\[] __attribute__deprecated__ = { 231 $out_string 232}; 233#endif /* DOINIT */ 234 235#ifndef DOINIT 236EXTCONST U8 PL_${varname}_bitmask[]; 237#else 238EXTCONST U8 PL_${varname}_bitmask[] = { 239 $out_mask 240}; 241#endif /* DOINIT */ 242EOP 243} 244 245sub read_definition { 246 my ( $file )= @_; 247 my ( $seen_sep, $pod_comment )= ""; 248 open my $in_fh, "<", $file 249 or die "Failed to open '$file' for reading: $!"; 250 while (<$in_fh>) { 251 252 # Special pod comments 253 if (/^#\* ?/) { $pod_comment .= "# $'"; } 254 255 # Truly blank lines possibly surrounding pod comments 256 elsif (/^\s*$/) { $pod_comment .= "\n" } 257 258 next if /\A\s*#/ || /\A\s*\z/; 259 260 s/\s*\z//; 261 if (/^-+\s*$/) { 262 $seen_sep= 1; 263 next; 264 } 265 266 if ($seen_sep) { 267 parse_state_def( $_, $., $pod_comment ); 268 } 269 else { 270 parse_opcode_def( $_, $., $pod_comment ); 271 } 272 $pod_comment= ""; 273 } 274 close $in_fh; 275 die "Too many regexp/state opcodes! Maximum is 256, but there are ", 0 + @all, 276 " in file!" 277 if @all > 256; 278} 279 280# use fixed width to keep the diffs between regcomp.pl recompiles 281# as small as possible. 282my ( $width, $rwidth, $twidth )= ( 22, 12, 9 ); 283 284sub print_state_defs { 285 my ($out)= @_; 286 printf $out <<EOP, 287/* Regops and State definitions */ 288 289#define %*s\t%d 290#define %*s\t%d 291 292EOP 293 -$width, 294 REGNODE_MAX => $#ops, 295 -$width, REGMATCH_STATE_MAX => $#all; 296 297 my %rev_type_alias= reverse %type_alias; 298 for my $node (@ops) { 299 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", 300 -$width, $node->{name}, $node->{id}, $node->{id}, $node->{comment}; 301 if ( defined( my $alias= $rev_type_alias{ $node->{name} } ) ) { 302 printf $out "#define\t%*s\t%d\t/* %#04x %s */\n", 303 -$width, $alias, $node->{id}, $node->{id}, "type alias"; 304 } 305 } 306 307 print $out "\t/* ------------ States ------------- */\n"; 308 for my $node (@states) { 309 printf $out "#define\t%*s\t(REGNODE_MAX + %d)\t/* %s */\n", 310 -$width, $node->{name}, $node->{id} - $#ops, $node->{comment}; 311 } 312} 313 314sub print_regkind { 315 my ($out)= @_; 316 print $out <<EOP; 317 318/* PL_regkind[] What type of regop or state is this. */ 319 320#ifndef DOINIT 321EXTCONST U8 PL_regkind[]; 322#else 323EXTCONST U8 PL_regkind[] = { 324EOP 325 use Data::Dumper; 326 foreach my $node (@all) { 327 print Dumper($node) if !defined $node->{type} or !defined( $node->{name} ); 328 printf $out "\t%*s\t/* %*s */\n", 329 -1 - $twidth, "$node->{type},", -$width, $node->{name}; 330 print $out "\t/* ------------ States ------------- */\n" 331 if $node->{id} == $#ops and $node->{id} != $#all; 332 } 333 334 print $out <<EOP; 335}; 336#endif 337EOP 338} 339 340sub wrap_ifdef_print { 341 my $out= shift; 342 my $token= shift; 343 print $out <<EOP; 344 345#ifdef $token 346EOP 347 $_->($out) for @_; 348 print $out <<EOP; 349#endif /* $token */ 350 351EOP 352} 353 354sub print_regarglen { 355 my ($out)= @_; 356 print $out <<EOP; 357 358/* regarglen[] - How large is the argument part of the node (in regnodes) */ 359 360static const U8 regarglen[] = { 361EOP 362 363 foreach my $node (@ops) { 364 my $size= 0; 365 $size= "EXTRA_SIZE(struct regnode_$node->{args})" if $node->{args}; 366 367 printf $out "\t%*s\t/* %*s */\n", -37, "$size,", -$rwidth, $node->{name}; 368 } 369 370 print $out <<EOP; 371}; 372EOP 373} 374 375sub print_reg_off_by_arg { 376 my ($out)= @_; 377 print $out <<EOP; 378 379/* reg_off_by_arg[] - Which argument holds the offset to the next node */ 380 381static const char reg_off_by_arg[] = { 382EOP 383 384 foreach my $node (@ops) { 385 my $size= $node->{longj} || 0; 386 387 printf $out "\t%d,\t/* %*s */\n", $size, -$rwidth, $node->{name}; 388 } 389 390 print $out <<EOP; 391}; 392 393EOP 394} 395 396sub print_reg_name { 397 my ($out)= @_; 398 print $out <<EOP; 399 400/* reg_name[] - Opcode/state names in string form, for debugging */ 401 402#ifndef DOINIT 403EXTCONST char * PL_reg_name[]; 404#else 405EXTCONST char * const PL_reg_name[] = { 406EOP 407 408 my $ofs= 0; 409 my $sym= ""; 410 foreach my $node (@all) { 411 my $size= $node->{longj} || 0; 412 413 printf $out "\t%*s\t/* $sym%#04x */\n", 414 -3 - $width, qq("$node->{name}",), $node->{id} - $ofs; 415 if ( $node->{id} == $#ops and @ops != @all ) { 416 print $out "\t/* ------------ States ------------- */\n"; 417 $ofs= $#ops; 418 $sym= 'REGNODE_MAX +'; 419 } 420 } 421 422 print $out <<EOP; 423}; 424#endif /* DOINIT */ 425 426EOP 427} 428 429sub print_reg_extflags_name { 430 my ($out)= @_; 431 print $out <<EOP; 432/* PL_reg_extflags_name[] - Opcode/state names in string form, for debugging */ 433 434#ifndef DOINIT 435EXTCONST char * PL_reg_extflags_name[]; 436#else 437EXTCONST char * const PL_reg_extflags_name[] = { 438EOP 439 440 my %rxfv; 441 my %definitions; # Remember what the symbol definitions are 442 my $val= 0; 443 my %reverse; 444 my $REG_EXTFLAGS_NAME_SIZE= 0; 445 foreach my $file ( "op_reg_common.h", "regexp.h" ) { 446 open my $in_fh, "<", $file or die "Can't read '$file': $!"; 447 while (<$in_fh>) { 448 449 # optional leading '_'. Return symbol in $1, and strip it from 450 # comment of line 451 if (s/^ \# \s* define \s+ ( _? RXf_ \w+ ) \s+ //xi) { 452 chomp; 453 my $define= $1; 454 my $orig= $_; 455 s{ /\* .*? \*/ }{ }x; # Replace comments by a blank 456 457 # Replace any prior defined symbols by their values 458 foreach my $key ( keys %definitions ) { 459 s/\b$key\b/$definitions{$key}/g; 460 } 461 462 # Remove the U suffix from unsigned int literals 463 s/\b([0-9]+)U\b/$1/g; 464 465 my $newval= eval $_; # Get numeric definition 466 467 $definitions{$define}= $newval; 468 469 next unless $_ =~ /<</; # Bit defines use left shift 470 if ( $val & $newval ) { 471 my @names= ( $define, $reverse{$newval} ); 472 s/PMf_// for @names; 473 if ( $names[0] ne $names[1] ) { 474 die sprintf 475 "ERROR: both $define and $reverse{$newval} use 0x%08X (%s:%s)", 476 $newval, $orig, $_; 477 } 478 next; 479 } 480 $val |= $newval; 481 $rxfv{$define}= $newval; 482 $reverse{$newval}= $define; 483 } 484 } 485 } 486 my %vrxf= reverse %rxfv; 487 printf $out "\t/* Bits in extflags defined: %s */\n", unpack 'B*', pack 'N', 488 $val; 489 my %multibits; 490 for ( 0 .. 31 ) { 491 my $power_of_2= 2**$_; 492 my $n= $vrxf{$power_of_2}; 493 my $extra= ""; 494 if ( !$n ) { 495 496 # Here, there was no name that matched exactly the bit. It could be 497 # either that it is unused, or the name matches multiple bits. 498 if ( !( $val & $power_of_2 ) ) { 499 $n= "UNUSED_BIT_$_"; 500 } 501 else { 502 503 # Here, must be because it matches multiple bits. Look through 504 # all possibilities until find one that matches this one. Use 505 # that name, and all the bits it matches 506 foreach my $name ( keys %rxfv ) { 507 if ( $rxfv{$name} & $power_of_2 ) { 508 $n= $name . ( $multibits{$name}++ ); 509 $extra= sprintf qq{ : "%s" - 0x%08x}, $name, 510 $rxfv{$name} 511 if $power_of_2 != $rxfv{$name}; 512 last; 513 } 514 } 515 } 516 } 517 s/\bRXf_(PMf_)?// for $n, $extra; 518 printf $out qq(\t%-20s/* 0x%08x%s */\n), qq("$n",), $power_of_2, $extra; 519 $REG_EXTFLAGS_NAME_SIZE++; 520 } 521 522 print $out <<EOP; 523}; 524#endif /* DOINIT */ 525 526#ifdef DEBUGGING 527# define REG_EXTFLAGS_NAME_SIZE $REG_EXTFLAGS_NAME_SIZE 528#endif 529EOP 530 531} 532 533sub print_reg_intflags_name { 534 my ($out)= @_; 535 print $out <<EOP; 536 537/* PL_reg_intflags_name[] - Opcode/state names in string form, for debugging */ 538 539#ifndef DOINIT 540EXTCONST char * PL_reg_intflags_name[]; 541#else 542EXTCONST char * const PL_reg_intflags_name[] = { 543EOP 544 545 my %rxfv; 546 my %definitions; # Remember what the symbol definitions are 547 my $val= 0; 548 my %reverse; 549 my $REG_INTFLAGS_NAME_SIZE= 0; 550 foreach my $file ("regcomp.h") { 551 open my $fh, "<", $file or die "Can't read $file: $!"; 552 while (<$fh>) { 553 554 # optional leading '_'. Return symbol in $1, and strip it from 555 # comment of line 556 if ( 557 m/^ \# \s* define \s+ ( PREGf_ ( \w+ ) ) \s+ 0x([0-9a-f]+)(?:\s*\/\*(.*)\*\/)?/xi 558 ) 559 { 560 chomp; 561 my $define= $1; 562 my $abbr= $2; 563 my $hex= $3; 564 my $comment= $4; 565 my $val= hex($hex); 566 $comment= $comment ? " - $comment" : ""; 567 568 printf $out qq(\t%-30s/* 0x%08x - %s%s */\n), qq("$abbr",), 569 $val, $define, $comment; 570 $REG_INTFLAGS_NAME_SIZE++; 571 } 572 } 573 } 574 575 print $out <<EOP; 576}; 577#endif /* DOINIT */ 578 579EOP 580 print $out <<EOQ; 581#ifdef DEBUGGING 582# define REG_INTFLAGS_NAME_SIZE $REG_INTFLAGS_NAME_SIZE 583#endif 584 585EOQ 586} 587 588sub print_process_flags { 589 my ($out)= @_; 590 591 print $out process_flags( 'V', 'varies', <<'EOC'); 592/* The following have no fixed length. U8 so we can do strchr() on it. */ 593EOC 594 595 print $out process_flags( 'S', 'simple', <<'EOC'); 596 597/* The following always have a length of 1. U8 we can do strchr() on it. */ 598/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ 599EOC 600 601} 602 603sub do_perldebguts { 604 my $guts= open_new( 'pod/perldebguts.pod', '>' ); 605 606 my $node; 607 my $code; 608 my $name_fmt= '<' x ( $longest_name_length - 1 ); 609 my $descr_fmt= '<' x ( 58 - $longest_name_length ); 610 eval <<EOD or die $@; 611format GuTS = 612 ^*~~ 613 \$node->{pod_comment} 614 ^$name_fmt ^<<<<<<<<< ^$descr_fmt~~ 615 \$node->{name}, \$code, \$node->{comment}//'' 616. 6171; 618EOD 619 620 my $old_fh= select($guts); 621 $~= "GuTS"; 622 623 open my $oldguts, "pod/perldebguts.pod" 624 or die "$0 cannot open pod/perldebguts.pod for reading: $!"; 625 while (<$oldguts>) { 626 print; 627 last if /=for regcomp.pl begin/; 628 } 629 630 print <<'END_OF_DESCR'; 631 632 # TYPE arg-description [num-args] [longjump-len] DESCRIPTION 633END_OF_DESCR 634 for my $n (@ops) { 635 $node= $n; 636 $code= "$node->{code} " . ( $node->{args} || "" ); 637 $code .= " $node->{longj}" if $node->{longj}; 638 if ( $node->{pod_comment} ||= "" ) { 639 640 # Trim multiple blanks 641 $node->{pod_comment} =~ s/^\n\n+/\n/; 642 $node->{pod_comment} =~ s/\n\n+$/\n\n/; 643 } 644 write; 645 } 646 print "\n"; 647 648 while (<$oldguts>) { 649 last if /=for regcomp.pl end/; 650 } 651 do { print } while <$oldguts>; 652 select $old_fh; 653 close_and_rename($guts); 654} 655 656read_definition("regcomp.sym"); 657my $out= open_new( 'regnodes.h', '>', 658 { by => 'regen/regcomp.pl', from => 'regcomp.sym' } ); 659print_state_defs($out); 660print_regkind($out); 661wrap_ifdef_print( 662 $out, 663 "REG_COMP_C", 664 \&print_regarglen, 665 \&print_reg_off_by_arg 666); 667print_reg_name($out); 668print_reg_extflags_name($out); 669print_reg_intflags_name($out); 670print_process_flags($out); 671read_only_bottom_close_and_rename($out); 672 673do_perldebguts(); 674