1package HeaderParser; 2use strict; 3use warnings; 4 5BEGIN { 6 if ($] < 5.010) { 7 die <<EOFMSG; 8The regen code in the perl repo needs at least version 5.10 to run. 9You are currently using $], we recommend you install a more modern 10version of perl before you try to build this version of perl. 11EOFMSG 12 } 13} 14 15# these are required below in BEGIN statements, we cant have a 16# hard dependency on them as they might not be available when 17# we run as part of autodoc.pl 18# 19# use Data::Dumper; 20# use Storable qw(dclone); 21# 22use Carp qw(confess); 23use Text::Tabs qw(expand unexpand); 24use Text::Wrap qw(wrap); 25 26# The style of this file is determined by: 27# 28# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \ 29# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \ 30# -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2 31 32my ( 33 %unop, # unary operators and their precedence 34 %binop, # binary operators and their precedence 35 %is_right_assoc, # operators which are right associative 36 %precedence, # precedence of all operators. 37 %associative, # associative operators 38 %commutative, # commutative operators 39 %cmpop, # comparison operators 40 $unop_pat, # pattern to match unary operators 41 $binop_pat, # pattern to match binary operators 42 %op_names, # map of op to description, used in error messages 43 $tokenize_pat # a pattern which can tokenize an expression 44); 45 46BEGIN { 47 # this is initialization for the operator precedence expression parser 48 # we use for handling preprocessor conditions. 49 %op_names= ( 50 '==' => 'equality', 51 '!=' => 'inequality', 52 '<<' => 'bit-shift-left', 53 '>>' => 'bit-shift-right', 54 '+' => 'addition', 55 '-' => 'subtraction', 56 '*' => 'multiplication', 57 '/' => 'division', 58 '%' => 'modulo', 59 '||' => 'logical-or', # Lowest precedence 60 '&&' => 'logical-and', 61 '|' => 'binary-or', 62 '^' => 'binary-xor', 63 '&' => 'binary-and', 64 '<' => 'less-than', # split on spaces, all with equal precedence 65 '>' => 'greater-than', 66 '<=' => 'less-than-or-equal', 67 '>=' => 'greater-than-or-equal', 68 ); 69 my @cmpop= ( 70 '== !=', # listed in lowest to highest precedence 71 '< > <= >=', # split on spaces, all with equal precedence 72 ); 73 my @binop= ( 74 '||', # Lowest precedence 75 '&&', 76 '|', 77 '^', 78 '&', 79 @cmpop, # include the numerical comparison operators. 80 '<< >>', 81 '+ -', 82 '* / %', # highest prcedence operators. 83 ); 84 85 my @unop= qw( ! ~ + - ); 86 %unop= map { $_ => 1 } @unop; 87 %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop; 88 %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop; 89 90 my $make_pat= sub { 91 my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b } 92 map quotemeta($_), @_; 93 return qr/$pat/; 94 }; 95 $unop_pat= $make_pat->(@unop); 96 foreach my $ix (0 .. $#binop) { 97 my $sym= $binop[$ix]; 98 $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym; 99 } 100 $is_right_assoc{"?"}= 1; 101 $is_right_assoc{":"}= 1; 102 $precedence{"?"}= 1; 103 $precedence{":"}= 0; 104 105 $associative{$_}++ 106 for qw( || && + *); # we leave '==' out so we don't reorder terms 107 $commutative{$_}++ for qw( || && + *); 108 109 $binop_pat= $make_pat->(keys %precedence); 110 $tokenize_pat= qr/ 111 ^(?: 112 (?<comment> \/\*.*?\*\/ ) 113 | (?<ws> \s+ ) 114 | (?<term> 115 (?<literal> 116 (?<define> defined\(\w+\) ) 117 | (?<func> \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) ) 118 | (?<const> (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') ) 119 | (?<sym> \w+ ) 120 ) 121 | (?<op> $binop_pat | $unop_pat ) 122 | (?<paren> [\(\)] ) 123 ) 124 ) 125 /xs; 126} 127 128# dump the arguments with dump. wraps loading Dumper 129# as we are executed by miniperl where Dumper isnt available 130sub dd { 131 my $self= shift; 132 local $self->{orig_content}; 133 my $ret= "(dump not available)"; 134 eval { 135 require Data::Dumper; 136 $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump(); 137 }; 138 return $ret; 139} 140 141my $has_storable; 142 143# same story here, in miniperl we use slow perl code, 144# in real perl we can use Storable and speed things up. 145BEGIN { eval "use Storable; \$has_storable=1;" } 146 147# recursively copy an AoAoA... 148sub copy_aoa { 149 my ($aoa)= @_; 150 if ($has_storable) { 151 return Storable::dclone($aoa); 152 } 153 else { 154 return _copy_aoa($aoa); 155 } 156} 157 158sub _copy_aoa { 159 my ($thing)= @_; 160 if (ref $thing) { 161 return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ]; 162 } 163 else { 164 return $thing; 165 } 166} 167 168# return the number characters that should go in between a '#' and 169# the name of a c preprocessor directive. Returns 0 spaces for level 170# 0, and 2 * ($level - 1) + 1 spaces for the rest. (1,3,5, etc) 171# This might sound weird, but consider these are tab *stops* and the 172# '#' is included in the total. which means indents of 2, 4, 6 etc. 173sub indent_chars { 174 my ($self, $level)= @_; 175 my $ind= ""; 176 $ind .= " " if $level; 177 $ind .= " " x ($level - 1) if $level > 1; 178 return $ind; 179} 180 181# we use OO to store state, etc. 182sub new { 183 my ($class, %args)= @_; 184 $args{add_commented_expr_after} //= 10; 185 $args{max_width} //= 78; 186 $args{min_break_width} //= 70; 187 return bless \%args,; 188} 189 190# this parses the expression into an array of tokens 191# this is somewhat crude, we could do this incrementally 192# if we wanted and avoid the overhead. but it makes it 193# easier to debug the tokenizer. 194sub _tokenize_expr { 195 my ($self, $expr)= @_; 196 delete $self->{tokens}; 197 delete $self->{parse_tree}; 198 $self->{original_expr}= $expr; 199 200 my @tokens; 201 while ($expr =~ s/$tokenize_pat//xs) { 202 push @tokens, {%+} if defined $+{'term'}; 203 } 204 $self->{tokens}= \@tokens; 205 warn $self->dd($self) if $self->{debug}; 206 if (length $expr) { 207 confess "Failed to tokenize_expr: $expr\n"; 208 } 209 return \@tokens; 210} 211 212sub _count_ops { 213 my ($self, $term)= @_; 214 my $count= 0; 215 $count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx; 216 return $count; 217} 218 219# sort terms in an expression in a way that puts things 220# in a sensible order. Anything starting with PERL_IN_ 221# should be on the left in alphabetical order. Digits 222# should be on the right (eg 0), and ties are resolved 223# by stripping non-alpha-numerc, thus removing underbar 224# parens, spaces, logical operators, etc, and then by 225# lc comparison of the result. 226sub _sort_terms { 227 my $self= shift; 228 my (@terms)= map { 229 [ 230 $_, # 0: raw 231 lc($_) =~ s/[^a-zA-Z0-9]//gr, # 1: "_" stripped and caseless 232 $_ =~ m/PERL_IN_/ ? 1 : 0, # 2: PERL_IN_ labeled define 233 $_ =~ m/^\d/ ? 1 : 0, # 3: digit 234 $_ =~ m/DEBUGGING/ ? 1 : 0, # 4: DEBUGGING? 235 $self->_count_ops($_), # 5: Number of ops (||, &&) 236 ] 237 } @_; 238 my %seen; 239 #start-no-tidy 240 @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] } 241 sort { 242 $a->[5] <=> $b->[5] || # least number of ops 243 $b->[2] <=> $a->[2] || # PERL_IN before others 244 $a->[3] <=> $b->[3] || # digits after others 245 $a->[4] <=> $b->[4] || # DEBUGGING after all else 246 $a->[1] cmp $b->[1] || # stripped caseless cmp 247 lc($a->[0]) cmp lc($b->[0]) || # caseless cmp 248 $a->[0] cmp $b->[0] || # exact cmp 249 0 250 } @terms; 251 #end-no-tidy 252 return @terms; 253} 254 255# normalize a condition expression by parsing it and then stringifying 256# the parse tree. 257sub tidy_cond { 258 my ($self, $expr)= @_; 259 my $ret= $self->{_tidy_cond_cache}{$expr} //= do { 260 $self->parse_expr($expr) if defined $expr; 261 my $text= $self->_pt_as_str(); 262 $text; 263 }; 264 $self->{last_expr}= $ret; 265 return $ret; 266} 267 268# convert a parse tree structure to a string recursively. 269# 270# Parse trees are currently made up of arrays, with the count 271# of items in the object determining the type of op it represents. 272# 1 argument: literal value of some sort. 273# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree 274# : ternary: 0 slot holds '?', 1 is an array holding three 275# parse trees: cond, true, false 276# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees 277# : note, this is multigate for commutative operators like 278# : "+", "*", "&&" and "||", so an expr 279# : like "A && B && !C" would be represented as: 280# : [ "&&", ["A"], ["B"], [ "!",["C"] ] ] 281# 282sub _pt_as_str { 283 my ($self, $node, $parent_op, $depth)= @_; 284 285 $node ||= $self->{parse_tree} 286 or confess "No parse tree?"; 287 $depth ||= 0; 288 if (@$node == 1) { 289 290 # its a literal 291 return $node->[0]; 292 } 293 elsif (@$node == 2) { 294 295 # is this a ternary or an unop? 296 if ($node->[0] eq '?') { 297 298 # ternary, the three "parts" are tucked away in 299 # an array in the payload slot 300 my $expr= 301 $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? " 302 . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : " 303 . $self->_pt_as_str($node->[1][2], "?", $depth + 1); 304 305 # stick parens on if this is a subexpression 306 $expr= "( " . $expr . " )" if $depth; 307 return $expr; 308 } 309 else { 310 if ( $node->[0] eq "!" 311 and @{ $node->[1] } == 2 312 and $node->[1][0] eq "!") 313 { 314 # normalize away !! in expressions. 315 return $self->_pt_as_str($node->[1][1], $parent_op, $depth); 316 } 317 318 # unop - the payload is a optree 319 return $node->[0] 320 . $self->_pt_as_str($node->[1], $node->[0], $depth + 1); 321 } 322 } 323 324 # if we get here we are dealing with a binary operator 325 # the nodes are not necessarily binary, as we "collect" 326 # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D] 327 my ($op, @terms)= @$node; 328 329 # convert the terms to strings 330 @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms; 331 332 # sort them to normalize the subexpression 333 my $expr= 334 join " $op ", $associative{$op} 335 ? $self->_sort_terms(@terms) 336 : @terms; 337 338 # stick parens on if this is a subexpression 339 $expr= "( " . $expr . " )" if $depth and !$cmpop{$op}; 340 341 # and we are done. 342 return $expr; 343} 344 345# Returns the precedence of an operator, returns 0 if there is no token 346# or the next token is not an op, or confesss if it encounters an op it does not 347# know. 348sub _precedence { 349 my $self= shift; 350 my $token= shift // return 0; 351 352 my $op= (ref $token ? $token->{op} : $token) // return 0; 353 354 return $precedence{$op} // confess "Unknown op '$op'"; 355} 356 357# entry point into parsing the tokens, checks that we actually parsed everything 358# and didnt leave anything in the token stream (possible from a malformed expression) 359# Performs some minor textual cleanups using regexes, but then does a proper parse 360# of the expression. 361sub parse_expr { 362 my ($self, $expr)= @_; 363 if (defined $expr) { 364 $expr =~ s/\s*\\\n\s*/ /g; 365 $expr =~ s/defined\s+(\w+)/defined($1)/g; 366 $self->_tokenize_expr($expr); 367 } 368 my $ret= $self->_parse_expr(); 369 if (@{ $self->{tokens} }) { 370 371 # if all was well with parsing we should not get here. 372 confess "Unparsed tokens: ", $self->dd($self->{tokens}); 373 } 374 $self->{parse_tree}= $ret; 375 return $ret; 376} 377 378# this is just a wrapper around _parse_expr_assoc() which handles 379# parsing an arbitrary expression. 380sub _parse_expr { 381 my ($self)= @_; 382 return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1); 383} 384 385# This handles extracting from the token stream 386# - simple literals 387# - unops (assumed to be right associative) 388# - parens (which reset the precedence acceptable to the parser) 389# 390sub _parse_expr_primary { 391 my ($self)= @_; 392 my $tokens= $self->{tokens} 393 or confess "No tokens in _parse_expr_primary?"; 394 my $first= $tokens->[0] 395 or confess "No primary?"; 396 if ($first->{paren} and $first->{paren} eq "(") { 397 shift @$tokens; 398 my $expr= $self->_parse_expr(); 399 $first= $tokens->[0]; 400 if (!$first->{paren} or $first->{paren} ne ")") { 401 confess "Expecting close paren", $self->dd($tokens); 402 } 403 shift @$tokens; 404 return $expr; 405 } 406 elsif ($first->{op} and $unop{ $first->{op} }) { 407 my $op_token= shift @$tokens; 408 return [ $op_token->{op}, $self->_parse_expr_primary() ]; 409 } 410 elsif (defined $first->{literal}) { 411 shift @$tokens; 412 return [ $first->{literal} ]; 413 } 414 else { 415 die sprintf 416 "Unexpected token '%s', expecting literal, unary, or expression.\n", 417 $first->{term}; 418 } 419} 420 421# This is the heart of the expression parser. It uses 422# a pair of nested loops to avoid excessive recursion during parsing, 423# which should be a bit faster than other strategies. It only should 424# recurse when the precedence level changes. 425sub _parse_expr_assoc { 426 my ($self, $lhs, $min_precedence)= @_; 427 my $tokens= $self->{tokens} 428 or confess "No tokens in _parse_expr_assoc"; 429 my $la= $tokens->[0]; # lookahead 430 my $la_pr= $self->_precedence($la); # lookahead precedence 431 while ($la && $la_pr >= $min_precedence) { 432 my $op_token= shift @$tokens; 433 my $op_pr= $la_pr; # op precedence 434 if ($op_token->{op} eq "?") { 435 my $mid= $self->_parse_expr(); 436 if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") { 437 shift @$tokens; 438 my $tail= $self->_parse_expr(); 439 return [ '?', [ $lhs, $mid, $tail ] ]; 440 } 441 confess "Panic: expecting ':'", $self->dd($tokens); 442 } 443 my $rhs; 444 eval { $rhs= $self->_parse_expr_primary(); } 445 or die "Error in $op_names{$op_token->{op}} expression: $@"; 446 $la= $tokens->[0]; 447 $la_pr= $self->_precedence($la); 448 while ( 449 $la_pr > $op_pr || # any and larger 450 ( $is_right_assoc{ $op_token->{op} } 451 and $la_pr == $op_pr) # right and equal 452 ) { 453 my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0); 454 $rhs= $self->_parse_expr_assoc($rhs, $new_precedence); 455 $la= $tokens->[0]; 456 $la_pr= $self->_precedence($la); 457 } 458 if ( @$lhs >= 3 459 && $lhs->[0] eq $op_token->{op} 460 && $commutative{ $op_token->{op} }) 461 { 462 push @$lhs, $rhs; 463 } 464 else { 465 my @lt= ($lhs); 466 my @rt= ($rhs); 467 468 # if we have '( a && b ) && ( c && d)' 469 # turn it into 'a && b && c && d' 470 if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) { 471 (undef,@lt)= @$lhs; # throw away op. 472 } 473 if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) { 474 (undef,@rt)= @$rhs; # throw away op. 475 } 476 $lhs= [ $op_token->{op}, @lt, @rt ]; 477 } 478 } 479 return $lhs; 480} 481 482#entry point for normalizing and if/elif statements 483#returns the line and condition in normalized form. 484sub normalize_if_elif { 485 my ($self, $line, $line_info)= @_; 486 if (my $dat= $self->{cache_normalize_if_elif}{$line}) { 487 return $dat->{line}, $dat->{cond}; 488 } 489 my ($cond); 490 eval { 491 ($line, $cond)= $self->_normalize_if_elif($line); 492 1; 493 } or die sprintf "Error at line %d\nLine %d: %s\n%s", 494 ($line_info->start_line_num()) x 2, $line, $@; 495 $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond }; 496 return ($line, $cond); 497} 498 499#guts of the normalize_if_elif() - cleans up the line, extracts 500#the condition, and then tidies it with tidy_cond(). 501sub _normalize_if_elif { 502 my ($self, $line)= @_; 503 my $nl= ""; 504 $nl= $1 if $line =~ s/(\n+)\z//; 505 $line =~ s/\s+\z//; 506 my @comment; 507 push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!; 508 $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g; 509 $line =~ s/!\s+defined/!defined/g; 510 511 if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) { 512 my $if= $1; 513 my $not= $2 ? "!" : ""; 514 $line= "#$if ${not}defined($3)"; 515 } 516 $line =~ s/#((?:el)?if)\s+// 517 or confess "Bad cond: $line"; 518 my $if= $1; 519 $line =~ s/!\s+/!/g; 520 521 my $old_cond= $line; 522 my $cond= $self->tidy_cond($old_cond); 523 524 warn "cond - $old_cond\ncond + $cond\n" 525 if $old_cond ne $cond and $self->{debug}; 526 527 $line= "#$if $cond"; 528 $line .= " " . join " ", reverse @comment if @comment; 529 530 $line .= $nl; 531 return ($line, $cond); 532} 533 534# parses a text buffer as though it was a file on disk 535# calls parse_fh() 536sub parse_text { 537 my ($self, $text)= @_; 538 local $self->{parse_source}= "(buffer)"; 539 open my $fh, "<", \$text 540 or die "Failed to open buffer for read: $!"; 541 return $self->parse_fh($fh); 542} 543 544# takes a readable filehandle and parses whatever contents is 545# returned by reading it. Returns an array of HeaderLine objects. 546# this is the main routing for parsing a header file. 547sub parse_fh { 548 my ($self, $fh)= @_; 549 my @lines; 550 my @cond; 551 my @cond_line; 552 my $last_cond; 553 local $self->{parse_source}= $self->{parse_source} || "(unknown)"; 554 my $cb= $self->{pre_process_content}; 555 $self->{orig_content}= ""; 556 my $line_num= 1; 557 558 while (defined(my $line= readline($fh))) { 559 my $start_line_num= $line_num++; 560 $self->{orig_content} .= $line; 561 while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) { 562 defined(my $read_line= readline($fh)) 563 or last; 564 $self->{orig_content} .= $read_line; 565 $line_num++; 566 $line .= $read_line; 567 } 568 while ($line =~ m!/\*(.*?)(\*/|\z)!gs) { 569 my ($inner, $tail)= ($1, $2); 570 if ($tail ne "*/") { 571 confess 572 "Unterminated comment starting at line $start_line_num\n"; 573 } 574 elsif ($inner =~ m!/\*!) { 575 confess 576 "Nested/broken comment starting at line $start_line_num\n"; 577 } 578 } 579 580 my $raw= $line; 581 my $type= "content"; 582 my $sub_type= "text"; 583 my $level= @cond; 584 my $do_pop= 0; 585 my $flat= $line; 586 $flat =~ s/\s*\\\n\s*/ /g; 587 $flat =~ s!/\*.*?\*/! !gs; 588 $flat =~ s/\s+/ /g; 589 $flat =~ s/\s+\z//; 590 $flat =~ s/^\s*#\s*/#/g; 591 592 my $line_info= 593 HeaderLine->new(start_line_num => $start_line_num, raw => $raw); 594 my $do_cond_line; 595 if ($flat =~ /^#/) { 596 if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) { 597 my $if= $1; 598 my $not= $2 ? "!" : ""; 599 my $sym= $3; 600 $flat =~ 601 s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/; 602 } 603 my $cond; # used in various expressions below 604 if ($flat =~ /^#endif/) { 605 if (!@cond) { 606 confess "Not expecting $flat"; 607 } 608 $do_pop= 1; 609 $level--; 610 $type= "cond"; 611 $sub_type= "#endif"; 612 } 613 elsif ($flat =~ /^#if\b/) { 614 ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); 615 push @cond, [$cond]; 616 push @cond_line, $line_info; 617 $type= "cond"; 618 $sub_type= "#if"; 619 } 620 elsif ($flat =~ /^#elif\b/) { 621 if (!@cond) { 622 confess "No if for $flat"; 623 } 624 $level--; 625 ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); 626 $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); 627 $cond_line[-1]= $line_info; 628 push @{ $cond[-1] }, $cond; 629 $type= "cond"; 630 $sub_type= "#elif"; 631 } 632 elsif ($flat =~ /^#else\b/) { 633 if (!@cond) { 634 confess "No if for $flat"; 635 } 636 $level--; 637 $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); 638 $cond_line[-1]= $line_info; 639 $type= "cond"; 640 $sub_type= "#else"; 641 } 642 elsif ($flat =~ /#undef/) { 643 $type= "content"; 644 $sub_type= "#undef"; 645 } 646 elsif ($flat =~ /#pragma\b/) { 647 $type= "content"; 648 $sub_type= "#pragma"; 649 } 650 elsif ($flat =~ /#include\b/) { 651 $type= "content"; 652 $sub_type= "#include"; 653 } 654 elsif ($flat =~ /#define\b/) { 655 $type= "content"; 656 $sub_type= "#define"; 657 } 658 elsif ($flat =~ /#error\b/) { 659 $type= "content"; 660 $sub_type= "#error"; 661 } 662 else { 663 confess "Do not know what to do with $line"; 664 } 665 if ($type eq "cond") { 666 667 # normalize conditional lines 668 $line= $flat; 669 $last_cond= $line_info; 670 } 671 } 672 $line =~ s/\n?\z/\n/; 673 674 %$line_info= ( 675 cond => copy_aoa(\@cond), 676 type => $type, 677 sub_type => $sub_type, 678 raw => $raw, 679 flat => $flat, 680 line => $line, 681 level => $level, 682 source => $self->{parse_source}, 683 start_line_num => $start_line_num, 684 n_lines => $line_num - $start_line_num, 685 ); 686 687 push @lines, $line_info; 688 if ($do_pop) { 689 $line_info->{inner_lines}= 690 $line_info->start_line_num - $cond_line[-1]->start_line_num; 691 pop @cond; 692 pop @cond_line; 693 } 694 if ($type eq "content" and $cb) { 695 $cb->($self, $lines[-1]); 696 } 697 } 698 if (@cond_line) { 699 my $msg= "Unterminated conditional block starting line " 700 . $cond_line[-1]->start_line_num(); 701 $msg .= 702 " with last conditional operation at line " 703 . $last_cond->start_line_num() 704 if $cond_line[-1] != $last_cond; 705 confess $msg; 706 } 707 $self->{lines}= \@lines; 708 return \@lines; 709} 710 711# returns the last lines we parsed. 712sub lines { $_[0]->{lines} } 713 714# assuming a line looks like an embed.fnc entry parse it 715# and normalize it, and create and EmbedLine object from it. 716sub tidy_embed_fnc_entry { 717 my ($self, $line_data)= @_; 718 my $line= $line_data->{line}; 719 return $line if $line =~ /^\s*:/; 720 return $line unless $line_data->{type} eq "content"; 721 return $line unless $line =~ /\|/; 722 723 $line =~ s/\s*\\\n/ /g; 724 $line =~ s/\s+\z//; 725 ($line)= expand($line); 726 my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line; 727 my %flag_seen; 728 $flags= join "", grep !$flag_seen{$_}++, sort split //, $flags; 729 if ($flags =~ s/^#//) { 730 $flags .= "#"; 731 } 732 if ($flags eq "#") { 733 die "Not allowed to use only '#' for flags" 734 . "in 'embed.fnc' at line $line_data->{start_line_num}"; 735 } 736 if (!$flags) { 737 die "Missing flags in function definition" 738 . " in 'embed.fnc' at line $line_data->{start_line_num}\n" 739 . "Did you a forget a line continuation on the previous line?\n"; 740 } 741 for ($ret, @args) { 742 s/(\w)\*/$1 */g; 743 s/\*\s+(\w)/*$1/g; 744 s/\*const/* const/g; 745 } 746 my $head= sprintf "%-8s|%-7s", $flags, $ret; 747 $head .= sprintf "|%*s", -(31 - length($head)), $name; 748 if (@args and length($head) > 32) { 749 $head .= "\\\n"; 750 $head .= " " x 32; 751 } 752 foreach my $ix (0 .. $#args) { 753 my $arg= $args[$ix]; 754 $head .= "|$arg"; 755 $head .= "\\\n" . (" " x 32) if $ix < $#args; 756 } 757 $line= $head . "\n"; 758 759 if ($line =~ /\\\n/) { 760 my @lines= split /\s*\\\n/, $line; 761 my $len= length($lines[0]); 762 $len < length($_) and $len= length($_) for @lines; 763 $len= int(($len + 7) / 8) * 8; 764 $len= 72 if $len < 72; 765 $line= join("\\\n", 766 (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]), 767 $lines[-1]); 768 } 769 ($line)= unexpand($line); 770 771 $line_data->{embed}= EmbedLine->new( 772 flags => $flags, 773 return_type => $ret, 774 name => $name, 775 args => \@args, 776 ); 777 $line =~ s/\s+\z/\n/; 778 $line_data->{line}= $line; 779 return $line; 780} 781 782# line up the text in a multiline string by a given $fragment 783# of text, inserting whitespace in front or behind the $fragment 784# to get the text to line up. Returns the text. This is wrapped 785# by line_up() and is used to wrap long conditions and comments 786# in the generated code. 787sub _line_up_frag { 788 my ($self, $str, $fragment)= @_; 789 die "has tabs?!" if $str =~ /\t/; 790 my @lines= split /\n/, $str; 791 my $changed= 1; 792 while ($changed) { 793 $changed= 0; 794 foreach my $ix (0 .. $#lines - 1) { 795 my $f_index= 0; 796 my $n_index= 0; 797 while (1) { 798 $f_index= index($lines[$ix], $fragment, $f_index); 799 $n_index= index($lines[ $ix + 1 ], $fragment, $n_index); 800 if ($f_index == -1 or $n_index == -1) { 801 last; 802 } 803 if ($f_index < $n_index) { 804 my $f_idx= $f_index; 805 $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " "; 806 substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index)); 807 $changed++; 808 last; 809 } 810 elsif ($n_index < $f_index) { 811 my $n_idx= $n_index; 812 $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " "; 813 substr($lines[ $ix + 1 ], 814 $n_idx, 0, " " x ($f_index - $n_index)); 815 $changed++; 816 last; 817 } 818 $f_index++; 819 $n_index++; 820 } 821 } 822 } 823 my $ret= join "", map { "$_\n" } @lines; 824 return $ret; 825} 826 827sub _fixup_indent { 828 my ($self, $line)= @_; 829 my @lines= split /\n/, $line; 830 if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) { 831 my $first_left_len = length $1; 832 833 while (1) { 834 my $ok = 1; 835 for (@lines) { 836 /^.{$first_left_len} / 837 or do { $ok = 0; last; }; 838 } 839 if ($ok) { 840 s/^(.{$first_left_len}) /$1/ for @lines; 841 } else { 842 last; 843 } 844 } 845 } 846 847 if ($lines[0]=~/^(#\s*\w+\s+)\(/) { 848 my $len = length($1); 849 for my $idx (1..$#lines) { 850 $lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/; 851 } 852 } 853 my $ret= join "", map { "$_\n" } @lines; 854 return $ret; 855} 856 857# this is the workhorse for _break_line_at_op(). 858sub __break_line_at_op { 859 my ($self, $limit, $line, $blank_prefix)= @_; 860 my @lines= (""); 861 while (length $line) { 862 my $part; 863 if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) { 864 $part= $1; 865 } 866 else { 867 $part= $line; 868 $line= ""; 869 } 870 if (length($lines[-1]) + length($part) < $limit) { 871 $lines[-1] .= $part; 872 } 873 else { 874 push @lines, $blank_prefix . $part; 875 } 876 } 877 return \@lines; 878} 879 880# Break a condition line into parts, while trying to keep the last 881# token on each line being an operator like || or && or ? or : We try 882# to keep each line at $limit characters, however, we also try to 883# ensure that each line has the same number of operators on it such 884# that across all the lines there are only two counts of operators (eg, 885# we either way each line to have two operators on it, or 0, or 1 or 0, 886# or 2 or 1, and so on.) If we cannot meet this requirement we reduce 887# the limit by 1 and try again, until we meet the objective, or the 888# limit ends up at 70 chars or less. 889sub _break_line_at_op { 890 my ($self, $limit, $line, $blank_prefix)= @_; 891 my $lines; 892 while (1) { 893 $lines= $self->__break_line_at_op($limit, $line, $blank_prefix); 894 my %op_counts; 895 foreach my $line_idx (0 .. $#$lines) { 896 my $line= $lines->[$line_idx]; 897 my $count= 0; 898 $count++ while $line =~ /(\|\||&&|\?|:)/g; 899 $op_counts{$count}++; 900 901 } 902 if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) { 903 last; 904 } 905 $limit--; 906 } 907 908 s/\s*\z/\n/ for @$lines; 909 return join "", @$lines; 910} 911 912sub _max { # cant use Scalar::Util so we roll our own 913 my $max= shift; 914 $max < $_ and $max= $_ for @_; 915 return $max; 916} 917 918# take a condition, split into $type and $rest 919# wrap it, and try to line up operators and defined() functions 920# that it contains. This is rather horrible code, but it does a 921# reasonable job applying the heuristics we need to lay our the 922# conditions in a reasonable way. 923sub _wrap_and_line_up_cond { 924 my ($self, $type, $rest)= @_; 925 926 my $limit= $self->{max_width}; 927 928 # extract the expression part of the line, and normalize it, we do 929 # this here even though it might be duplicative as it is possible 930 # that the caller code has munged the expression in some way, and we 931 # might want to simplify the expression first. Eg: 932 # 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into 933 # 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible. 934 my $rest_head= ""; 935 my $rest_tail= ""; 936 if ($rest =~ s!(if\s+)!!) { 937 $rest_head= $1; 938 } 939 if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) { 940 $rest_tail= $1; 941 } 942 if ($rest) { 943 $rest= $self->tidy_cond($rest); 944 $rest= $rest_head . $rest . $rest_tail; 945 } 946 947 my $l= length($type); 948 my $line= $type; 949 $line .= $rest if length($rest); 950 my $blank_prefix= " " x $l; 951 952 # at this point we have a single line with the entire expression on it 953 # if it fits on one line we are done, we can return it right away. 954 if (length($line) <= $limit) { 955 $line =~ s/\s*\z/\n/; 956 return $line; 957 } 958 my $rest_copy= $rest; 959 my @fragments; 960 my $op_pat= qr/(?:\|\||&&|[?:])/; 961 962 # does the $rest contain a parenthesized group? If it does then 963 # there are a mixture of different ops being used, as if it was all 964 # the same opcode there would not be a parenthesized group. 965 # If it does then we handle it differently, and try to put the 966 # different parts of the expression on their own line. 967 if ($rest_copy =~ /$op_pat\s*\(/) { 968 my @parts; 969 while (length $rest_copy) { 970 if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) { 971 push @parts, $1; 972 } else { 973 #$rest_copy=~s/^\s+//; 974 push @parts, $rest_copy; 975 last; 976 } 977 } 978 $parts[0]= $type . $parts[0]; 979 $parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts; 980 foreach my $line (@parts) { 981 if (length($line) > $limit) { 982 $line= $self->_break_line_at_op($limit, $line, $blank_prefix); 983 } 984 } 985 s/\s*\z/\n/ for @parts; 986 $line= join "", @parts; 987 @fragments= ("defined", "||"); 988 } 989 else { 990 # the expression consists of just one opcode type, so we can use 991 # simpler logic to break it apart with the objective of ensuring 992 # that the lines are similarly formed with trailing operators on 993 # each line but the last. 994 @fragments= ("||", "defined"); 995 $line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix); 996 } 997 998 # try to line up the text on different lines. We stop after 999 # the first $fragment that modifies the text. The order 1000 # of fragments we try is determined above based on the type 1001 # of condition this is. 1002 my $pre_line= $line; 1003 foreach my $fragment (@fragments) { 1004 $line= $self->_line_up_frag($line, $fragment); 1005 last if $line ne $pre_line; 1006 } 1007 1008 # if we have lined up by "defined" in _line_up_frag() 1009 # then we may have " || defined(...)" type expressions 1010 # convert these to " || defined(...)" as it looks better. 1011 $line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g; 1012 $line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g; 1013 1014 # add back the line continuations. this is all pretty inefficient, 1015 # but it works nicely. 1016 my @lines= split /\n/, $line; 1017 my $last= pop @lines; 1018 my $max_len= _max(map { length $_ } @lines); 1019 $_= sprintf "%*s \\\n", -$max_len, $_ for @lines; 1020 $last .= "\n"; 1021 1022 $line= join "", @lines, $last; 1023 1024 # remove line continuations that are inside of a comment, 1025 # we may have a variable number of lines of the expression 1026 # or parts of lines of the expression in a comment, so 1027 # we do this last. 1028 $line =~ s!/\* (.*) \*/ 1029 !"/*"._strip_line_cont("$1")."*/"!xsge; 1030 1031 return $self->_fixup_indent($line); 1032} 1033 1034#remove line continuations from the argument. 1035sub _strip_line_cont { 1036 my ($string)= @_; 1037 $string =~ s/\s*\\\n/\n/g; 1038 return $string; 1039} 1040 1041# Takes an array of HeaderLines objects produced by parse_fh() 1042# or by group_content(), and turn it into a string. 1043sub lines_as_str { 1044 my ($self, $lines, $post_process_content)= @_; 1045 $lines ||= $self->{lines}; 1046 my $ret; 1047 $post_process_content ||= $self->{post_process_content}; 1048 my $filter= $self->{filter_content}; 1049 my $last_line= ""; 1050 1051 #warn $self->dd($lines); 1052 foreach my $line_data (@$lines) { 1053 my $line= $line_data->{line}; 1054 if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text") 1055 { 1056 my $level= $line_data->{level}; 1057 my $ind= $self->indent_chars($level); 1058 $line =~ s/^#(\s*)/#$ind/; 1059 } 1060 if ($line_data->{type} eq "cond") { 1061 my $add_commented_expr_after= $self->{add_commented_expr_after}; 1062 if ($line_data->{sub_type} =~ /#(?:else|endif)/) { 1063 my $joined= join " && ", 1064 map { "($_)" } @{ $line_data->{cond}[-1] }; 1065 my $cond_txt= $self->tidy_cond($joined); 1066 $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else"; 1067 $line =~ s!\s*\z! /* $cond_txt */\n! 1068 if $line_data->{inner_lines} >= $add_commented_expr_after; 1069 } 1070 elsif ($line_data->{sub_type} eq "#elif") { 1071 my $last_frame= $line_data->{cond}[-1]; 1072 my $joined= join " && ", 1073 map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ]; 1074 my $cond_txt= $self->tidy_cond($joined); 1075 $line =~ s!\s*\z! /* && $cond_txt */\n! 1076 if $line_data->{inner_lines} >= $add_commented_expr_after; 1077 } 1078 } 1079 $line =~ s/\s*\z/\n/; 1080 if ($last_line eq "\n" and $line eq "\n") { 1081 next; 1082 } 1083 $last_line= $line; 1084 if ($line_data->{type} eq "cond") { 1085 $line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z! 1086 or die "Failed to split cond line: $line"; 1087 my ($type, $cond, $comment)= ($1, $2, $3); 1088 $comment //= ""; 1089 $cond //= ""; 1090 my $new_line; 1091 if (!length($cond) and $comment) { 1092 $comment =~ s!^(/\*\s+)!! 1093 and $type .= $1; 1094 } 1095 1096 $line= $self->_wrap_and_line_up_cond($type, $cond . $comment); 1097 } 1098 $line_data->{line}= $line; 1099 if ($post_process_content and $line_data->{type} eq "content") { 1100 $post_process_content->($self, $line_data); 1101 } 1102 if ($filter and $line_data->{type} eq "content") { 1103 $filter->($self, $line_data) or next; 1104 } 1105 $ret .= $line_data->{line}; 1106 } 1107 return $ret; 1108} 1109 1110# Text::Wrap::wrap has an odd api, so hide it behind a wrapper 1111# sub which sets things up properly. 1112sub _my_wrap { 1113 my ($head, $rest, $line)= @_; 1114 local $Text::Wrap::unexpand= 0; 1115 local $Text::Wrap::huge= "overflow"; 1116 local $Text::Wrap::columns= 78; 1117 unless (length $line) { return $head } 1118 $line= wrap $head, $rest, $line; 1119 return $line; 1120} 1121 1122# recursively extract the && expressions from a parse tree, 1123# returning the result as strings. 1124# if $node is not a '&&' op then it returns $node as a string, 1125# otherwise it returns the string form of the arguments to the 1126# '&&' op, recursively flattening any '&&' nodes that it might 1127# contain. 1128sub _and_clauses { 1129 my ($self, $node)= @_; 1130 1131 my @ret; 1132 if (@$node < 3 or $node->[0] ne "&&") { 1133 return $self->_pt_as_str($node); 1134 } 1135 foreach my $idx (1 .. $#$node) { 1136 push @ret, $self->_and_clauses($node->[$idx]); 1137 } 1138 return @ret; 1139} 1140 1141# recursively walk the a parse tree, and return the literal 1142# terms it contains, ignoring any operators in the optree. 1143sub _terms { 1144 my ($self, $node)= @_; 1145 if (@$node == 1) { 1146 return $self->_pt_as_str($node); 1147 } 1148 my @ret; 1149 if (@$node == 2) { 1150 if ($node->[0] eq "?") { 1151 push @ret, map { $self->_terms($_) } @{ $node->[1] }; 1152 } 1153 else { 1154 push @ret, $self->_terms($node->[1]); 1155 } 1156 } 1157 else { 1158 foreach my $i (1 .. $#$node) { 1159 push @ret, $self->_terms($node->[$i]); 1160 } 1161 } 1162 return @ret; 1163} 1164 1165# takes a HeaderLine "cond" AoA and flattens it into 1166# a single expression, and then extracts all the and clauses 1167# it contains. Thus [['defined(A)'],['defined(B)']] and 1168# [['defined(A) && defined(B)']], end up as ['defined(A)','defined(B)'] 1169sub _flatten_cond { 1170 my ($self, $cond_ary)= @_; 1171 1172 my $expr= join " && ", map { 1173 map { "($_)" } 1174 @$_ 1175 } @$cond_ary; 1176 return [] unless $expr; 1177 my $tree= $self->parse_expr($expr); 1178 my %seen; 1179 my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree); 1180 return \@and_clause; 1181} 1182 1183# Find the best path into a tree of conditions, such that 1184# we reuse the maximum number of existing branches. Returning 1185# two arrays, the first contain the parts of $cond_array that 1186# make up the best path, in the best path order, and a second array 1187# with the remaining items in the initial order they were provided. 1188# Thus if we have previously stored only the path "A", "B", "C" 1189# into the tree, and want to find the best path for 1190# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"], 1191# 1192# This used to reduce the number of conditions in the grouped content, 1193# and is especially helpful with dealing with DEBUGGING related 1194# functionality. It is coupled with careful control over the order 1195# that we add paths and conditions to the tree. 1196sub _best_path { 1197 my ($self, $tree_node, $cond_array, @path)= @_; 1198 my $best= \@path; 1199 my $rest= $cond_array; 1200 foreach my $cond (@$cond_array) { 1201 if ($tree_node->{$cond}) { 1202 my ($new_best, $new_rest)= 1203 $self->_best_path($tree_node->{$cond}, 1204 [ grep $_ ne $cond, @$cond_array ], 1205 @path, $cond); 1206 if (@$new_best > @$best) { 1207 ($best, $rest)= ($new_best, $new_rest); 1208 } 1209 } 1210 } 1211 if (@$best == @path) { 1212 foreach my $cond (@$cond_array) { 1213 my $not_cond= $self->tidy_cond("!($cond)"); 1214 if ($tree_node->{$not_cond}) { 1215 $best= [ @path, $cond ]; 1216 $rest= [ grep $_ ne $cond, @$cond_array ]; 1217 last; 1218 } 1219 } 1220 } 1221 return ($best, $rest); 1222} 1223 1224# This builds a group content tree from a set of lines. each content line in 1225# the original file is added to the file based on the conditions that apply to 1226# the content. 1227# 1228# The tree is made up of nested HoH's with keys in the HoH being normalized 1229# clauses from the {cond} data in the HeaderLine objects. 1230# 1231# Care is taken to minimize the number of pathways and to reorder clauses to 1232# reuse existing pathways and minimize the total number of conditions in the 1233# file. 1234# 1235# The '' key of a hash contains an array of the lines that are part of the 1236# condition that lead to that key. Thus lines with no conditions are in 1237# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be 1238# in $tree{"defined(A)"}{"defined(B)"}{""}. 1239# 1240# The result of this sub is normally passed into __recurse_group_content_tree() 1241# which converts it back into a set of HeaderLine objects. 1242# 1243sub _build_group_content_tree { 1244 my ($self, $lines)= @_; 1245 $lines ||= $self->{lines}; 1246 my $filter= $self->{filter_content}; 1247 my %seen_normal; 1248 foreach my $line_data (@$lines) { 1249 next if $line_data->{type} ne "content"; 1250 next if $filter and !$filter->($self, $line_data); 1251 my $cond_frames= $line_data->{cond}; 1252 my $cond_frame= $self->_flatten_cond($cond_frames); 1253 my $flat_merged= join " && ", map "($_)", @$cond_frame; 1254 my $normalized; 1255 if (@$cond_frame) { 1256 $normalized= $self->tidy_cond($flat_merged); 1257 } 1258 else { 1259 $normalized= $flat_merged; # empty string 1260 } 1261 push @{ $seen_normal{$normalized} }, $line_data; 1262 } 1263 my @debugging; 1264 my @non_debugging; 1265 foreach my $key (keys %seen_normal) { 1266 if ($key =~ /DEBUGGING/) { 1267 push @debugging, $key; 1268 } 1269 else { 1270 push @non_debugging, $key; 1271 } 1272 } 1273 @non_debugging= 1274 sort { length($a) <=> length($b) || $a cmp $b } @non_debugging; 1275 @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging; 1276 my %tree; 1277 foreach my $normal_expr (@non_debugging, @debugging) { 1278 my $all_line_data= $seen_normal{$normal_expr}; 1279 1280 my $cond_frame= 1281 (length $normal_expr) 1282 ? $self->_flatten_cond([ [$normal_expr] ]) 1283 : []; 1284 @$cond_frame= $self->_sort_terms(@$cond_frame); 1285 my $node= \%tree; 1286 my ($best, $rest)= $self->_best_path($node, $cond_frame); 1287 die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame 1288 unless @$best + @$rest == @$cond_frame; 1289 1290 foreach my $cond (@$best, @$rest) { 1291 $node= $node->{$cond} ||= {}; 1292 } 1293 push @{ $node->{''} }, @$all_line_data; 1294 } 1295 1296 warn $self->dd(\%tree) if $self->{debug}; 1297 $self->{tree}= \%tree; 1298 return \%tree; 1299} 1300 1301sub _recurse_group_content_tree { 1302 my ($self, $node, @path)= @_; 1303 1304 my @ret; 1305 local $self->{rgct_ret}= \@ret; 1306 local $self->{line_by_depth}= []; 1307 1308 $self->__recurse_group_content_tree($node, @path); 1309 return \@ret; 1310} 1311 1312# convert a tree of conditions constructed by _build_group_content_tree() 1313# and turn it into a set of HeaderLines that represents it. Performs the 1314# appropriate sets required to reconstitute an if/elif/elif/else sequence 1315# by calling _handle_else(). 1316sub __recurse_group_content_tree { 1317 my ($self, $node, @path)= @_; 1318 my $depth= 0 + @path; 1319 my $ind= $self->indent_chars($depth); 1320 my $ret= $self->{rgct_ret}; 1321 if ($node->{''}) { 1322 if (my $cb= $self->{post_process_grouped_content}) { 1323 $cb->($self, $node->{''}, \@path); 1324 } 1325 if (my $cb= $self->{post_process_content}) { 1326 $cb->($self, $_, \@path) for @{ $node->{''} }; 1327 } 1328 push @$ret, map { 1329 HeaderLine->new( 1330 %$_, 1331 cond => [@path], 1332 level => $depth, 1333 start_line_num => 0 + @$ret 1334 ) 1335 } @{ $node->{''} }; 1336 } 1337 1338 my %skip; 1339 foreach my $expr ( 1340 map { $_->[0] } 1341 sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] } 1342 map { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node 1343 ) { 1344 next unless length $expr; # ignore payload 1345 my $not= $self->tidy_cond("!($expr)"); 1346 if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) { 1347 next; 1348 } 1349 my $kid= $node->{$expr}; 1350 while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) { 1351 my ($kid_key)= keys(%$kid); 1352 $expr= $self->tidy_cond("($expr) && ($kid_key)"); 1353 $kid= $kid->{$kid_key}; 1354 my $new_not= $self->tidy_cond("!($expr)"); 1355 if ($node->{$new_not}) { 1356 $not= $new_not; 1357 $skip{$not}++; 1358 } 1359 } 1360 my $raw= "#${ind}if $expr\n"; 1361 my $hl= HeaderLine->new( 1362 type => "cond", 1363 sub_type => "#if", 1364 raw => $raw, 1365 line => $raw, 1366 level => $depth, 1367 cond => [ @path, [$expr] ], 1368 start_line_num => 0 + @$ret, 1369 ); 1370 $self->{line_by_depth}[$depth]= 0 + @$ret; 1371 push @$ret, $hl; 1372 $self->__recurse_group_content_tree($kid, @path, [$expr]); 1373 if ($node->{$not}) { 1374 $skip{$not}++; 1375 $self->_handle_else($not, $node->{$not}, $ind, $depth, @path, 1376 [$not]); 1377 } 1378 1379 # and finally the #endif 1380 1381 $raw= "#${ind}endif\n"; 1382 1383 # we need to extract the condition information from the last line in @ret, 1384 # as we don't know which condition we are ending here. It could be an elsif 1385 # from deep in the parse tree for instance. 1386 # So we need to extract the last frame from the cond structure in the last 1387 # line-info in @ret. 1388 # BUT if this last line is itself an #endif, then we need to take the second 1389 # to last line instead, as the endif would have "popped" that frame off the 1390 # condition stack. 1391 my $last_ret= $ret->[-1]; 1392 my $idx= 1393 ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif") 1394 ? -2 1395 : -1; 1396 my $end_line= HeaderLine->new( 1397 type => "cond", 1398 sub_type => "#endif", 1399 raw => $raw, 1400 line => $raw, 1401 level => $depth, 1402 cond => [ @path, $last_ret->{cond}[$idx] ], 1403 start_line_num => 0 + @$ret, 1404 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1405 ); 1406 undef $self->{line_by_depth}[$depth]; 1407 push @$ret, $end_line; 1408 } 1409 return $ret; 1410} 1411 1412# this handles the specific case of an else clause, detecting 1413# when an elif can be constructed, may recursively call itself 1414# to deal with if/elif/elif/else chains. Calls back into 1415# __recurse_group_content_tree(). 1416sub _handle_else { 1417 my ($self, $not, $kid, $ind, $depth, @path)= @_; 1418 1419 # extract the first 3 keys - from this we can detect 1420 # which of the three scenarios we have to handle. 1421 my ($k1, $k2, $k3)= 1422 sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; 1423 my $not_k1; 1424 if (length($k1) and defined($k2) and !defined($k3)) { 1425 1426 # if we do not have a payload (length($k1)) and we have exactly 1427 # two keys (defined($k2) and !defined($k3)) we need to compute 1428 # the inverse of $k1, which we will use later. 1429 $not_k1= $self->tidy_cond("!($k1)"); 1430 } 1431 my $ret= $self->{rgct_ret}; 1432 if (length($k1) and !defined($k2)) { 1433 1434 # only one child, no payload -> elsif $k1 1435 my $sub_expr; 1436 do { 1437 $sub_expr= 1438 !$sub_expr 1439 ? $k1 1440 : $self->tidy_cond("($sub_expr) && ($k1)"); 1441 $kid= $kid->{$k1}; 1442 ($k1, $k2)= 1443 sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; 1444 } while length($k1) and !defined $k2; 1445 1446 my $raw= "#${ind}elif $sub_expr\n"; 1447 push @{ $path[-1] }, $sub_expr; 1448 my $hl= HeaderLine->new( 1449 type => "cond", 1450 sub_type => "#elif", 1451 raw => $raw, 1452 line => $raw, 1453 level => $depth, 1454 cond => [ map { [@$_] } @path ], 1455 start_line_num => 0 + @$ret, 1456 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1457 ); 1458 $self->{line_by_depth}[$depth]= 0 + @$ret; 1459 push @$ret, $hl; 1460 $self->__recurse_group_content_tree($kid, @path); 1461 } 1462 elsif (defined($not_k1) and $not_k1 eq $k2) { 1463 1464 # two children which are complementary, no payload -> elif $k1 else.. 1465 my $raw= "#${ind}elif $k1\n"; 1466 1467 push @{ $path[-1] }, $k1; 1468 my $hl= HeaderLine->new( 1469 type => "cond", 1470 sub_type => "#elif", 1471 raw => $raw, 1472 line => $raw, 1473 level => $depth, 1474 cond => [ map { [@$_] } @path ], 1475 start_line_num => 0 + @$ret, 1476 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1477 ); 1478 $self->{line_by_depth}[$depth]= 0 + @$ret; 1479 push @$ret, $hl; 1480 $self->__recurse_group_content_tree($kid->{$k1}, @path); 1481 $path[-1][-1]= $k2; 1482 $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path); 1483 } 1484 else { 1485 # payload, 3+ children, or 2 which are not complementary -> else 1486 my $raw= "#${ind}else\n"; 1487 my $hl= HeaderLine->new( 1488 type => "cond", 1489 sub_type => "#else", 1490 raw => $raw, 1491 line => $raw, 1492 level => $depth, 1493 cond => [ map { [@$_] } @path ], 1494 start_line_num => 0 + @$ret, 1495 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1496 ); 1497 $self->{line_by_depth}[$depth]= 0 + @$ret; 1498 push @$ret, $hl; 1499 $self->__recurse_group_content_tree($kid, @path); 1500 } 1501 return $ret; 1502} 1503 1504# group the content in lines by the condition that apply to them 1505# returns a set of lines representing the new structure 1506sub group_content { 1507 my ($self, $lines, $filter)= @_; 1508 $lines ||= $self->{lines}; 1509 local $self->{filter_content}= $filter || $self->{filter_content}; 1510 my $tree= $self->_build_group_content_tree($lines); 1511 return $self->_recurse_group_content_tree($tree); 1512} 1513 1514#read a file by name - opens the file and passes the fh into parse_fh(). 1515sub read_file { 1516 my ($self, $file_name, $callback)= @_; 1517 $self= $self->new() unless ref $self; 1518 local $self->{parse_source}= $file_name; 1519 open my $fh, "<", $file_name 1520 or confess "Failed to open '$file_name' for read: $!"; 1521 my $lines= $self->parse_fh($fh); 1522 if ($callback) { 1523 foreach my $line (@$lines) { 1524 $callback->($self, $line); 1525 } 1526 } 1527 return $self; 1528} 1529 1530# These are utility methods for the HeaderLine objects. 1531sub HeaderLine::new { 1532 my ($class, %self)= @_; 1533 return bless \%self, $class; 1534} 1535sub HeaderLine::cond { $_[0]->{cond} } # AoA 1536sub HeaderLine::type { $_[0]->{type} } 1537sub HeaderLine::type_is { return $_[0]->type eq $_[1] ? 1 : 0 } 1538sub HeaderLine::sub_type { $_[0]->{sub_type} } 1539sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 } 1540sub HeaderLine::raw { $_[0]->{raw} } 1541sub HeaderLine::flat { $_[0]->{flat} } 1542sub HeaderLine::line { $_[0]->{line} } 1543sub HeaderLine::level { $_[0]->{level} } 1544sub HeaderLine::is_content { return $_[0]->type_is("content") } 1545sub HeaderLine::is_cond { return $_[0]->type_is("cond") } 1546sub HeaderLine::is_define { return $_[0]->sub_type_is("#define") } 1547sub HeaderLine::line_num { $_[0]->{start_line_num} } 1548sub HeaderLine::inner_lines { $_[0]->{inner_lines} } 1549sub HeaderLine::n_lines { $_[0]->{n_lines} } 1550sub HeaderLine::embed { $_[0]->{embed} } 1551*HeaderLine::start_line_num= *HeaderLine::line_num; 1552 1553# these are methods for EmbedLine objects 1554*EmbedLine::new= *HeaderLine::new; 1555sub EmbedLine::flags { $_[0]->{flags} } 1556sub EmbedLine::return_type { $_[0]->{return_type} } 1557sub EmbedLine::name { $_[0]->{name} } 1558sub EmbedLine::args { $_[0]->{args} } # array ref 1559 15601; 1561 1562__END__ 1563 1564=head1 NAME 1565 1566HeaderParser - A minimal header file parser that can be hooked by other porting 1567scripts. 1568 1569=head1 SYNOPSIS 1570 1571 my $o= HeaderParser->new(); 1572 my $lines= $o->parse_fh($fh); 1573 1574=head1 DESCRIPTION 1575 1576HeaderParser is a tool to parse C preprocessor header files. The tool 1577understands the syntax of preprocessor conditions, and is capable of creating 1578a parse tree of the expressions involved, and normalizing them as well. 1579 1580C preprocessor files are a bit tricky to parse properly, especially with a 1581"line by line" model. There are two issues that must be dealt with: 1582 1583=over 4 1584 1585=item Line Continuations 1586 1587Any line ending in "\\\n" (that is backslash newline) is considered to be part 1588of a longer string which continues on the next line. Processors should replace 1589the "\\\n" typically with a space when converting to a "real" line. 1590 1591=item Comments Acting As A Line Continuation 1592 1593The rules for header files stipulates that C style comments are stripped 1594before processing other content, this means that comments can serve as a form 1595of line continuation: 1596 1597 #if defined(foo) /* 1598 */ && defined(bar) 1599 1600is the same as 1601 1602 #if defined(foo) && defined(bar) 1603 1604This type of comment usage is often overlooked by people writing header file 1605parsers for the first time. 1606 1607=item Indented pre processor directives. 1608 1609It is easy to forget that there may be multiple spaces between the "#" 1610character and the directive. It also easy to forget that there may be spaces 1611in *front* of the "#" character. Both of these cases are often overlooked. 1612 1613=back 1614 1615The main idea of this module is to provide a single framework for correctly 1616parsing the content of our header files in a consistent manner. A secondary 1617purpose it to make various tasks we want to do easier, such as normalizing 1618content or preprocessor expressions, or just extracting the real "content" of 1619the file properly. 1620 1621=head2 parse_fh 1622 1623This function parses a filehandle into a set of lines. Each line is represented by a hash 1624based object which contains the following fields: 1625 1626 bless { 1627 cond => [['defined(a)'],['defined(b)']], 1628 type => "content", 1629 sub_type => undef, 1630 raw => $raw_content_of_line, 1631 line => $normalized_content_of_line, 1632 level => $level, 1633 source => $filename_or_string, 1634 start_line_num => $line_num_for_first_line, 1635 n_lines => $line_num - $line_num_for_first_line, 1636 }, "HeaderLine" 1637 1638A "line" in this context is a logical line, and because of line continuations 1639and comments may contain more than one physical line, and thus more than 1640one newline, but will always include at least one and will always end with one 1641(unless there is no newline at the end of the file). Thus 1642 1643 before /* 1644 this is a comment 1645 */ after \ 1646 and continues 1647 1648will be treated as a single logical line even though the content is 1649spread over four lines. 1650 1651=over 4 1652 1653=item cond 1654 1655An array of arrays containing the normalized expressions of any C preprocessor 1656conditional blocks which include the line. Each line has its own copy of the 1657conditions it was operated on currently, but that may change so dont alter 1658this data. The inner arrays may contain more than one element. If so then the 1659line is part of an "#else" or "#elsif" and the clauses should be considered to 1660be a conjuction when considering "when is this line included", however when 1661considered as part of an if/elsif/else, each added clause represents the most 1662recent condition. In the following you can see how: 1663 1664 before /* cond => [ ] */ 1665 #if A /* cond => [ ['A'] ] */ 1666 do-a /* cond => [ ['A'] ] */ 1667 #elif B /* cond => [ ['!A', 'B'] ] */ 1668 do-b /* cond => [ ['!A', 'B'] ] */ 1669 #else /* cond => [ ['!A', '!B'] ] */ 1670 do-c /* cond => [ ['!A', '!B'] ] */ 1671 # if D /* cond => [ ['!A', '!B'], ['D'] ] */ 1672 do-d /* cond => [ ['!A', '!B'], ['D'] ] */ 1673 # endif /* cond => [ ['!A', '!B'], ['D'] ] */ 1674 #endif /* cond => [ ['!A', '!B'] ] */ 1675 after /* cond => [ ] */ 1676 1677So in the above we can see how the three clauses of the if produce 1678a single "frame" in the cond array, but that frame "grows" and changes 1679as additional else clauses are added. When an entirely new if block 1680is started (D) it gets its own block. Each endif includes the clause 1681it terminates. 1682 1683=item type 1684 1685This value indicates the type of the line. This may be one of the following: 1686'content', 'cond', 'define', 'include' and 'error'. Several of the types 1687have a sub_type. 1688 1689=item sub_type 1690 1691This value gives more detail on the type of the line where necessary. 1692Not all types have a subtype. 1693 1694 Type | Sub Type 1695 --------+---------- 1696 content | text 1697 | include 1698 | define 1699 | error 1700 cond | #if 1701 | #elif 1702 | #else 1703 | #endif 1704 1705Note that there are no '#ifdef' or '#elifndef' or similar expressions. All 1706expressions of that form are normalized into the '#if defined' form to 1707simplify processing. 1708 1709=item raw 1710 1711This was the raw original text before HeaderParser performed any modifications 1712to it. 1713 1714=item line 1715 1716This is the normalized and modified text after HeaderParser or any callbacks 1717have processed it. 1718 1719=item level 1720 1721This is the "indent level" of a line and corresponds to the number of blocks 1722that the line is within, not including any blocks that might be created by 1723the line itself. 1724 1725 before /* level => 0 */ 1726 #if A /* level => 0 */ 1727 do-a /* level => 1 */ 1728 #elif B /* level => 0 */ 1729 do-b /* level => 1 */ 1730 #else /* level => 0 */ 1731 do-c /* level => 1 */ 1732 # if D /* level => 1 */ 1733 do-d /* level => 2 */ 1734 # endif /* level => 1 */ 1735 #endif /* level => 0 */ 1736 after /* level => 0 */ 1737 1738=back 1739 1740parse_fh() will throw an exception if it encounters a malformed expression 1741or input it cannot handle. 1742 1743=head2 lines_as_str 1744 1745This function will return a string representation of the lines it is provided. 1746 1747=head2 group_content 1748 1749This function will group the text in the file by the conditions which contain 1750it. This is only useful for files where the content is essentially a list and 1751where changing the order that lines are output in will not break the resulting 1752file. 1753 1754Each content line will be grouped into a structure of nested if/else blocks 1755(elif will produce a new nested block) such that the content under the control 1756of a given set of normalized condition clauses are grouped together in the order 1757the occurred in the file, such that each combined conditional clause is output 1758only once. 1759 1760This means a file like this: 1761 1762 #if A 1763 A 1764 #elif K 1765 AK 1766 #else 1767 ZA 1768 #endif 1769 #if B && Q 1770 B 1771 #endif 1772 #if Q && B 1773 BC 1774 #endif 1775 #if A 1776 AD 1777 #endif 1778 #if !A 1779 ZZ 1780 #endif 1781 1782Will end up looking roughly like this: 1783 1784 #if A 1785 A 1786 AD 1787 #else 1788 ZZ 1789 # if K 1790 AK 1791 # else 1792 ZA 1793 # endif 1794 #endif 1795 #if B && Q 1796 B 1797 BC 1798 #endif 1799 1800Content at a given block level always goes before conditional clauses 1801at the same nesting level. 1802 1803=head2 HOOKS 1804 1805There are severals hooks that are available, C<pre_process_content> and 1806C<post_process_content>, and C<post_process_grouped_content>. All of these 1807hooks will be called with the HeaderParser object as the first argument. 1808The "process_content" callbacks will be called with a line hash as the second 1809argument, and C<post_process_grouped_content> will be called with an 1810array of line hashes for the content in that group, so that the array may be 1811modified or sorted. Callbacks called from inside of C<group_content()> 1812(that is C<post_process_content> and C<post_process_grouped_content> will be 1813called with an additional argument containing and array specifying the actual 1814conditional "path" to the content (which may differ somewhat from the data in 1815a lines "cond" property). 1816 1817These hooks may do what they like, but generally they will modify the 1818"line" property of the line hash to change the final output returned 1819by C<lines_as_str()> or C<group_content()>. 1820 1821=head2 FORMATTING AND INDENTING 1822 1823Header parser tries hard to produce neat and readable output with a consistent 1824style and form. For example: 1825 1826 #if defined(FOO) 1827 # define HAS_FOO 1828 # if defined(BAR) 1829 # define HAS_FOO_AND_BAR 1830 # else /* !defined(BAR) */ 1831 # define HAS_FOO_NO_BAR 1832 # endif /* !defined(BAR) */ 1833 #endif /* defined(FOO) */ 1834 1835HeaderParser uses two space tab stops for indenting C pre-processor 1836directives. It puts the spaces between the "#" and the directive. The "#" is 1837considered "part" of the indent, even though the space comes after it. This 1838means the first indent level "looks" like one space, and following indents 1839look like 2. This should match what a sensible editor would do with two space 1840tab stops. The C<indent_chars()> method can be used to convert an indent level 1841into a string that contains the appropriate number of spaces to go in between 1842the "#" and the directive. 1843 1844When emitting "#endif", "#elif" and "#else" directives comments will be 1845emitted also to show the conditions that apply. These comments may be wrapped 1846to cover multiple lines. Some effort is made to get these comments to line up 1847visually, but it uses heuristics which may not always produce the best result. 1848 1849=cut 1850