1package HeaderParser; 2use strict; 3use warnings; 4 5# these are required below in BEGIN statements, we cant have a 6# hard dependency on them as they might not be available when 7# we run as part of autodoc.pl 8# 9# use Data::Dumper; 10# use Storable qw(dclone); 11# 12use Carp qw(confess); 13use Text::Tabs qw(expand unexpand); 14use Text::Wrap qw(wrap); 15 16# The style of this file is determined by: 17# 18# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \ 19# -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs \ 20# -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2 21 22my ( 23 %unop, # unary operators and their precedence 24 %binop, # binary operators and their precedence 25 %is_right_assoc, # operators which are right associative 26 %precedence, # precedence of all operators. 27 %associative, # associative operators 28 %commutative, # commutative operators 29 %cmpop, # comparison operators 30 $unop_pat, # pattern to match unary operators 31 $binop_pat, # pattern to match binary operators 32 %op_names, # map of op to description, used in error messages 33 $tokenize_pat # a pattern which can tokenize an expression 34); 35 36BEGIN { 37 # this is initialization for the operator precedence expression parser 38 # we use for handling preprocessor conditions. 39 %op_names= ( 40 '==' => 'equality', 41 '!=' => 'inequality', 42 '<<' => 'bit-shift-left', 43 '>>' => 'bit-shift-right', 44 '+' => 'addition', 45 '-' => 'subtraction', 46 '*' => 'multiplication', 47 '/' => 'division', 48 '%' => 'modulo', 49 '||' => 'logical-or', # Lowest precedence 50 '&&' => 'logical-and', 51 '|' => 'binary-or', 52 '^' => 'binary-xor', 53 '&' => 'binary-and', 54 '<' => 'less-than', # split on spaces, all with equal precedence 55 '>' => 'greater-than', 56 '<=' => 'less-than-or-equal', 57 '>=' => 'greater-than-or-equal', 58 ); 59 my @cmpop= ( 60 '== !=', # listed in lowest to highest precedence 61 '< > <= >=', # split on spaces, all with equal precedence 62 ); 63 my @binop= ( 64 '||', # Lowest precedence 65 '&&', 66 '|', 67 '^', 68 '&', 69 @cmpop, # include the numerical comparison operators. 70 '<< >>', 71 '+ -', 72 '* / %', # highest prcedence operators. 73 ); 74 75 my @unop= qw( ! ~ + - ); 76 %unop= map { $_ => 1 } @unop; 77 %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop; 78 %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop; 79 80 my $make_pat= sub { 81 my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b } 82 map quotemeta($_), @_; 83 return qr/$pat/; 84 }; 85 $unop_pat= $make_pat->(@unop); 86 foreach my $ix (0 .. $#binop) { 87 my $sym= $binop[$ix]; 88 $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym; 89 } 90 $is_right_assoc{"?"}= 1; 91 $is_right_assoc{":"}= 1; 92 $precedence{"?"}= 1; 93 $precedence{":"}= 0; 94 95 $associative{$_}++ 96 for qw( || && + *); # we leave '==' out so we don't reorder terms 97 $commutative{$_}++ for qw( || && + *); 98 99 $binop_pat= $make_pat->(keys %precedence); 100 $tokenize_pat= qr/ 101 ^(?: 102 (?<comment> \/\*.*?\*\/ ) 103 | (?<ws> \s+ ) 104 | (?<term> 105 (?<literal> 106 (?<define> defined\(\w+\) ) 107 | (?<func> \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) ) 108 | (?<const> (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') ) 109 | (?<sym> \w+ ) 110 ) 111 | (?<op> $binop_pat | $unop_pat ) 112 | (?<paren> [\(\)] ) 113 ) 114 ) 115 /xs; 116} 117 118# dump the arguments with dump. wraps loading Dumper 119# as we are executed by miniperl where Dumper isnt available 120sub dd { 121 my $self= shift; 122 local $self->{orig_content}; 123 my $ret= "(dump not available)"; 124 eval { 125 require Data::Dumper; 126 $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump(); 127 }; 128 return $ret; 129} 130 131my $has_storable; 132 133# same story here, in miniperl we use slow perl code, 134# in real perl we can use Storable and speed things up. 135BEGIN { eval "use Storable; \$has_storable=1;" } 136 137# recursively copy an AoAoA... 138sub copy_aoa { 139 my ($aoa)= @_; 140 if ($has_storable) { 141 return Storable::dclone($aoa); 142 } 143 else { 144 return _copy_aoa($aoa); 145 } 146} 147 148sub _copy_aoa { 149 my ($thing)= @_; 150 if (ref $thing) { 151 return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ]; 152 } 153 else { 154 return $thing; 155 } 156} 157 158# return the number characters that should go in between a '#' and 159# the name of a c preprocessor directive. Returns 0 spaces for level 160# 0, and 2 * ($level - 1) + 1 spaces for the rest. (1,3,5, etc) 161# This might sound weird, but consider these are tab *stops* and the 162# '#' is included in the total. which means indents of 2, 4, 6 etc. 163sub indent_chars { 164 my ($self, $level)= @_; 165 my $ind= ""; 166 $ind .= " " if $level; 167 $ind .= " " x ($level - 1) if $level > 1; 168 return $ind; 169} 170 171# we use OO to store state, etc. 172sub new { 173 my ($class, %args)= @_; 174 $args{add_commented_expr_after} //= 10; 175 $args{max_width} //= 78; 176 $args{min_break_width} //= 70; 177 return bless \%args,; 178} 179 180# this parses the expression into an array of tokens 181# this is somewhat crude, we could do this incrementally 182# if we wanted and avoid the overhead. but it makes it 183# easier to debug the tokenizer. 184sub _tokenize_expr { 185 my ($self, $expr)= @_; 186 delete $self->{tokens}; 187 delete $self->{parse_tree}; 188 $self->{original_expr}= $expr; 189 190 my @tokens; 191 while ($expr =~ s/$tokenize_pat//xs) { 192 push @tokens, {%+} if defined $+{'term'}; 193 } 194 $self->{tokens}= \@tokens; 195 warn $self->dd($self) if $self->{debug}; 196 if (length $expr) { 197 confess "Failed to tokenize_expr: $expr\n"; 198 } 199 return \@tokens; 200} 201 202sub _count_ops { 203 my ($self, $term)= @_; 204 my $count= 0; 205 $count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx; 206 return $count; 207} 208 209# sort terms in an expression in a way that puts things 210# in a sensible order. Anything starting with PERL_IN_ 211# should be on the left in alphabetical order. Digits 212# should be on the right (eg 0), and ties are resolved 213# by stripping non-alpha-numerc, thus removing underbar 214# parens, spaces, logical operators, etc, and then by 215# lc comparison of the result. 216sub _sort_terms { 217 my $self= shift; 218 my (@terms)= map { 219 [ 220 $_, # 0: raw 221 lc($_) =~ s/[^a-zA-Z0-9]//gr, # 1: "_" stripped and caseless 222 $_ =~ m/PERL_IN_/ ? 1 : 0, # 2: PERL_IN_ labeled define 223 $_ =~ m/^\d/ ? 1 : 0, # 3: digit 224 $_ =~ m/DEBUGGING/ ? 1 : 0, # 4: DEBUGGING? 225 $self->_count_ops($_), # 5: Number of ops (||, &&) 226 ] 227 } @_; 228 my %seen; 229 #start-no-tidy 230 @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] } 231 sort { 232 $a->[5] <=> $b->[5] || # least number of ops 233 $b->[2] <=> $a->[2] || # PERL_IN before others 234 $a->[3] <=> $b->[3] || # digits after others 235 $a->[4] <=> $b->[4] || # DEBUGGING after all else 236 $a->[1] cmp $b->[1] || # stripped caseless cmp 237 lc($a->[0]) cmp lc($b->[0]) || # caseless cmp 238 $a->[0] cmp $b->[0] || # exact cmp 239 0 240 } @terms; 241 #end-no-tidy 242 return @terms; 243} 244 245# normalize a condition expression by parsing it and then stringifying 246# the parse tree. 247sub tidy_cond { 248 my ($self, $expr)= @_; 249 my $ret= $self->{_tidy_cond_cache}{$expr} //= do { 250 $self->parse_expr($expr) if defined $expr; 251 my $text= $self->_pt_as_str(); 252 $text; 253 }; 254 $self->{last_expr}= $ret; 255 return $ret; 256} 257 258# convert a parse tree structure to a string recursively. 259# 260# Parse trees are currently made up of arrays, with the count 261# of items in the object determining the type of op it represents. 262# 1 argument: literal value of some sort. 263# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree 264# : ternary: 0 slot holds '?', 1 is an array holding three 265# parse trees: cond, true, false 266# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees 267# : note, this is multigate for commutative operators like 268# : "+", "*", "&&" and "||", so an expr 269# : like "A && B && !C" would be represented as: 270# : [ "&&", ["A"], ["B"], [ "!",["C"] ] ] 271# 272sub _pt_as_str { 273 my ($self, $node, $parent_op, $depth)= @_; 274 275 $node ||= $self->{parse_tree} 276 or confess "No parse tree?"; 277 $depth ||= 0; 278 if (@$node == 1) { 279 280 # its a literal 281 return $node->[0]; 282 } 283 elsif (@$node == 2) { 284 285 # is this a ternary or an unop? 286 if ($node->[0] eq '?') { 287 288 # ternary, the three "parts" are tucked away in 289 # an array in the payload slot 290 my $expr= 291 $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? " 292 . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : " 293 . $self->_pt_as_str($node->[1][2], "?", $depth + 1); 294 295 # stick parens on if this is a subexpression 296 $expr= "( " . $expr . " )" if $depth; 297 return $expr; 298 } 299 else { 300 if ( $node->[0] eq "!" 301 and @{ $node->[1] } == 2 302 and $node->[1][0] eq "!") 303 { 304 # normalize away !! in expressions. 305 return $self->_pt_as_str($node->[1][1], $parent_op, $depth); 306 } 307 308 # unop - the payload is a optree 309 return $node->[0] 310 . $self->_pt_as_str($node->[1], $node->[0], $depth + 1); 311 } 312 } 313 314 # if we get here we are dealing with a binary operator 315 # the nodes are not necessarily binary, as we "collect" 316 # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D] 317 my ($op, @terms)= @$node; 318 319 # convert the terms to strings 320 @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms; 321 322 # sort them to normalize the subexpression 323 my $expr= 324 join " $op ", $associative{$op} 325 ? $self->_sort_terms(@terms) 326 : @terms; 327 328 # stick parens on if this is a subexpression 329 $expr= "( " . $expr . " )" if $depth and !$cmpop{$op}; 330 331 # and we are done. 332 return $expr; 333} 334 335# Returns the precedence of an operator, returns 0 if there is no token 336# or the next token is not an op, or confesss if it encounters an op it does not 337# know. 338sub _precedence { 339 my $self= shift; 340 my $token= shift // return 0; 341 342 my $op= (ref $token ? $token->{op} : $token) // return 0; 343 344 return $precedence{$op} // confess "Unknown op '$op'"; 345} 346 347# entry point into parsing the tokens, checks that we actually parsed everything 348# and didnt leave anything in the token stream (possible from a malformed expression) 349# Performs some minor textual cleanups using regexes, but then does a proper parse 350# of the expression. 351sub parse_expr { 352 my ($self, $expr)= @_; 353 if (defined $expr) { 354 $expr =~ s/\s*\\\n\s*/ /g; 355 $expr =~ s/defined\s+(\w+)/defined($1)/g; 356 $self->_tokenize_expr($expr); 357 } 358 my $ret= $self->_parse_expr(); 359 if (@{ $self->{tokens} }) { 360 361 # if all was well with parsing we should not get here. 362 confess "Unparsed tokens: ", $self->dd($self->{tokens}); 363 } 364 $self->{parse_tree}= $ret; 365 return $ret; 366} 367 368# this is just a wrapper around _parse_expr_assoc() which handles 369# parsing an arbitrary expression. 370sub _parse_expr { 371 my ($self)= @_; 372 return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1); 373} 374 375# This handles extracting from the token stream 376# - simple literals 377# - unops (assumed to be right associative) 378# - parens (which reset the precedence acceptable to the parser) 379# 380sub _parse_expr_primary { 381 my ($self)= @_; 382 my $tokens= $self->{tokens} 383 or confess "No tokens in _parse_expr_primary?"; 384 my $first= $tokens->[0] 385 or confess "No primary?"; 386 if ($first->{paren} and $first->{paren} eq "(") { 387 shift @$tokens; 388 my $expr= $self->_parse_expr(); 389 $first= $tokens->[0]; 390 if (!$first->{paren} or $first->{paren} ne ")") { 391 confess "Expecting close paren", $self->dd($tokens); 392 } 393 shift @$tokens; 394 return $expr; 395 } 396 elsif ($first->{op} and $unop{ $first->{op} }) { 397 my $op_token= shift @$tokens; 398 return [ $op_token->{op}, $self->_parse_expr_primary() ]; 399 } 400 elsif (defined $first->{literal}) { 401 shift @$tokens; 402 return [ $first->{literal} ]; 403 } 404 else { 405 die sprintf 406 "Unexpected token '%s', expecting literal, unary, or expression.\n", 407 $first->{term}; 408 } 409} 410 411# This is the heart of the expression parser. It uses 412# a pair of nested loops to avoid excessive recursion during parsing, 413# which should be a bit faster than other strategies. It only should 414# recurse when the precedence level changes. 415sub _parse_expr_assoc { 416 my ($self, $lhs, $min_precedence)= @_; 417 my $tokens= $self->{tokens} 418 or confess "No tokens in _parse_expr_assoc"; 419 my $la= $tokens->[0]; # lookahead 420 my $la_pr= $self->_precedence($la); # lookahead precedence 421 while ($la && $la_pr >= $min_precedence) { 422 my $op_token= shift @$tokens; 423 my $op_pr= $la_pr; # op precedence 424 if ($op_token->{op} eq "?") { 425 my $mid= $self->_parse_expr(); 426 if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") { 427 shift @$tokens; 428 my $tail= $self->_parse_expr(); 429 return [ '?', [ $lhs, $mid, $tail ] ]; 430 } 431 confess "Panic: expecting ':'", $self->dd($tokens); 432 } 433 my $rhs; 434 eval { $rhs= $self->_parse_expr_primary(); } 435 or die "Error in $op_names{$op_token->{op}} expression: $@"; 436 $la= $tokens->[0]; 437 $la_pr= $self->_precedence($la); 438 while ( 439 $la_pr > $op_pr || # any and larger 440 ( $is_right_assoc{ $op_token->{op} } 441 and $la_pr == $op_pr) # right and equal 442 ) { 443 my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0); 444 $rhs= $self->_parse_expr_assoc($rhs, $new_precedence); 445 $la= $tokens->[0]; 446 $la_pr= $self->_precedence($la); 447 } 448 if ( @$lhs >= 3 449 && $lhs->[0] eq $op_token->{op} 450 && $commutative{ $op_token->{op} }) 451 { 452 push @$lhs, $rhs; 453 } 454 else { 455 my @lt= ($lhs); 456 my @rt= ($rhs); 457 458 # if we have '( a && b ) && ( c && d)' 459 # turn it into 'a && b && c && d' 460 if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) { 461 (undef,@lt)= @$lhs; # throw away op. 462 } 463 if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) { 464 (undef,@rt)= @$rhs; # throw away op. 465 } 466 $lhs= [ $op_token->{op}, @lt, @rt ]; 467 } 468 } 469 return $lhs; 470} 471 472#entry point for normalizing and if/elif statements 473#returns the line and condition in normalized form. 474sub normalize_if_elif { 475 my ($self, $line, $line_info)= @_; 476 if (my $dat= $self->{cache_normalize_if_elif}{$line}) { 477 return $dat->{line}, $dat->{cond}; 478 } 479 my ($cond); 480 eval { 481 ($line, $cond)= $self->_normalize_if_elif($line); 482 1; 483 } or die sprintf "Error at line %d\nLine %d: %s\n%s", 484 ($line_info->start_line_num()) x 2, $line, $@; 485 $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond }; 486 return ($line, $cond); 487} 488 489#guts of the normalize_if_elif() - cleans up the line, extracts 490#the condition, and then tidies it with tidy_cond(). 491sub _normalize_if_elif { 492 my ($self, $line)= @_; 493 my $nl= ""; 494 $nl= $1 if $line =~ s/(\n+)\z//; 495 $line =~ s/\s+\z//; 496 my @comment; 497 push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!; 498 $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g; 499 $line =~ s/!\s+defined/!defined/g; 500 501 if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) { 502 my $if= $1; 503 my $not= $2 ? "!" : ""; 504 $line= "#$if ${not}defined($3)"; 505 } 506 $line =~ s/#((?:el)?if)\s+// 507 or confess "Bad cond: $line"; 508 my $if= $1; 509 $line =~ s/!\s+/!/g; 510 511 my $old_cond= $line; 512 my $cond= $self->tidy_cond($old_cond); 513 514 warn "cond - $old_cond\ncond + $cond\n" 515 if $old_cond ne $cond and $self->{debug}; 516 517 $line= "#$if $cond"; 518 $line .= " " . join " ", reverse @comment if @comment; 519 520 $line .= $nl; 521 return ($line, $cond); 522} 523 524# parses a text buffer as though it was a file on disk 525# calls parse_fh() 526sub parse_text { 527 my ($self, $text)= @_; 528 local $self->{parse_source}= "(buffer)"; 529 open my $fh, "<", \$text 530 or die "Failed to open buffer for read: $!"; 531 return $self->parse_fh($fh); 532} 533 534# takes a readable filehandle and parses whatever contents is 535# returned by reading it. Returns an array of HeaderLine objects. 536# this is the main routing for parsing a header file. 537sub parse_fh { 538 my ($self, $fh)= @_; 539 my @lines; 540 my @cond; 541 my @cond_line; 542 my $last_cond; 543 local $self->{parse_source}= $self->{parse_source} || "(unknown)"; 544 my $cb= $self->{pre_process_content}; 545 $self->{orig_content}= ""; 546 my $line_num= 1; 547 548 while (defined(my $line= readline($fh))) { 549 my $start_line_num= $line_num++; 550 $self->{orig_content} .= $line; 551 while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) { 552 defined(my $read_line= readline($fh)) 553 or last; 554 $self->{orig_content} .= $read_line; 555 $line_num++; 556 $line .= $read_line; 557 } 558 while ($line =~ m!/\*(.*?)(\*/|\z)!gs) { 559 my ($inner, $tail)= ($1, $2); 560 if ($tail ne "*/") { 561 confess 562 "Unterminated comment starting at line $start_line_num\n"; 563 } 564 elsif ($inner =~ m!/\*!) { 565 confess 566 "Nested/broken comment starting at line $start_line_num\n"; 567 } 568 } 569 570 my $raw= $line; 571 my $type= "content"; 572 my $sub_type= "text"; 573 my $level= @cond; 574 my $do_pop= 0; 575 my $flat= $line; 576 $flat =~ s/\s*\\\n\s*/ /g; 577 $flat =~ s!/\*.*?\*/! !gs; 578 $flat =~ s/\s+/ /g; 579 $flat =~ s/\s+\z//; 580 $flat =~ s/^\s*#\s*/#/g; 581 582 my $line_info= 583 HeaderLine->new(start_line_num => $start_line_num, raw => $raw); 584 my $do_cond_line; 585 if ($flat =~ /^#/) { 586 if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) { 587 my $if= $1; 588 my $not= $2 ? "!" : ""; 589 my $sym= $3; 590 $flat =~ 591 s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/; 592 } 593 my $cond; # used in various expressions below 594 if ($flat =~ /^#endif/) { 595 if (!@cond) { 596 confess "Not expecting $flat"; 597 } 598 $do_pop= 1; 599 $level--; 600 $type= "cond"; 601 $sub_type= "#endif"; 602 } 603 elsif ($flat =~ /^#if\b/) { 604 ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); 605 push @cond, [$cond]; 606 push @cond_line, $line_info; 607 $type= "cond"; 608 $sub_type= "#if"; 609 } 610 elsif ($flat =~ /^#elif\b/) { 611 if (!@cond) { 612 confess "No if for $flat"; 613 } 614 $level--; 615 ($flat, $cond)= $self->normalize_if_elif($flat, $line_info); 616 $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); 617 $cond_line[-1]= $line_info; 618 push @{ $cond[-1] }, $cond; 619 $type= "cond"; 620 $sub_type= "#elif"; 621 } 622 elsif ($flat =~ /^#else\b/) { 623 if (!@cond) { 624 confess "No if for $flat"; 625 } 626 $level--; 627 $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])"); 628 $cond_line[-1]= $line_info; 629 $type= "cond"; 630 $sub_type= "#else"; 631 } 632 elsif ($flat =~ /#undef/) { 633 $type= "content"; 634 $sub_type= "#undef"; 635 } 636 elsif ($flat =~ /#pragma\b/) { 637 $type= "content"; 638 $sub_type= "#pragma"; 639 } 640 elsif ($flat =~ /#include\b/) { 641 $type= "content"; 642 $sub_type= "#include"; 643 } 644 elsif ($flat =~ /#define\b/) { 645 $type= "content"; 646 $sub_type= "#define"; 647 } 648 elsif ($flat =~ /#error\b/) { 649 $type= "content"; 650 $sub_type= "#error"; 651 } 652 else { 653 confess "Do not know what to do with $line"; 654 } 655 if ($type eq "cond") { 656 657 # normalize conditional lines 658 $line= $flat; 659 $last_cond= $line_info; 660 } 661 } 662 $line =~ s/\n?\z/\n/; 663 664 %$line_info= ( 665 cond => copy_aoa(\@cond), 666 type => $type, 667 sub_type => $sub_type, 668 raw => $raw, 669 flat => $flat, 670 line => $line, 671 level => $level, 672 source => $self->{parse_source}, 673 start_line_num => $start_line_num, 674 n_lines => $line_num - $start_line_num, 675 ); 676 677 push @lines, $line_info; 678 if ($do_pop) { 679 $line_info->{inner_lines}= 680 $line_info->start_line_num - $cond_line[-1]->start_line_num; 681 pop @cond; 682 pop @cond_line; 683 } 684 if ($type eq "content" and $cb) { 685 $cb->($self, $lines[-1]); 686 } 687 } 688 if (@cond_line) { 689 my $msg= "Unterminated conditional block starting line " 690 . $cond_line[-1]->start_line_num(); 691 $msg .= 692 " with last conditional operation at line " 693 . $last_cond->start_line_num() 694 if $cond_line[-1] != $last_cond; 695 confess $msg; 696 } 697 $self->{lines}= \@lines; 698 return \@lines; 699} 700 701# returns the last lines we parsed. 702sub lines { $_[0]->{lines} } 703 704# assuming a line looks like an embed.fnc entry parse it 705# and normalize it, and create and EmbedLine object from it. 706sub tidy_embed_fnc_entry { 707 my ($self, $line_data)= @_; 708 my $line= $line_data->{line}; 709 return $line if $line =~ /^\s*:/; 710 return $line unless $line_data->{type} eq "content"; 711 return $line unless $line =~ /\|/; 712 713 $line =~ s/\s*\\\n/ /g; 714 $line =~ s/\s+\z//; 715 ($line)= expand($line); 716 my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line; 717 my %flag_seen; 718 $flags= join "", grep !$flag_seen{$_}++, sort split //, $flags; 719 if ($flags =~ s/^#//) { 720 $flags .= "#"; 721 } 722 if ($flags eq "#") { 723 die "Not allowed to use only '#' for flags" 724 . "in 'embed.fnc' at line $line_data->{start_line_num}"; 725 } 726 if (!$flags) { 727 die "Missing flags in function definition" 728 . " in 'embed.fnc' at line $line_data->{start_line_num}\n" 729 . "Did you a forget a line continuation on the previous line?\n"; 730 } 731 for ($ret, @args) { 732 s/(\w)\*/$1 */g; 733 s/\*\s+(\w)/*$1/g; 734 s/\*const/* const/g; 735 } 736 my $head= sprintf "%-8s|%-7s", $flags, $ret; 737 $head .= sprintf "|%*s", -(31 - length($head)), $name; 738 if (@args and length($head) > 32) { 739 $head .= "\\\n"; 740 $head .= " " x 32; 741 } 742 foreach my $ix (0 .. $#args) { 743 my $arg= $args[$ix]; 744 $head .= "|$arg"; 745 $head .= "\\\n" . (" " x 32) if $ix < $#args; 746 } 747 $line= $head . "\n"; 748 749 if ($line =~ /\\\n/) { 750 my @lines= split /\s*\\\n/, $line; 751 my $len= length($lines[0]); 752 $len < length($_) and $len= length($_) for @lines; 753 $len= int(($len + 7) / 8) * 8; 754 $len= 72 if $len < 72; 755 $line= join("\\\n", 756 (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]), 757 $lines[-1]); 758 } 759 ($line)= unexpand($line); 760 761 $line_data->{embed}= EmbedLine->new( 762 flags => $flags, 763 return_type => $ret, 764 name => $name, 765 args => \@args, 766 ); 767 $line =~ s/\s+\z/\n/; 768 $line_data->{line}= $line; 769 return $line; 770} 771 772# line up the text in a multiline string by a given $fragment 773# of text, inserting whitespace in front or behind the $fragment 774# to get the text to line up. Returns the text. This is wrapped 775# by line_up() and is used to wrap long conditions and comments 776# in the generated code. 777sub _line_up_frag { 778 my ($self, $str, $fragment)= @_; 779 die "has tabs?!" if $str =~ /\t/; 780 my @lines= split /\n/, $str; 781 my $changed= 1; 782 while ($changed) { 783 $changed= 0; 784 foreach my $ix (0 .. $#lines - 1) { 785 my $f_index= 0; 786 my $n_index= 0; 787 while (1) { 788 $f_index= index($lines[$ix], $fragment, $f_index); 789 $n_index= index($lines[ $ix + 1 ], $fragment, $n_index); 790 if ($f_index == -1 or $n_index == -1) { 791 last; 792 } 793 if ($f_index < $n_index) { 794 my $f_idx= $f_index; 795 $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " "; 796 substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index)); 797 $changed++; 798 last; 799 } 800 elsif ($n_index < $f_index) { 801 my $n_idx= $n_index; 802 $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " "; 803 substr($lines[ $ix + 1 ], 804 $n_idx, 0, " " x ($f_index - $n_index)); 805 $changed++; 806 last; 807 } 808 $f_index++; 809 $n_index++; 810 } 811 } 812 } 813 my $ret= join "", map { "$_\n" } @lines; 814 return $ret; 815} 816 817sub _fixup_indent { 818 my ($self, $line)= @_; 819 my @lines= split /\n/, $line; 820 if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) { 821 my $first_left_len = length $1; 822 823 while (1) { 824 my $ok = 1; 825 for (@lines) { 826 /^.{$first_left_len} / 827 or do { $ok = 0; last; }; 828 } 829 if ($ok) { 830 s/^(.{$first_left_len}) /$1/ for @lines; 831 } else { 832 last; 833 } 834 } 835 } 836 837 if ($lines[0]=~/^(#\s*\w+\s+)\(/) { 838 my $len = length($1); 839 for my $idx (1..$#lines) { 840 $lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/; 841 } 842 } 843 my $ret= join "", map { "$_\n" } @lines; 844 return $ret; 845} 846 847# this is the workhorse for _break_line_at_op(). 848sub __break_line_at_op { 849 my ($self, $limit, $line, $blank_prefix)= @_; 850 my @lines= (""); 851 while (length $line) { 852 my $part; 853 if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) { 854 $part= $1; 855 } 856 else { 857 $part= $line; 858 $line= ""; 859 } 860 if (length($lines[-1]) + length($part) < $limit) { 861 $lines[-1] .= $part; 862 } 863 else { 864 push @lines, $blank_prefix . $part; 865 } 866 } 867 return \@lines; 868} 869 870# Break a condition line into parts, while trying to keep the last 871# token on each line being an operator like || or && or ? or : We try 872# to keep each line at $limit characters, however, we also try to 873# ensure that each line has the same number of operators on it such 874# that across all the lines there are only two counts of operators (eg, 875# we either way each line to have two operators on it, or 0, or 1 or 0, 876# or 2 or 1, and so on.) If we cannot meet this requirement we reduce 877# the limit by 1 and try again, until we meet the objective, or the 878# limit ends up at 70 chars or less. 879sub _break_line_at_op { 880 my ($self, $limit, $line, $blank_prefix)= @_; 881 my $lines; 882 while (1) { 883 $lines= $self->__break_line_at_op($limit, $line, $blank_prefix); 884 my %op_counts; 885 foreach my $line_idx (0 .. $#$lines) { 886 my $line= $lines->[$line_idx]; 887 my $count= 0; 888 $count++ while $line =~ /(\|\||&&|\?|:)/g; 889 $op_counts{$count}++; 890 891 } 892 if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) { 893 last; 894 } 895 $limit--; 896 } 897 898 s/\s*\z/\n/ for @$lines; 899 return join "", @$lines; 900} 901 902sub _max { # cant use Scalar::Util so we roll our own 903 my $max= shift; 904 $max < $_ and $max= $_ for @_; 905 return $max; 906} 907 908# take a condition, split into $type and $rest 909# wrap it, and try to line up operators and defined() functions 910# that it contains. This is rather horrible code, but it does a 911# reasonable job applying the heuristics we need to lay our the 912# conditions in a reasonable way. 913sub _wrap_and_line_up_cond { 914 my ($self, $type, $rest)= @_; 915 916 my $limit= $self->{max_width}; 917 918 # extract the expression part of the line, and normalize it, we do 919 # this here even though it might be duplicative as it is possible 920 # that the caller code has munged the expression in some way, and we 921 # might want to simplify the expression first. Eg: 922 # 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into 923 # 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible. 924 my $rest_head= ""; 925 my $rest_tail= ""; 926 if ($rest =~ s!(if\s+)!!) { 927 $rest_head= $1; 928 } 929 if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) { 930 $rest_tail= $1; 931 } 932 if ($rest) { 933 $rest= $self->tidy_cond($rest); 934 $rest= $rest_head . $rest . $rest_tail; 935 } 936 937 my $l= length($type); 938 my $line= $type; 939 $line .= $rest if length($rest); 940 my $blank_prefix= " " x $l; 941 942 # at this point we have a single line with the entire expression on it 943 # if it fits on one line we are done, we can return it right away. 944 if (length($line) <= $limit) { 945 $line =~ s/\s*\z/\n/; 946 return $line; 947 } 948 my $rest_copy= $rest; 949 my @fragments; 950 my $op_pat= qr/(?:\|\||&&|[?:])/; 951 952 # does the $rest contain a parenthesized group? If it does then 953 # there are a mixture of different ops being used, as if it was all 954 # the same opcode there would not be a parenthesized group. 955 # If it does then we handle it differently, and try to put the 956 # different parts of the expression on their own line. 957 if ($rest_copy =~ /$op_pat\s*\(/) { 958 my @parts; 959 while (length $rest_copy) { 960 if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) { 961 push @parts, $1; 962 } else { 963 #$rest_copy=~s/^\s+//; 964 push @parts, $rest_copy; 965 last; 966 } 967 } 968 $parts[0]= $type . $parts[0]; 969 $parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts; 970 foreach my $line (@parts) { 971 if (length($line) > $limit) { 972 $line= $self->_break_line_at_op($limit, $line, $blank_prefix); 973 } 974 } 975 s/\s*\z/\n/ for @parts; 976 $line= join "", @parts; 977 @fragments= ("defined", "||"); 978 } 979 else { 980 # the expression consists of just one opcode type, so we can use 981 # simpler logic to break it apart with the objective of ensuring 982 # that the lines are similarly formed with trailing operators on 983 # each line but the last. 984 @fragments= ("||", "defined"); 985 $line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix); 986 } 987 988 # try to line up the text on different lines. We stop after 989 # the first $fragment that modifies the text. The order 990 # of fragments we try is determined above based on the type 991 # of condition this is. 992 my $pre_line= $line; 993 foreach my $fragment (@fragments) { 994 $line= $self->_line_up_frag($line, $fragment); 995 last if $line ne $pre_line; 996 } 997 998 # if we have lined up by "defined" in _line_up_frag() 999 # then we may have " || defined(...)" type expressions 1000 # convert these to " || defined(...)" as it looks better. 1001 $line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g; 1002 $line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g; 1003 1004 # add back the line continuations. this is all pretty inefficient, 1005 # but it works nicely. 1006 my @lines= split /\n/, $line; 1007 my $last= pop @lines; 1008 my $max_len= _max(map { length $_ } @lines); 1009 $_= sprintf "%*s \\\n", -$max_len, $_ for @lines; 1010 $last .= "\n"; 1011 1012 $line= join "", @lines, $last; 1013 1014 # remove line continuations that are inside of a comment, 1015 # we may have a variable number of lines of the expression 1016 # or parts of lines of the expression in a comment, so 1017 # we do this last. 1018 $line =~ s!/\* (.*) \*/ 1019 !"/*"._strip_line_cont("$1")."*/"!xsge; 1020 1021 return $self->_fixup_indent($line); 1022} 1023 1024#remove line continuations from the argument. 1025sub _strip_line_cont { 1026 my ($string)= @_; 1027 $string =~ s/\s*\\\n/\n/g; 1028 return $string; 1029} 1030 1031# Takes an array of HeaderLines objects produced by parse_fh() 1032# or by group_content(), and turn it into a string. 1033sub lines_as_str { 1034 my ($self, $lines, $post_process_content)= @_; 1035 $lines ||= $self->{lines}; 1036 my $ret; 1037 $post_process_content ||= $self->{post_process_content}; 1038 my $filter= $self->{filter_content}; 1039 my $last_line= ""; 1040 1041 #warn $self->dd($lines); 1042 foreach my $line_data (@$lines) { 1043 my $line= $line_data->{line}; 1044 if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text") 1045 { 1046 my $level= $line_data->{level}; 1047 my $ind= $self->indent_chars($level); 1048 $line =~ s/^#(\s*)/#$ind/; 1049 } 1050 if ($line_data->{type} eq "cond") { 1051 my $add_commented_expr_after= $self->{add_commented_expr_after}; 1052 if ($line_data->{sub_type} =~ /#(?:else|endif)/) { 1053 my $joined= join " && ", 1054 map { "($_)" } @{ $line_data->{cond}[-1] }; 1055 my $cond_txt= $self->tidy_cond($joined); 1056 $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else"; 1057 $line =~ s!\s*\z! /* $cond_txt */\n! 1058 if $line_data->{inner_lines} >= $add_commented_expr_after; 1059 } 1060 elsif ($line_data->{sub_type} eq "#elif") { 1061 my $last_frame= $line_data->{cond}[-1]; 1062 my $joined= join " && ", 1063 map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ]; 1064 my $cond_txt= $self->tidy_cond($joined); 1065 $line =~ s!\s*\z! /* && $cond_txt */\n! 1066 if $line_data->{inner_lines} >= $add_commented_expr_after; 1067 } 1068 } 1069 $line =~ s/\s*\z/\n/; 1070 if ($last_line eq "\n" and $line eq "\n") { 1071 next; 1072 } 1073 $last_line= $line; 1074 if ($line_data->{type} eq "cond") { 1075 $line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z! 1076 or die "Failed to split cond line: $line"; 1077 my ($type, $cond, $comment)= ($1, $2, $3); 1078 $comment //= ""; 1079 $cond //= ""; 1080 my $new_line; 1081 if (!length($cond) and $comment) { 1082 $comment =~ s!^(/\*\s+)!! 1083 and $type .= $1; 1084 } 1085 1086 $line= $self->_wrap_and_line_up_cond($type, $cond . $comment); 1087 } 1088 $line_data->{line}= $line; 1089 if ($post_process_content and $line_data->{type} eq "content") { 1090 $post_process_content->($self, $line_data); 1091 } 1092 if ($filter and $line_data->{type} eq "content") { 1093 $filter->($self, $line_data) or next; 1094 } 1095 $ret .= $line_data->{line}; 1096 } 1097 return $ret; 1098} 1099 1100# Text::Wrap::wrap has an odd api, so hide it behind a wrapper 1101# sub which sets things up properly. 1102sub _my_wrap { 1103 my ($head, $rest, $line)= @_; 1104 local $Text::Wrap::unexpand= 0; 1105 local $Text::Wrap::huge= "overflow"; 1106 local $Text::Wrap::columns= 78; 1107 unless (length $line) { return $head } 1108 $line= wrap $head, $rest, $line; 1109 return $line; 1110} 1111 1112# recursively extract the && expressions from a parse tree, 1113# returning the result as strings. 1114# if $node is not a '&&' op then it returns $node as a string, 1115# otherwise it returns the string form of the arguments to the 1116# '&&' op, recursively flattening any '&&' nodes that it might 1117# contain. 1118sub _and_clauses { 1119 my ($self, $node)= @_; 1120 1121 my @ret; 1122 if (@$node < 3 or $node->[0] ne "&&") { 1123 return $self->_pt_as_str($node); 1124 } 1125 foreach my $idx (1 .. $#$node) { 1126 push @ret, $self->_and_clauses($node->[$idx]); 1127 } 1128 return @ret; 1129} 1130 1131# recursively walk the a parse tree, and return the literal 1132# terms it contains, ignoring any operators in the optree. 1133sub _terms { 1134 my ($self, $node)= @_; 1135 if (@$node == 1) { 1136 return $self->_pt_as_str($node); 1137 } 1138 my @ret; 1139 if (@$node == 2) { 1140 if ($node->[0] eq "?") { 1141 push @ret, map { $self->_terms($_) } @{ $node->[1] }; 1142 } 1143 else { 1144 push @ret, $self->_terms($node->[1]); 1145 } 1146 } 1147 else { 1148 foreach my $i (1 .. $#$node) { 1149 push @ret, $self->_terms($node->[$i]); 1150 } 1151 } 1152 return @ret; 1153} 1154 1155# takes a HeaderLine "cond" AoA and flattens it into 1156# a single expression, and then extracts all the and clauses 1157# it contains. Thus [['defined(A)'],['defined(B)']] and 1158# [['defined(A) && defined(B)']], end up as ['defined(A)','defined(B)'] 1159sub _flatten_cond { 1160 my ($self, $cond_ary)= @_; 1161 1162 my $expr= join " && ", map { 1163 map { "($_)" } 1164 @$_ 1165 } @$cond_ary; 1166 return [] unless $expr; 1167 my $tree= $self->parse_expr($expr); 1168 my %seen; 1169 my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree); 1170 return \@and_clause; 1171} 1172 1173# Find the best path into a tree of conditions, such that 1174# we reuse the maximum number of existing branches. Returning 1175# two arrays, the first contain the parts of $cond_array that 1176# make up the best path, in the best path order, and a second array 1177# with the remaining items in the initial order they were provided. 1178# Thus if we have previously stored only the path "A", "B", "C" 1179# into the tree, and want to find the best path for 1180# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"], 1181# 1182# This used to reduce the number of conditions in the grouped content, 1183# and is especially helpful with dealing with DEBUGGING related 1184# functionality. It is coupled with careful control over the order 1185# that we add paths and conditions to the tree. 1186sub _best_path { 1187 my ($self, $tree_node, $cond_array, @path)= @_; 1188 my $best= \@path; 1189 my $rest= $cond_array; 1190 foreach my $cond (@$cond_array) { 1191 if ($tree_node->{$cond}) { 1192 my ($new_best, $new_rest)= 1193 $self->_best_path($tree_node->{$cond}, 1194 [ grep $_ ne $cond, @$cond_array ], 1195 @path, $cond); 1196 if (@$new_best > @$best) { 1197 ($best, $rest)= ($new_best, $new_rest); 1198 } 1199 } 1200 } 1201 if (@$best == @path) { 1202 foreach my $cond (@$cond_array) { 1203 my $not_cond= $self->tidy_cond("!($cond)"); 1204 if ($tree_node->{$not_cond}) { 1205 $best= [ @path, $cond ]; 1206 $rest= [ grep $_ ne $cond, @$cond_array ]; 1207 last; 1208 } 1209 } 1210 } 1211 return ($best, $rest); 1212} 1213 1214# This builds a group content tree from a set of lines. each content line in 1215# the original file is added to the file based on the conditions that apply to 1216# the content. 1217# 1218# The tree is made up of nested HoH's with keys in the HoH being normalized 1219# clauses from the {cond} data in the HeaderLine objects. 1220# 1221# Care is taken to minimize the number of pathways and to reorder clauses to 1222# reuse existing pathways and minimize the total number of conditions in the 1223# file. 1224# 1225# The '' key of a hash contains an array of the lines that are part of the 1226# condition that lead to that key. Thus lines with no conditions are in 1227# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be 1228# in $tree{"defined(A)"}{"defined(B)"}{""}. 1229# 1230# The result of this sub is normally passed into __recurse_group_content_tree() 1231# which converts it back into a set of HeaderLine objects. 1232# 1233sub _build_group_content_tree { 1234 my ($self, $lines)= @_; 1235 $lines ||= $self->{lines}; 1236 my $filter= $self->{filter_content}; 1237 my %seen_normal; 1238 foreach my $line_data (@$lines) { 1239 next if $line_data->{type} ne "content"; 1240 next if $filter and !$filter->($self, $line_data); 1241 my $cond_frames= $line_data->{cond}; 1242 my $cond_frame= $self->_flatten_cond($cond_frames); 1243 my $flat_merged= join " && ", map "($_)", @$cond_frame; 1244 my $normalized; 1245 if (@$cond_frame) { 1246 $normalized= $self->tidy_cond($flat_merged); 1247 } 1248 else { 1249 $normalized= $flat_merged; # empty string 1250 } 1251 push @{ $seen_normal{$normalized} }, $line_data; 1252 } 1253 my @debugging; 1254 my @non_debugging; 1255 foreach my $key (keys %seen_normal) { 1256 if ($key =~ /DEBUGGING/) { 1257 push @debugging, $key; 1258 } 1259 else { 1260 push @non_debugging, $key; 1261 } 1262 } 1263 @non_debugging= 1264 sort { length($a) <=> length($b) || $a cmp $b } @non_debugging; 1265 @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging; 1266 my %tree; 1267 foreach my $normal_expr (@non_debugging, @debugging) { 1268 my $all_line_data= $seen_normal{$normal_expr}; 1269 1270 my $cond_frame= 1271 (length $normal_expr) 1272 ? $self->_flatten_cond([ [$normal_expr] ]) 1273 : []; 1274 @$cond_frame= $self->_sort_terms(@$cond_frame); 1275 my $node= \%tree; 1276 my ($best, $rest)= $self->_best_path($node, $cond_frame); 1277 die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame 1278 unless @$best + @$rest == @$cond_frame; 1279 1280 foreach my $cond (@$best, @$rest) { 1281 $node= $node->{$cond} ||= {}; 1282 } 1283 push @{ $node->{''} }, @$all_line_data; 1284 } 1285 1286 warn $self->dd(\%tree) if $self->{debug}; 1287 $self->{tree}= \%tree; 1288 return \%tree; 1289} 1290 1291sub _recurse_group_content_tree { 1292 my ($self, $node, @path)= @_; 1293 1294 my @ret; 1295 local $self->{rgct_ret}= \@ret; 1296 local $self->{line_by_depth}= []; 1297 1298 $self->__recurse_group_content_tree($node, @path); 1299 return \@ret; 1300} 1301 1302# convert a tree of conditions constructed by _build_group_content_tree() 1303# and turn it into a set of HeaderLines that represents it. Performs the 1304# appropriate sets required to reconstitute an if/elif/elif/else sequence 1305# by calling _handle_else(). 1306sub __recurse_group_content_tree { 1307 my ($self, $node, @path)= @_; 1308 my $depth= 0 + @path; 1309 my $ind= $self->indent_chars($depth); 1310 my $ret= $self->{rgct_ret}; 1311 if ($node->{''}) { 1312 if (my $cb= $self->{post_process_grouped_content}) { 1313 $cb->($self, $node->{''}, \@path); 1314 } 1315 if (my $cb= $self->{post_process_content}) { 1316 $cb->($self, $_, \@path) for @{ $node->{''} }; 1317 } 1318 push @$ret, map { 1319 HeaderLine->new( 1320 %$_, 1321 cond => [@path], 1322 level => $depth, 1323 start_line_num => 0 + @$ret 1324 ) 1325 } @{ $node->{''} }; 1326 } 1327 1328 my %skip; 1329 foreach my $expr ( 1330 map { $_->[0] } 1331 sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] } 1332 map { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node 1333 ) { 1334 next unless length $expr; # ignore payload 1335 my $not= $self->tidy_cond("!($expr)"); 1336 if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) { 1337 next; 1338 } 1339 my $kid= $node->{$expr}; 1340 while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) { 1341 my ($kid_key)= keys(%$kid); 1342 $expr= $self->tidy_cond("($expr) && ($kid_key)"); 1343 $kid= $kid->{$kid_key}; 1344 my $new_not= $self->tidy_cond("!($expr)"); 1345 if ($node->{$new_not}) { 1346 $not= $new_not; 1347 $skip{$not}++; 1348 } 1349 } 1350 my $raw= "#${ind}if $expr\n"; 1351 my $hl= HeaderLine->new( 1352 type => "cond", 1353 sub_type => "#if", 1354 raw => $raw, 1355 line => $raw, 1356 level => $depth, 1357 cond => [ @path, [$expr] ], 1358 start_line_num => 0 + @$ret, 1359 ); 1360 $self->{line_by_depth}[$depth]= 0 + @$ret; 1361 push @$ret, $hl; 1362 $self->__recurse_group_content_tree($kid, @path, [$expr]); 1363 if ($node->{$not}) { 1364 $skip{$not}++; 1365 $self->_handle_else($not, $node->{$not}, $ind, $depth, @path, 1366 [$not]); 1367 } 1368 1369 # and finally the #endif 1370 1371 $raw= "#${ind}endif\n"; 1372 1373 # we need to extract the condition information from the last line in @ret, 1374 # as we don't know which condition we are ending here. It could be an elsif 1375 # from deep in the parse tree for instance. 1376 # So we need to extract the last frame from the cond structure in the last 1377 # line-info in @ret. 1378 # BUT if this last line is itself an #endif, then we need to take the second 1379 # to last line instead, as the endif would have "popped" that frame off the 1380 # condition stack. 1381 my $last_ret= $ret->[-1]; 1382 my $idx= 1383 ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif") 1384 ? -2 1385 : -1; 1386 my $end_line= HeaderLine->new( 1387 type => "cond", 1388 sub_type => "#endif", 1389 raw => $raw, 1390 line => $raw, 1391 level => $depth, 1392 cond => [ @path, $last_ret->{cond}[$idx] ], 1393 start_line_num => 0 + @$ret, 1394 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1395 ); 1396 undef $self->{line_by_depth}[$depth]; 1397 push @$ret, $end_line; 1398 } 1399 return $ret; 1400} 1401 1402# this handles the specific case of an else clause, detecting 1403# when an elif can be constructed, may recursively call itself 1404# to deal with if/elif/elif/else chains. Calls back into 1405# __recurse_group_content_tree(). 1406sub _handle_else { 1407 my ($self, $not, $kid, $ind, $depth, @path)= @_; 1408 1409 # extract the first 3 keys - from this we can detect 1410 # which of the three scenarios we have to handle. 1411 my ($k1, $k2, $k3)= 1412 sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; 1413 my $not_k1; 1414 if (length($k1) and defined($k2) and !defined($k3)) { 1415 1416 # if we do not have a payload (length($k1)) and we have exactly 1417 # two keys (defined($k2) and !defined($k3)) we need to compute 1418 # the inverse of $k1, which we will use later. 1419 $not_k1= $self->tidy_cond("!($k1)"); 1420 } 1421 my $ret= $self->{rgct_ret}; 1422 if (length($k1) and !defined($k2)) { 1423 1424 # only one child, no payload -> elsif $k1 1425 my $sub_expr; 1426 do { 1427 $sub_expr= 1428 !$sub_expr 1429 ? $k1 1430 : $self->tidy_cond("($sub_expr) && ($k1)"); 1431 $kid= $kid->{$k1}; 1432 ($k1, $k2)= 1433 sort { length($a) <=> length($b) || $a cmp $b } keys %$kid; 1434 } while length($k1) and !defined $k2; 1435 1436 my $raw= "#${ind}elif $sub_expr\n"; 1437 push @{ $path[-1] }, $sub_expr; 1438 my $hl= HeaderLine->new( 1439 type => "cond", 1440 sub_type => "#elif", 1441 raw => $raw, 1442 line => $raw, 1443 level => $depth, 1444 cond => [ map { [@$_] } @path ], 1445 start_line_num => 0 + @$ret, 1446 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1447 ); 1448 $self->{line_by_depth}[$depth]= 0 + @$ret; 1449 push @$ret, $hl; 1450 $self->__recurse_group_content_tree($kid, @path); 1451 } 1452 elsif (defined($not_k1) and $not_k1 eq $k2) { 1453 1454 # two children which are complementary, no payload -> elif $k1 else.. 1455 my $raw= "#${ind}elif $k1\n"; 1456 1457 push @{ $path[-1] }, $k1; 1458 my $hl= HeaderLine->new( 1459 type => "cond", 1460 sub_type => "#elif", 1461 raw => $raw, 1462 line => $raw, 1463 level => $depth, 1464 cond => [ map { [@$_] } @path ], 1465 start_line_num => 0 + @$ret, 1466 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1467 ); 1468 $self->{line_by_depth}[$depth]= 0 + @$ret; 1469 push @$ret, $hl; 1470 $self->__recurse_group_content_tree($kid->{$k1}, @path); 1471 $path[-1][-1]= $k2; 1472 $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path); 1473 } 1474 else { 1475 # payload, 3+ children, or 2 which are not complementary -> else 1476 my $raw= "#${ind}else\n"; 1477 my $hl= HeaderLine->new( 1478 type => "cond", 1479 sub_type => "#else", 1480 raw => $raw, 1481 line => $raw, 1482 level => $depth, 1483 cond => [ map { [@$_] } @path ], 1484 start_line_num => 0 + @$ret, 1485 inner_lines => @$ret - $self->{line_by_depth}[$depth], 1486 ); 1487 $self->{line_by_depth}[$depth]= 0 + @$ret; 1488 push @$ret, $hl; 1489 $self->__recurse_group_content_tree($kid, @path); 1490 } 1491 return $ret; 1492} 1493 1494# group the content in lines by the condition that apply to them 1495# returns a set of lines representing the new structure 1496sub group_content { 1497 my ($self, $lines, $filter)= @_; 1498 $lines ||= $self->{lines}; 1499 local $self->{filter_content}= $filter || $self->{filter_content}; 1500 my $tree= $self->_build_group_content_tree($lines); 1501 return $self->_recurse_group_content_tree($tree); 1502} 1503 1504#read a file by name - opens the file and passes the fh into parse_fh(). 1505sub read_file { 1506 my ($self, $file_name, $callback)= @_; 1507 $self= $self->new() unless ref $self; 1508 local $self->{parse_source}= $file_name; 1509 open my $fh, "<", $file_name 1510 or confess "Failed to open '$file_name' for read: $!"; 1511 my $lines= $self->parse_fh($fh); 1512 if ($callback) { 1513 foreach my $line (@$lines) { 1514 $callback->($self, $line); 1515 } 1516 } 1517 return $self; 1518} 1519 1520# These are utility methods for the HeaderLine objects. 1521sub HeaderLine::new { 1522 my ($class, %self)= @_; 1523 return bless \%self, $class; 1524} 1525sub HeaderLine::cond { $_[0]->{cond} } # AoA 1526sub HeaderLine::type { $_[0]->{type} } 1527sub HeaderLine::type_is { return $_[0]->type eq $_[1] ? 1 : 0 } 1528sub HeaderLine::sub_type { $_[0]->{sub_type} } 1529sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 } 1530sub HeaderLine::raw { $_[0]->{raw} } 1531sub HeaderLine::flat { $_[0]->{flat} } 1532sub HeaderLine::line { $_[0]->{line} } 1533sub HeaderLine::level { $_[0]->{level} } 1534sub HeaderLine::is_content { return $_[0]->type_is("content") } 1535sub HeaderLine::is_cond { return $_[0]->type_is("cond") } 1536sub HeaderLine::is_define { return $_[0]->sub_type_is("#define") } 1537sub HeaderLine::line_num { $_[0]->{start_line_num} } 1538sub HeaderLine::inner_lines { $_[0]->{inner_lines} } 1539sub HeaderLine::n_lines { $_[0]->{n_lines} } 1540sub HeaderLine::embed { $_[0]->{embed} } 1541*HeaderLine::start_line_num= *HeaderLine::line_num; 1542 1543# these are methods for EmbedLine objects 1544*EmbedLine::new= *HeaderLine::new; 1545sub EmbedLine::flags { $_[0]->{flags} } 1546sub EmbedLine::return_type { $_[0]->{return_type} } 1547sub EmbedLine::name { $_[0]->{name} } 1548sub EmbedLine::args { $_[0]->{args} } # array ref 1549 15501; 1551 1552__END__ 1553 1554=head1 NAME 1555 1556HeaderParser - A minimal header file parser that can be hooked by other porting 1557scripts. 1558 1559=head1 SYNOPSIS 1560 1561 my $o= HeaderParser->new(); 1562 my $lines= $o->parse_fh($fh); 1563 1564=head1 DESCRIPTION 1565 1566HeaderParser is a tool to parse C preprocessor header files. The tool 1567understands the syntax of preprocessor conditions, and is capable of creating 1568a parse tree of the expressions involved, and normalizing them as well. 1569 1570C preprocessor files are a bit tricky to parse properly, especially with a 1571"line by line" model. There are two issues that must be dealt with: 1572 1573=over 4 1574 1575=item Line Continuations 1576 1577Any line ending in "\\\n" (that is backslash newline) is considered to be part 1578of a longer string which continues on the next line. Processors should replace 1579the "\\\n" typically with a space when converting to a "real" line. 1580 1581=item Comments Acting As A Line Continuation 1582 1583The rules for header files stipulates that C style comments are stripped 1584before processing other content, this means that comments can serve as a form 1585of line continuation: 1586 1587 #if defined(foo) /* 1588 */ && defined(bar) 1589 1590is the same as 1591 1592 #if defined(foo) && defined(bar) 1593 1594This type of comment usage is often overlooked by people writing header file 1595parsers for the first time. 1596 1597=item Indented pre processor directives. 1598 1599It is easy to forget that there may be multiple spaces between the "#" 1600character and the directive. It also easy to forget that there may be spaces 1601in *front* of the "#" character. Both of these cases are often overlooked. 1602 1603=back 1604 1605The main idea of this module is to provide a single framework for correctly 1606parsing the content of our header files in a consistent manner. A secondary 1607purpose it to make various tasks we want to do easier, such as normalizing 1608content or preprocessor expressions, or just extracting the real "content" of 1609the file properly. 1610 1611=head2 parse_fh 1612 1613This function parses a filehandle into a set of lines. Each line is represented by a hash 1614based object which contains the following fields: 1615 1616 bless { 1617 cond => [['defined(a)'],['defined(b)']], 1618 type => "content", 1619 sub_type => undef, 1620 raw => $raw_content_of_line, 1621 line => $normalized_content_of_line, 1622 level => $level, 1623 source => $filename_or_string, 1624 start_line_num => $line_num_for_first_line, 1625 n_lines => $line_num - $line_num_for_first_line, 1626 }, "HeaderLine" 1627 1628A "line" in this context is a logical line, and because of line continuations 1629and comments may contain more than one physical line, and thus more than 1630one newline, but will always include at least one and will always end with one 1631(unless there is no newline at the end of the file). Thus 1632 1633 before /* 1634 this is a comment 1635 */ after \ 1636 and continues 1637 1638will be treated as a single logical line even though the content is 1639spread over four lines. 1640 1641=over 4 1642 1643=item cond 1644 1645An array of arrays containing the normalized expressions of any C preprocessor 1646conditional blocks which include the line. Each line has its own copy of the 1647conditions it was operated on currently, but that may change so dont alter 1648this data. The inner arrays may contain more than one element. If so then the 1649line is part of an "#else" or "#elsif" and the clauses should be considered to 1650be a conjuction when considering "when is this line included", however when 1651considered as part of an if/elsif/else, each added clause represents the most 1652recent condition. In the following you can see how: 1653 1654 before /* cond => [ ] */ 1655 #if A /* cond => [ ['A'] ] */ 1656 do-a /* cond => [ ['A'] ] */ 1657 #elif B /* cond => [ ['!A', 'B'] ] */ 1658 do-b /* cond => [ ['!A', 'B'] ] */ 1659 #else /* cond => [ ['!A', '!B'] ] */ 1660 do-c /* cond => [ ['!A', '!B'] ] */ 1661 # if D /* cond => [ ['!A', '!B'], ['D'] ] */ 1662 do-d /* cond => [ ['!A', '!B'], ['D'] ] */ 1663 # endif /* cond => [ ['!A', '!B'], ['D'] ] */ 1664 #endif /* cond => [ ['!A', '!B'] ] */ 1665 after /* cond => [ ] */ 1666 1667So in the above we can see how the three clauses of the if produce 1668a single "frame" in the cond array, but that frame "grows" and changes 1669as additional else clauses are added. When an entirely new if block 1670is started (D) it gets its own block. Each endif includes the clause 1671it terminates. 1672 1673=item type 1674 1675This value indicates the type of the line. This may be one of the following: 1676'content', 'cond', 'define', 'include' and 'error'. Several of the types 1677have a sub_type. 1678 1679=item sub_type 1680 1681This value gives more detail on the type of the line where necessary. 1682Not all types have a subtype. 1683 1684 Type | Sub Type 1685 --------+---------- 1686 content | text 1687 | include 1688 | define 1689 | error 1690 cond | #if 1691 | #elif 1692 | #else 1693 | #endif 1694 1695Note that there are no '#ifdef' or '#elifndef' or similar expressions. All 1696expressions of that form are normalized into the '#if defined' form to 1697simplify processing. 1698 1699=item raw 1700 1701This was the raw original text before HeaderParser performed any modifications 1702to it. 1703 1704=item line 1705 1706This is the normalized and modified text after HeaderParser or any callbacks 1707have processed it. 1708 1709=item level 1710 1711This is the "indent level" of a line and corresponds to the number of blocks 1712that the line is within, not including any blocks that might be created by 1713the line itself. 1714 1715 before /* level => 0 */ 1716 #if A /* level => 0 */ 1717 do-a /* level => 1 */ 1718 #elif B /* level => 0 */ 1719 do-b /* level => 1 */ 1720 #else /* level => 0 */ 1721 do-c /* level => 1 */ 1722 # if D /* level => 1 */ 1723 do-d /* level => 2 */ 1724 # endif /* level => 1 */ 1725 #endif /* level => 0 */ 1726 after /* level => 0 */ 1727 1728=back 1729 1730parse_fh() will throw an exception if it encounters a malformed expression 1731or input it cannot handle. 1732 1733=head2 lines_as_str 1734 1735This function will return a string representation of the lines it is provided. 1736 1737=head2 group_content 1738 1739This function will group the text in the file by the conditions which contain 1740it. This is only useful for files where the content is essentially a list and 1741where changing the order that lines are output in will not break the resulting 1742file. 1743 1744Each content line will be grouped into a structure of nested if/else blocks 1745(elif will produce a new nested block) such that the content under the control 1746of a given set of normalized condition clauses are grouped together in the order 1747the occurred in the file, such that each combined conditional clause is output 1748only once. 1749 1750This means a file like this: 1751 1752 #if A 1753 A 1754 #elif K 1755 AK 1756 #else 1757 ZA 1758 #endif 1759 #if B && Q 1760 B 1761 #endif 1762 #if Q && B 1763 BC 1764 #endif 1765 #if A 1766 AD 1767 #endif 1768 #if !A 1769 ZZ 1770 #endif 1771 1772Will end up looking roughly like this: 1773 1774 #if A 1775 A 1776 AD 1777 #else 1778 ZZ 1779 # if K 1780 AK 1781 # else 1782 ZA 1783 # endif 1784 #endif 1785 #if B && Q 1786 B 1787 BC 1788 #endif 1789 1790Content at a given block level always goes before conditional clauses 1791at the same nesting level. 1792 1793=head2 HOOKS 1794 1795There are severals hooks that are available, C<pre_process_content> and 1796C<post_process_content>, and C<post_process_grouped_content>. All of these 1797hooks will be called with the HeaderParser object as the first argument. 1798The "process_content" callbacks will be called with a line hash as the second 1799argument, and C<post_process_grouped_content> will be called with an 1800array of line hashes for the content in that group, so that the array may be 1801modified or sorted. Callbacks called from inside of C<group_content()> 1802(that is C<post_process_content> and C<post_process_grouped_content> will be 1803called with an additional argument containing and array specifying the actual 1804conditional "path" to the content (which may differ somewhat from the data in 1805a lines "cond" property). 1806 1807These hooks may do what they like, but generally they will modify the 1808"line" property of the line hash to change the final output returned 1809by C<lines_as_str()> or C<group_content()>. 1810 1811=head2 FORMATTING AND INDENTING 1812 1813Header parser tries hard to produce neat and readable output with a consistent 1814style and form. For example: 1815 1816 #if defined(FOO) 1817 # define HAS_FOO 1818 # if defined(BAR) 1819 # define HAS_FOO_AND_BAR 1820 # else /* !defined(BAR) */ 1821 # define HAS_FOO_NO_BAR 1822 # endif /* !defined(BAR) */ 1823 #endif /* defined(FOO) */ 1824 1825HeaderParser uses two space tab stops for indenting C pre-processor 1826directives. It puts the spaces between the "#" and the directive. The "#" is 1827considered "part" of the indent, even though the space comes after it. This 1828means the first indent level "looks" like one space, and following indents 1829look like 2. This should match what a sensible editor would do with two space 1830tab stops. The C<indent_chars()> method can be used to convert an indent level 1831into a string that contains the appropriate number of spaces to go in between 1832the "#" and the directive. 1833 1834When emitting "#endif", "#elif" and "#else" directives comments will be 1835emitted also to show the conditions that apply. These comments may be wrapped 1836to cover multiple lines. Some effort is made to get these comments to line up 1837visually, but it uses heuristics which may not always produce the best result. 1838 1839=cut 1840