xref: /openbsd/gnu/usr.bin/perl/regen/HeaderParser.pm (revision 5486feef)
1package HeaderParser;
2use strict;
3use warnings;
4
5BEGIN {
6    if ($] < 5.010) {
7        die <<EOFMSG;
8The regen code in the perl repo needs at least version 5.10 to run.
9You are currently using $], we recommend you install a more modern
10version of perl before you try to build this version of perl.
11EOFMSG
12    }
13}
14
15# these are required below in BEGIN statements, we cant have a
16# hard dependency on them as they might not be available when
17# we run as part of autodoc.pl
18#
19# use Data::Dumper;
20# use Storable qw(dclone);
21#
22use Carp       qw(confess);
23use Text::Tabs qw(expand unexpand);
24use Text::Wrap qw(wrap);
25
26# The style of this file is determined by:
27#
28# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
29#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs  \
30#   -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2
31
32my (
33    %unop,              # unary operators and their precedence
34    %binop,             # binary operators and their precedence
35    %is_right_assoc,    # operators which are right associative
36    %precedence,        # precedence of all operators.
37    %associative,       # associative operators
38    %commutative,       # commutative operators
39    %cmpop,             # comparison operators
40    $unop_pat,          # pattern to match unary operators
41    $binop_pat,         # pattern to match binary operators
42    %op_names,          # map of op to description, used in error messages
43    $tokenize_pat       # a pattern which can tokenize an expression
44);
45
46BEGIN {
47    # this is initialization for the operator precedence expression parser
48    # we use for handling preprocessor conditions.
49    %op_names= (
50        '==' => 'equality',
51        '!=' => 'inequality',
52        '<<' => 'bit-shift-left',
53        '>>' => 'bit-shift-right',
54        '+'  => 'addition',
55        '-'  => 'subtraction',
56        '*'  => 'multiplication',
57        '/'  => 'division',
58        '%'  => 'modulo',
59        '||' => 'logical-or',       # Lowest precedence
60        '&&' => 'logical-and',
61        '|'  => 'binary-or',
62        '^'  => 'binary-xor',
63        '&'  => 'binary-and',
64        '<'  => 'less-than',        # split on spaces, all with equal precedence
65        '>'  => 'greater-than',
66        '<=' => 'less-than-or-equal',
67        '>=' => 'greater-than-or-equal',
68    );
69    my @cmpop= (
70        '== !=',                    # listed in lowest to highest precedence
71        '< > <= >=',                # split on spaces, all with equal precedence
72    );
73    my @binop= (
74        '||',                       # Lowest precedence
75        '&&',
76        '|',
77        '^',
78        '&',
79        @cmpop,    # include the numerical comparison operators.
80        '<< >>',
81        '+ -',
82        '* / %',    # highest prcedence operators.
83    );
84
85    my @unop= qw( ! ~ + - );
86    %unop= map  { $_ => 1 } @unop;
87    %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop;
88    %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop;
89
90    my $make_pat= sub {
91        my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b }
92            map quotemeta($_), @_;
93        return qr/$pat/;
94    };
95    $unop_pat= $make_pat->(@unop);
96    foreach my $ix (0 .. $#binop) {
97        my $sym= $binop[$ix];
98        $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym;
99    }
100    $is_right_assoc{"?"}= 1;
101    $is_right_assoc{":"}= 1;
102    $precedence{"?"}= 1;
103    $precedence{":"}= 0;
104
105    $associative{$_}++
106        for qw( || && + *);    # we leave '==' out so we don't reorder terms
107    $commutative{$_}++ for qw( || && + *);
108
109    $binop_pat= $make_pat->(keys %precedence);
110    $tokenize_pat= qr/
111     ^(?:
112        (?<comment> \/\*.*?\*\/ )
113      | (?<ws>      \s+ )
114      | (?<term>
115            (?<literal>
116                (?<define> defined\(\w+\) )
117            |   (?<func>   \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) )
118            |   (?<const>  (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') )
119            |   (?<sym>    \w+ )
120            )
121        |   (?<op> $binop_pat | $unop_pat )
122        |   (?<paren> [\(\)] )
123        )
124      )
125    /xs;
126}
127
128# dump the arguments with dump. wraps loading Dumper
129# as we are executed by miniperl where Dumper isnt available
130sub dd {
131    my $self= shift;
132    local $self->{orig_content};
133    my $ret= "(dump not available)";
134    eval {
135        require Data::Dumper;
136        $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump();
137    };
138    return $ret;
139}
140
141my $has_storable;
142
143# same story here, in miniperl we use slow perl code,
144# in real perl we can use Storable and speed things up.
145BEGIN { eval "use Storable; \$has_storable=1;" }
146
147# recursively copy an AoAoA...
148sub copy_aoa {
149    my ($aoa)= @_;
150    if ($has_storable) {
151        return Storable::dclone($aoa);
152    }
153    else {
154        return _copy_aoa($aoa);
155    }
156}
157
158sub _copy_aoa {
159    my ($thing)= @_;
160    if (ref $thing) {
161        return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ];
162    }
163    else {
164        return $thing;
165    }
166}
167
168# return the number characters that should go in between a '#' and
169# the name of a c preprocessor directive. Returns 0 spaces for level
170# 0, and 2 * ($level - 1) + 1 spaces for the rest. (1,3,5, etc)
171# This might sound weird, but consider these are tab *stops* and the
172# '#' is included in the total. which means indents of 2, 4, 6 etc.
173sub indent_chars {
174    my ($self, $level)= @_;
175    my $ind= "";
176    $ind .= " "                 if $level;
177    $ind .= "  " x ($level - 1) if $level > 1;
178    return $ind;
179}
180
181# we use OO to store state, etc.
182sub new {
183    my ($class, %args)= @_;
184    $args{add_commented_expr_after} //= 10;
185    $args{max_width} //= 78;
186    $args{min_break_width} //= 70;
187    return bless \%args,;
188}
189
190# this parses the expression into an array of tokens
191# this is somewhat crude, we could do this incrementally
192# if we wanted and avoid the overhead. but it makes it
193# easier to debug the tokenizer.
194sub _tokenize_expr {
195    my ($self, $expr)= @_;
196    delete $self->{tokens};
197    delete $self->{parse_tree};
198    $self->{original_expr}= $expr;
199
200    my @tokens;
201    while ($expr =~ s/$tokenize_pat//xs) {
202        push @tokens, {%+} if defined $+{'term'};
203    }
204    $self->{tokens}= \@tokens;
205    warn $self->dd($self) if $self->{debug};
206    if (length $expr) {
207        confess "Failed to tokenize_expr: $expr\n";
208    }
209    return \@tokens;
210}
211
212sub _count_ops {
213    my ($self, $term)= @_;
214    my $count= 0;
215    $count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx;
216    return $count;
217}
218
219# sort terms in an expression in a way that puts things
220# in a sensible order. Anything starting with PERL_IN_
221# should be on the left in alphabetical order. Digits
222# should be on the right (eg 0), and ties are resolved
223# by stripping non-alpha-numerc, thus removing underbar
224# parens, spaces, logical operators, etc, and then by
225# lc comparison of the result.
226sub _sort_terms {
227    my $self= shift;
228    my (@terms)= map {
229        [
230            $_,                                # 0: raw
231            lc($_) =~ s/[^a-zA-Z0-9]//gr,      # 1: "_" stripped and caseless
232            $_     =~ m/PERL_IN_/  ? 1 : 0,    # 2: PERL_IN_ labeled define
233            $_     =~ m/^\d/       ? 1 : 0,    # 3: digit
234            $_     =~ m/DEBUGGING/ ? 1 : 0,    # 4: DEBUGGING?
235            $self->_count_ops($_),             # 5: Number of ops (||, &&)
236        ]
237    } @_;
238    my %seen;
239    #start-no-tidy
240    @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] }
241        sort {
242            $a->[5] <=> $b->[5]         ||    # least number of ops
243            $b->[2] <=> $a->[2]         ||    # PERL_IN before others
244            $a->[3] <=> $b->[3]         ||    # digits after others
245            $a->[4] <=> $b->[4]         ||    # DEBUGGING after all else
246            $a->[1] cmp $b->[1]         ||    # stripped caseless cmp
247            lc($a->[0]) cmp lc($b->[0]) ||    # caseless cmp
248            $a->[0] cmp $b->[0]         ||    # exact cmp
249            0
250        } @terms;
251    #end-no-tidy
252    return @terms;
253}
254
255# normalize a condition expression by parsing it and then stringifying
256# the parse tree.
257sub tidy_cond {
258    my ($self, $expr)= @_;
259    my $ret= $self->{_tidy_cond_cache}{$expr} //= do {
260        $self->parse_expr($expr) if defined $expr;
261        my $text= $self->_pt_as_str();
262        $text;
263    };
264    $self->{last_expr}= $ret;
265    return $ret;
266}
267
268# convert a parse tree structure to a string recursively.
269#
270# Parse trees are currently made up of arrays, with the count
271# of items in the object determining the type of op it represents.
272# 1 argument:  literal value of some sort.
273# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree
274#            : ternary: 0 slot holds '?', 1 is an array holding three
275#                       parse trees: cond, true, false
276# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees
277#                    : note, this is multigate for commutative operators like
278#                    : "+", "*", "&&" and "||", so an expr
279#                    : like "A && B && !C" would be represented as:
280#                    : [ "&&", ["A"], ["B"], [ "!",["C"] ] ]
281#
282sub _pt_as_str {
283    my ($self, $node, $parent_op, $depth)= @_;
284
285    $node ||= $self->{parse_tree}
286        or confess "No parse tree?";
287    $depth ||= 0;
288    if (@$node == 1) {
289
290        # its a literal
291        return $node->[0];
292    }
293    elsif (@$node == 2) {
294
295        # is this a ternary or an unop?
296        if ($node->[0] eq '?') {
297
298            # ternary, the three "parts" are tucked away in
299            # an array in the payload slot
300            my $expr=
301                  $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? "
302                . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : "
303                . $self->_pt_as_str($node->[1][2], "?", $depth + 1);
304
305            # stick parens on if this is a subexpression
306            $expr= "( " . $expr . " )" if $depth;
307            return $expr;
308        }
309        else {
310            if (    $node->[0] eq "!"
311                and @{ $node->[1] } == 2
312                and $node->[1][0] eq "!")
313            {
314                # normalize away !! in expressions.
315                return $self->_pt_as_str($node->[1][1], $parent_op, $depth);
316            }
317
318            # unop - the payload is a optree
319            return $node->[0]
320                . $self->_pt_as_str($node->[1], $node->[0], $depth + 1);
321        }
322    }
323
324    # if we get here we are dealing with a binary operator
325    # the nodes are not necessarily binary, as we "collect"
326    # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D]
327    my ($op, @terms)= @$node;
328
329    # convert the terms to strings
330    @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms;
331
332    # sort them to normalize the subexpression
333    my $expr=
334        join " $op ", $associative{$op}
335        ? $self->_sort_terms(@terms)
336        : @terms;
337
338    # stick parens on if this is a subexpression
339    $expr= "( " . $expr . " )" if $depth and !$cmpop{$op};
340
341    # and we are done.
342    return $expr;
343}
344
345# Returns the precedence of an operator, returns 0 if there is no token
346# or the next token is not an op, or confesss if it encounters an op it does not
347# know.
348sub _precedence {
349    my $self= shift;
350    my $token= shift // return 0;
351
352    my $op= (ref $token ? $token->{op} : $token) // return 0;
353
354    return $precedence{$op} // confess "Unknown op '$op'";
355}
356
357# entry point into parsing the tokens, checks that we actually parsed everything
358# and didnt leave anything in the token stream (possible from a malformed expression)
359# Performs some minor textual cleanups using regexes, but then does a proper parse
360# of the expression.
361sub parse_expr {
362    my ($self, $expr)= @_;
363    if (defined $expr) {
364        $expr =~ s/\s*\\\n\s*/ /g;
365        $expr =~ s/defined\s+(\w+)/defined($1)/g;
366        $self->_tokenize_expr($expr);
367    }
368    my $ret= $self->_parse_expr();
369    if (@{ $self->{tokens} }) {
370
371        # if all was well with parsing we should not get here.
372        confess "Unparsed tokens: ", $self->dd($self->{tokens});
373    }
374    $self->{parse_tree}= $ret;
375    return $ret;
376}
377
378# this is just a wrapper around _parse_expr_assoc() which handles
379# parsing an arbitrary expression.
380sub _parse_expr {
381    my ($self)= @_;
382    return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1);
383}
384
385# This handles extracting from the token stream
386#  - simple literals
387#  - unops (assumed to be right associative)
388#  - parens (which reset the precedence acceptable to the parser)
389#
390sub _parse_expr_primary {
391    my ($self)= @_;
392    my $tokens= $self->{tokens}
393        or confess "No tokens in _parse_expr_primary?";
394    my $first= $tokens->[0]
395        or confess "No primary?";
396    if ($first->{paren} and $first->{paren} eq "(") {
397        shift @$tokens;
398        my $expr= $self->_parse_expr();
399        $first= $tokens->[0];
400        if (!$first->{paren} or $first->{paren} ne ")") {
401            confess "Expecting close paren", $self->dd($tokens);
402        }
403        shift @$tokens;
404        return $expr;
405    }
406    elsif ($first->{op} and $unop{ $first->{op} }) {
407        my $op_token= shift @$tokens;
408        return [ $op_token->{op}, $self->_parse_expr_primary() ];
409    }
410    elsif (defined $first->{literal}) {
411        shift @$tokens;
412        return [ $first->{literal} ];
413    }
414    else {
415        die sprintf
416            "Unexpected token '%s', expecting literal, unary, or expression.\n",
417            $first->{term};
418    }
419}
420
421# This is the heart of the expression parser. It uses
422# a pair of nested loops to avoid excessive recursion during parsing,
423# which should be a bit faster than other strategies. It only should
424# recurse when the precedence level changes.
425sub _parse_expr_assoc {
426    my ($self, $lhs, $min_precedence)= @_;
427    my $tokens= $self->{tokens}
428        or confess "No tokens in _parse_expr_assoc";
429    my $la= $tokens->[0];                  # lookahead
430    my $la_pr= $self->_precedence($la);    # lookahead precedence
431    while ($la && $la_pr >= $min_precedence) {
432        my $op_token= shift @$tokens;
433        my $op_pr= $la_pr;                 # op precedence
434        if ($op_token->{op} eq "?") {
435            my $mid= $self->_parse_expr();
436            if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") {
437                shift @$tokens;
438                my $tail= $self->_parse_expr();
439                return [ '?', [ $lhs, $mid, $tail ] ];
440            }
441            confess "Panic: expecting ':'", $self->dd($tokens);
442        }
443        my $rhs;
444        eval { $rhs= $self->_parse_expr_primary(); }
445            or die "Error in $op_names{$op_token->{op}} expression: $@";
446        $la= $tokens->[0];
447        $la_pr= $self->_precedence($la);
448        while (
449            $la_pr > $op_pr ||    # any and larger
450            (       $is_right_assoc{ $op_token->{op} }
451                and $la_pr == $op_pr)    # right and equal
452        ) {
453            my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0);
454            $rhs= $self->_parse_expr_assoc($rhs, $new_precedence);
455            $la= $tokens->[0];
456            $la_pr= $self->_precedence($la);
457        }
458        if (   @$lhs >= 3
459            && $lhs->[0] eq $op_token->{op}
460            && $commutative{ $op_token->{op} })
461        {
462            push @$lhs, $rhs;
463        }
464        else {
465            my @lt= ($lhs);
466            my @rt= ($rhs);
467
468            # if we have '( a && b ) && ( c && d)'
469            # turn it into 'a && b && c && d'
470            if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) {
471                (undef,@lt)= @$lhs; # throw away op.
472            }
473            if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) {
474                (undef,@rt)= @$rhs; # throw away op.
475            }
476            $lhs= [ $op_token->{op}, @lt, @rt ];
477        }
478    }
479    return $lhs;
480}
481
482#entry point for normalizing and if/elif statements
483#returns the line and condition in normalized form.
484sub normalize_if_elif {
485    my ($self, $line, $line_info)= @_;
486    if (my $dat= $self->{cache_normalize_if_elif}{$line}) {
487        return $dat->{line}, $dat->{cond};
488    }
489    my ($cond);
490    eval {
491        ($line, $cond)= $self->_normalize_if_elif($line);
492        1;
493    } or die sprintf "Error at line %d\nLine %d: %s\n%s",
494        ($line_info->start_line_num()) x 2, $line, $@;
495    $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond };
496    return ($line, $cond);
497}
498
499#guts of the normalize_if_elif() - cleans up the line, extracts
500#the condition, and then tidies it with tidy_cond().
501sub _normalize_if_elif {
502    my ($self, $line)= @_;
503    my $nl= "";
504    $nl= $1 if $line =~ s/(\n+)\z//;
505    $line =~ s/\s+\z//;
506    my @comment;
507    push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!;
508    $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g;
509    $line =~ s/!\s+defined/!defined/g;
510
511    if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) {
512        my $if= $1;
513        my $not= $2 ? "!" : "";
514        $line= "#$if ${not}defined($3)";
515    }
516    $line =~ s/#((?:el)?if)\s+//
517        or confess "Bad cond: $line";
518    my $if= $1;
519    $line =~ s/!\s+/!/g;
520
521    my $old_cond= $line;
522    my $cond= $self->tidy_cond($old_cond);
523
524    warn "cond - $old_cond\ncond + $cond\n"
525        if $old_cond ne $cond and $self->{debug};
526
527    $line= "#$if $cond";
528    $line .= "  " . join " ", reverse @comment if @comment;
529
530    $line .= $nl;
531    return ($line, $cond);
532}
533
534# parses a text buffer as though it was a file on disk
535# calls parse_fh()
536sub parse_text {
537    my ($self, $text)= @_;
538    local $self->{parse_source}= "(buffer)";
539    open my $fh, "<", \$text
540        or die "Failed to open buffer for read: $!";
541    return $self->parse_fh($fh);
542}
543
544# takes a readable filehandle and parses whatever contents is
545# returned by reading it. Returns an array of HeaderLine objects.
546# this is the main routing for parsing a header file.
547sub parse_fh {
548    my ($self, $fh)= @_;
549    my @lines;
550    my @cond;
551    my @cond_line;
552    my $last_cond;
553    local $self->{parse_source}= $self->{parse_source} || "(unknown)";
554    my $cb= $self->{pre_process_content};
555    $self->{orig_content}= "";
556    my $line_num= 1;
557
558    while (defined(my $line= readline($fh))) {
559        my $start_line_num= $line_num++;
560        $self->{orig_content} .= $line;
561        while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) {
562            defined(my $read_line= readline($fh))
563                or last;
564            $self->{orig_content} .= $read_line;
565            $line_num++;
566            $line .= $read_line;
567        }
568        while ($line =~ m!/\*(.*?)(\*/|\z)!gs) {
569            my ($inner, $tail)= ($1, $2);
570            if ($tail ne "*/") {
571                confess
572                    "Unterminated comment starting at line $start_line_num\n";
573            }
574            elsif ($inner =~ m!/\*!) {
575                confess
576                    "Nested/broken comment starting at line $start_line_num\n";
577            }
578        }
579
580        my $raw= $line;
581        my $type= "content";
582        my $sub_type= "text";
583        my $level= @cond;
584        my $do_pop= 0;
585        my $flat= $line;
586        $flat =~ s/\s*\\\n\s*/ /g;
587        $flat =~ s!/\*.*?\*/! !gs;
588        $flat =~ s/\s+/ /g;
589        $flat =~ s/\s+\z//;
590        $flat =~ s/^\s*#\s*/#/g;
591
592        my $line_info=
593            HeaderLine->new(start_line_num => $start_line_num, raw => $raw);
594        my $do_cond_line;
595        if ($flat =~ /^#/) {
596            if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) {
597                my $if= $1;
598                my $not= $2 ? "!" : "";
599                my $sym= $3;
600                $flat =~
601                    s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/;
602            }
603            my $cond;    # used in various expressions below
604            if ($flat =~ /^#endif/) {
605                if (!@cond) {
606                    confess "Not expecting $flat";
607                }
608                $do_pop= 1;
609                $level--;
610                $type= "cond";
611                $sub_type= "#endif";
612            }
613            elsif ($flat =~ /^#if\b/) {
614                ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
615                push @cond,      [$cond];
616                push @cond_line, $line_info;
617                $type= "cond";
618                $sub_type= "#if";
619            }
620            elsif ($flat =~ /^#elif\b/) {
621                if (!@cond) {
622                    confess "No if for $flat";
623                }
624                $level--;
625                ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
626                $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
627                $cond_line[-1]= $line_info;
628                push @{ $cond[-1] }, $cond;
629                $type= "cond";
630                $sub_type= "#elif";
631            }
632            elsif ($flat =~ /^#else\b/) {
633                if (!@cond) {
634                    confess "No if for $flat";
635                }
636                $level--;
637                $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
638                $cond_line[-1]= $line_info;
639                $type= "cond";
640                $sub_type= "#else";
641            }
642            elsif ($flat =~ /#undef/) {
643                $type= "content";
644                $sub_type= "#undef";
645            }
646            elsif ($flat =~ /#pragma\b/) {
647                $type= "content";
648                $sub_type= "#pragma";
649            }
650            elsif ($flat =~ /#include\b/) {
651                $type= "content";
652                $sub_type= "#include";
653            }
654            elsif ($flat =~ /#define\b/) {
655                $type= "content";
656                $sub_type= "#define";
657            }
658            elsif ($flat =~ /#error\b/) {
659                $type= "content";
660                $sub_type= "#error";
661            }
662            else {
663                confess "Do not know what to do with $line";
664            }
665            if ($type eq "cond") {
666
667                # normalize conditional lines
668                $line= $flat;
669                $last_cond= $line_info;
670            }
671        }
672        $line =~ s/\n?\z/\n/;
673
674        %$line_info= (
675            cond           => copy_aoa(\@cond),
676            type           => $type,
677            sub_type       => $sub_type,
678            raw            => $raw,
679            flat           => $flat,
680            line           => $line,
681            level          => $level,
682            source         => $self->{parse_source},
683            start_line_num => $start_line_num,
684            n_lines        => $line_num - $start_line_num,
685        );
686
687        push @lines, $line_info;
688        if ($do_pop) {
689            $line_info->{inner_lines}=
690                $line_info->start_line_num - $cond_line[-1]->start_line_num;
691            pop @cond;
692            pop @cond_line;
693        }
694        if ($type eq "content" and $cb) {
695            $cb->($self, $lines[-1]);
696        }
697    }
698    if (@cond_line) {
699        my $msg= "Unterminated conditional block starting line "
700            . $cond_line[-1]->start_line_num();
701        $msg .=
702            " with last conditional operation at line "
703            . $last_cond->start_line_num()
704            if $cond_line[-1] != $last_cond;
705        confess $msg;
706    }
707    $self->{lines}= \@lines;
708    return \@lines;
709}
710
711# returns the last lines we parsed.
712sub lines { $_[0]->{lines} }
713
714# assuming a line looks like an embed.fnc entry parse it
715# and normalize it, and create and EmbedLine object from it.
716sub tidy_embed_fnc_entry {
717    my ($self, $line_data)= @_;
718    my $line= $line_data->{line};
719    return $line if $line =~ /^\s*:/;
720    return $line unless $line_data->{type} eq "content";
721    return $line unless $line =~ /\|/;
722
723    $line =~ s/\s*\\\n/ /g;
724    $line =~ s/\s+\z//;
725    ($line)= expand($line);
726    my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line;
727    my %flag_seen;
728    $flags= join "", grep !$flag_seen{$_}++, sort split //, $flags;
729    if ($flags =~ s/^#//) {
730        $flags .= "#";
731    }
732    if ($flags eq "#") {
733        die "Not allowed to use only '#' for flags"
734            . "in 'embed.fnc' at line $line_data->{start_line_num}";
735    }
736    if (!$flags) {
737        die "Missing flags in function definition"
738            . " in 'embed.fnc' at line $line_data->{start_line_num}\n"
739            . "Did you a forget a line continuation on the previous line?\n";
740    }
741    for ($ret, @args) {
742        s/(\w)\*/$1 */g;
743        s/\*\s+(\w)/*$1/g;
744        s/\*const/* const/g;
745    }
746    my $head= sprintf "%-8s|%-7s", $flags, $ret;
747    $head .= sprintf "|%*s", -(31 - length($head)), $name;
748    if (@args and length($head) > 32) {
749        $head .= "\\\n";
750        $head .= " " x 32;
751    }
752    foreach my $ix (0 .. $#args) {
753        my $arg= $args[$ix];
754        $head .= "|$arg";
755        $head .= "\\\n" . (" " x 32) if $ix < $#args;
756    }
757    $line= $head . "\n";
758
759    if ($line =~ /\\\n/) {
760        my @lines= split /\s*\\\n/, $line;
761        my $len= length($lines[0]);
762        $len < length($_) and $len= length($_) for @lines;
763        $len= int(($len + 7) / 8) * 8;
764        $len= 72 if $len < 72;
765        $line= join("\\\n",
766            (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]),
767            $lines[-1]);
768    }
769    ($line)= unexpand($line);
770
771    $line_data->{embed}= EmbedLine->new(
772        flags       => $flags,
773        return_type => $ret,
774        name        => $name,
775        args        => \@args,
776    );
777    $line =~ s/\s+\z/\n/;
778    $line_data->{line}= $line;
779    return $line;
780}
781
782# line up the text in a multiline string by a given $fragment
783# of text, inserting whitespace in front or behind the $fragment
784# to get the text to line up. Returns the text. This is wrapped
785# by line_up() and is used to wrap long conditions and comments
786# in the generated code.
787sub _line_up_frag {
788    my ($self, $str, $fragment)= @_;
789    die "has tabs?!" if $str =~ /\t/;
790    my @lines= split /\n/, $str;
791    my $changed= 1;
792    while ($changed) {
793        $changed= 0;
794        foreach my $ix (0 .. $#lines - 1) {
795            my $f_index= 0;
796            my $n_index= 0;
797            while (1) {
798                $f_index= index($lines[$ix],       $fragment, $f_index);
799                $n_index= index($lines[ $ix + 1 ], $fragment, $n_index);
800                if ($f_index == -1 or $n_index == -1) {
801                    last;
802                }
803                if ($f_index < $n_index) {
804                    my $f_idx= $f_index;
805                    $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " ";
806                    substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index));
807                    $changed++;
808                    last;
809                }
810                elsif ($n_index < $f_index) {
811                    my $n_idx= $n_index;
812                    $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " ";
813                    substr($lines[ $ix + 1 ],
814                        $n_idx, 0, " " x ($f_index - $n_index));
815                    $changed++;
816                    last;
817                }
818                $f_index++;
819                $n_index++;
820            }
821        }
822    }
823    my $ret= join "", map { "$_\n" } @lines;
824    return $ret;
825}
826
827sub _fixup_indent {
828    my ($self, $line)= @_;
829    my @lines= split /\n/, $line;
830    if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) {
831        my $first_left_len = length $1;
832
833        while (1) {
834            my $ok = 1;
835            for (@lines) {
836                /^.{$first_left_len} /
837                    or do { $ok = 0; last; };
838            }
839            if ($ok) {
840                s/^(.{$first_left_len}) /$1/ for @lines;
841            } else {
842                last;
843            }
844        }
845    }
846
847    if ($lines[0]=~/^(#\s*\w+\s+)\(/) {
848        my $len = length($1);
849        for my $idx (1..$#lines) {
850            $lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/;
851        }
852    }
853    my $ret= join "", map { "$_\n" } @lines;
854    return $ret;
855}
856
857# this is the workhorse for _break_line_at_op().
858sub __break_line_at_op {
859    my ($self, $limit, $line, $blank_prefix)= @_;
860    my @lines= ("");
861    while (length $line) {
862        my $part;
863        if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) {
864            $part= $1;
865        }
866        else {
867            $part= $line;
868            $line= "";
869        }
870        if (length($lines[-1]) + length($part) < $limit) {
871            $lines[-1] .= $part;
872        }
873        else {
874            push @lines, $blank_prefix . $part;
875        }
876    }
877    return \@lines;
878}
879
880# Break a condition line into parts, while trying to keep the last
881# token on each line being an operator like || or && or ? or : We try
882# to keep each line at $limit characters, however, we also try to
883# ensure that each line has the same number of operators on it such
884# that across all the lines there are only two counts of operators (eg,
885# we either way each line to have two operators on it, or 0, or 1 or 0,
886# or 2 or 1, and so on.) If we cannot meet this requirement we reduce
887# the limit by 1 and try again, until we meet the objective, or the
888# limit ends up at 70 chars or less.
889sub _break_line_at_op {
890    my ($self, $limit, $line, $blank_prefix)= @_;
891    my $lines;
892    while (1) {
893        $lines= $self->__break_line_at_op($limit, $line, $blank_prefix);
894        my %op_counts;
895        foreach my $line_idx (0 .. $#$lines) {
896            my $line= $lines->[$line_idx];
897            my $count= 0;
898            $count++ while $line =~ /(\|\||&&|\?|:)/g;
899            $op_counts{$count}++;
900
901        }
902        if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) {
903            last;
904        }
905        $limit--;
906    }
907
908    s/\s*\z/\n/ for @$lines;
909    return join "", @$lines;
910}
911
912sub _max { # cant use Scalar::Util so we roll our own
913    my $max= shift;
914    $max < $_ and $max= $_ for @_;
915    return $max;
916}
917
918# take a condition, split into $type and $rest
919# wrap it, and try to line up operators and defined() functions
920# that it contains. This is rather horrible code, but it does a
921# reasonable job applying the heuristics we need to lay our the
922# conditions in a reasonable way.
923sub _wrap_and_line_up_cond {
924    my ($self, $type, $rest)= @_;
925
926    my $limit= $self->{max_width};
927
928    # extract the expression part of the line, and normalize it, we do
929    # this here even though it might be duplicative as it is possible
930    # that the caller code has munged the expression in some way, and we
931    # might want to simplify the expression first. Eg:
932    # 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into
933    # 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible.
934    my $rest_head= "";
935    my $rest_tail= "";
936    if ($rest =~ s!(if\s+)!!) {
937        $rest_head= $1;
938    }
939    if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) {
940        $rest_tail= $1;
941    }
942    if ($rest) {
943        $rest= $self->tidy_cond($rest);
944        $rest= $rest_head . $rest . $rest_tail;
945    }
946
947    my $l= length($type);
948    my $line= $type;
949    $line .= $rest if length($rest);
950    my $blank_prefix= " " x $l;
951
952    # at this point we have a single line with the entire expression on it
953    # if it fits on one line we are done, we can return it right away.
954    if (length($line) <= $limit) {
955        $line =~ s/\s*\z/\n/;
956        return $line;
957    }
958    my $rest_copy= $rest;
959    my @fragments;
960    my $op_pat= qr/(?:\|\||&&|[?:])/;
961
962    # does the $rest contain a parenthesized group? If it does then
963    # there are a mixture of different ops being used, as if it was all
964    # the same opcode there would not be a parenthesized group.
965    # If it does then we handle it differently, and try to put the
966    # different parts of the expression on their own line.
967    if ($rest_copy =~ /$op_pat\s*\(/) {
968        my @parts;
969        while (length $rest_copy) {
970            if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) {
971                push @parts, $1;
972            } else {
973                #$rest_copy=~s/^\s+//;
974                push @parts, $rest_copy;
975                last;
976            }
977        }
978        $parts[0]= $type . $parts[0];
979        $parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts;
980        foreach my $line (@parts) {
981            if (length($line) > $limit) {
982                $line= $self->_break_line_at_op($limit, $line, $blank_prefix);
983            }
984        }
985        s/\s*\z/\n/ for @parts;
986        $line= join "", @parts;
987        @fragments= ("defined", "||");
988    }
989    else {
990        # the expression consists of just one opcode type, so we can use
991        # simpler logic to break it apart with the objective of ensuring
992        # that the lines are similarly formed with trailing operators on
993        # each line but the last.
994        @fragments= ("||", "defined");
995        $line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix);
996    }
997
998    # try to line up the text on different lines. We stop after
999    # the first $fragment that modifies the text. The order
1000    # of fragments we try is determined above based on the type
1001    # of condition this is.
1002    my $pre_line= $line;
1003    foreach my $fragment (@fragments) {
1004        $line= $self->_line_up_frag($line, $fragment);
1005        last if $line ne $pre_line;
1006    }
1007
1008    # if we have lined up by "defined" in _line_up_frag()
1009    # then we may have " ||        defined(...)" type expressions
1010    # convert these to "        || defined(...)" as it looks better.
1011    $line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g;
1012    $line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g;
1013
1014    # add back the line continuations. this is all pretty inefficient,
1015    # but it works nicely.
1016    my @lines= split /\n/, $line;
1017    my $last= pop @lines;
1018    my $max_len= _max(map { length $_ } @lines);
1019    $_= sprintf "%*s \\\n", -$max_len, $_ for @lines;
1020    $last .= "\n";
1021
1022    $line= join "", @lines, $last;
1023
1024    # remove line continuations that are inside of a comment,
1025    # we may have a variable number of lines of the expression
1026    # or parts of lines of the expression in a comment, so
1027    # we do this last.
1028    $line =~ s!/\* (.*) \*/
1029              !"/*"._strip_line_cont("$1")."*/"!xsge;
1030
1031    return $self->_fixup_indent($line);
1032}
1033
1034#remove line continuations from the argument.
1035sub _strip_line_cont {
1036    my ($string)= @_;
1037    $string =~ s/\s*\\\n/\n/g;
1038    return $string;
1039}
1040
1041# Takes an array of HeaderLines objects produced by parse_fh()
1042# or by group_content(), and turn it into a string.
1043sub lines_as_str {
1044    my ($self, $lines, $post_process_content)= @_;
1045    $lines ||= $self->{lines};
1046    my $ret;
1047    $post_process_content ||= $self->{post_process_content};
1048    my $filter= $self->{filter_content};
1049    my $last_line= "";
1050
1051    #warn $self->dd($lines);
1052    foreach my $line_data (@$lines) {
1053        my $line= $line_data->{line};
1054        if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text")
1055        {
1056            my $level= $line_data->{level};
1057            my $ind= $self->indent_chars($level);
1058            $line =~ s/^#(\s*)/#$ind/;
1059        }
1060        if ($line_data->{type} eq "cond") {
1061            my $add_commented_expr_after= $self->{add_commented_expr_after};
1062            if ($line_data->{sub_type} =~ /#(?:else|endif)/) {
1063                my $joined= join " && ",
1064                    map { "($_)" } @{ $line_data->{cond}[-1] };
1065                my $cond_txt= $self->tidy_cond($joined);
1066                $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else";
1067                $line =~ s!\s*\z! /* $cond_txt */\n!
1068                    if $line_data->{inner_lines} >= $add_commented_expr_after;
1069            }
1070            elsif ($line_data->{sub_type} eq "#elif") {
1071                my $last_frame= $line_data->{cond}[-1];
1072                my $joined= join " && ",
1073                    map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ];
1074                my $cond_txt= $self->tidy_cond($joined);
1075                $line =~ s!\s*\z! /* && $cond_txt */\n!
1076                    if $line_data->{inner_lines} >= $add_commented_expr_after;
1077            }
1078        }
1079        $line =~ s/\s*\z/\n/;
1080        if ($last_line eq "\n" and $line eq "\n") {
1081            next;
1082        }
1083        $last_line= $line;
1084        if ($line_data->{type} eq "cond") {
1085            $line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z!
1086                or die "Failed to split cond line: $line";
1087            my ($type, $cond, $comment)= ($1, $2, $3);
1088            $comment //= "";
1089            $cond    //= "";
1090            my $new_line;
1091            if (!length($cond) and $comment) {
1092                $comment =~ s!^(/\*\s+)!!
1093                    and $type .= $1;
1094            }
1095
1096            $line= $self->_wrap_and_line_up_cond($type, $cond . $comment);
1097        }
1098        $line_data->{line}= $line;
1099        if ($post_process_content and $line_data->{type} eq "content") {
1100            $post_process_content->($self, $line_data);
1101        }
1102        if ($filter and $line_data->{type} eq "content") {
1103            $filter->($self, $line_data) or next;
1104        }
1105        $ret .= $line_data->{line};
1106    }
1107    return $ret;
1108}
1109
1110# Text::Wrap::wrap has an odd api, so hide it behind a wrapper
1111# sub which sets things up properly.
1112sub _my_wrap {
1113    my ($head, $rest, $line)= @_;
1114    local $Text::Wrap::unexpand= 0;
1115    local $Text::Wrap::huge= "overflow";
1116    local $Text::Wrap::columns= 78;
1117    unless (length $line) { return $head }
1118    $line= wrap $head, $rest, $line;
1119    return $line;
1120}
1121
1122# recursively extract the && expressions from a parse tree,
1123# returning the result as strings.
1124# if $node is not a '&&' op then it returns $node as a string,
1125# otherwise it returns the string form of the arguments to the
1126# '&&' op, recursively flattening any '&&' nodes that it might
1127# contain.
1128sub _and_clauses {
1129    my ($self, $node)= @_;
1130
1131    my @ret;
1132    if (@$node < 3 or $node->[0] ne "&&") {
1133        return $self->_pt_as_str($node);
1134    }
1135    foreach my $idx (1 .. $#$node) {
1136        push @ret, $self->_and_clauses($node->[$idx]);
1137    }
1138    return @ret;
1139}
1140
1141# recursively walk the a parse tree, and return the literal
1142# terms it contains, ignoring any operators in the optree.
1143sub _terms {
1144    my ($self, $node)= @_;
1145    if (@$node == 1) {
1146        return $self->_pt_as_str($node);
1147    }
1148    my @ret;
1149    if (@$node == 2) {
1150        if ($node->[0] eq "?") {
1151            push @ret, map { $self->_terms($_) } @{ $node->[1] };
1152        }
1153        else {
1154            push @ret, $self->_terms($node->[1]);
1155        }
1156    }
1157    else {
1158        foreach my $i (1 .. $#$node) {
1159            push @ret, $self->_terms($node->[$i]);
1160        }
1161    }
1162    return @ret;
1163}
1164
1165# takes a HeaderLine "cond" AoA and flattens it into
1166# a single expression, and then extracts all the and clauses
1167# it contains. Thus [['defined(A)'],['defined(B)']] and
1168# [['defined(A) && defined(B)']], end up as ['defined(A)','defined(B)']
1169sub _flatten_cond {
1170    my ($self, $cond_ary)= @_;
1171
1172    my $expr= join " && ", map {
1173        map { "($_)" }
1174            @$_
1175    } @$cond_ary;
1176    return [] unless $expr;
1177    my $tree= $self->parse_expr($expr);
1178    my %seen;
1179    my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree);
1180    return \@and_clause;
1181}
1182
1183# Find the best path into a tree of conditions, such that
1184# we reuse the maximum number of existing branches. Returning
1185# two arrays, the first contain the parts of $cond_array that
1186# make up the best path, in the best path order, and a second array
1187# with the remaining items in the initial order they were provided.
1188# Thus if we have previously stored only the path "A", "B", "C"
1189# into the tree, and want to find the best path for
1190# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"],
1191#
1192# This used to reduce the number of conditions in the grouped content,
1193# and is especially helpful with dealing with DEBUGGING related
1194# functionality. It is coupled with careful control over the order
1195# that we add paths and conditions to the tree.
1196sub _best_path {
1197    my ($self, $tree_node, $cond_array, @path)= @_;
1198    my $best= \@path;
1199    my $rest= $cond_array;
1200    foreach my $cond (@$cond_array) {
1201        if ($tree_node->{$cond}) {
1202            my ($new_best, $new_rest)=
1203                $self->_best_path($tree_node->{$cond},
1204                [ grep $_ ne $cond, @$cond_array ],
1205                @path, $cond);
1206            if (@$new_best > @$best) {
1207                ($best, $rest)= ($new_best, $new_rest);
1208            }
1209        }
1210    }
1211    if (@$best == @path) {
1212        foreach my $cond (@$cond_array) {
1213            my $not_cond= $self->tidy_cond("!($cond)");
1214            if ($tree_node->{$not_cond}) {
1215                $best= [ @path, $cond ];
1216                $rest= [ grep $_ ne $cond, @$cond_array ];
1217                last;
1218            }
1219        }
1220    }
1221    return ($best, $rest);
1222}
1223
1224# This builds a group content tree from a set of lines. each content line in
1225# the original file is added to the file based on the conditions that apply to
1226# the content.
1227#
1228# The tree is made up of nested HoH's with keys in the HoH being normalized
1229# clauses from the {cond} data in the HeaderLine objects.
1230#
1231# Care is taken to minimize the number of pathways and to reorder clauses to
1232# reuse existing pathways and minimize the total number of conditions in the
1233# file.
1234#
1235# The '' key of a hash contains an array of the lines that are part of the
1236# condition that lead to that key. Thus lines with no conditions are in
1237# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be
1238# in $tree{"defined(A)"}{"defined(B)"}{""}.
1239#
1240# The result of this sub is normally passed into __recurse_group_content_tree()
1241# which converts it back into a set of HeaderLine objects.
1242#
1243sub _build_group_content_tree {
1244    my ($self, $lines)= @_;
1245    $lines ||= $self->{lines};
1246    my $filter= $self->{filter_content};
1247    my %seen_normal;
1248    foreach my $line_data (@$lines) {
1249        next if $line_data->{type} ne "content";
1250        next if $filter and !$filter->($self, $line_data);
1251        my $cond_frames= $line_data->{cond};
1252        my $cond_frame= $self->_flatten_cond($cond_frames);
1253        my $flat_merged= join " && ", map "($_)", @$cond_frame;
1254        my $normalized;
1255        if (@$cond_frame) {
1256            $normalized= $self->tidy_cond($flat_merged);
1257        }
1258        else {
1259            $normalized= $flat_merged;    # empty string
1260        }
1261        push @{ $seen_normal{$normalized} }, $line_data;
1262    }
1263    my @debugging;
1264    my @non_debugging;
1265    foreach my $key (keys %seen_normal) {
1266        if ($key =~ /DEBUGGING/) {
1267            push @debugging, $key;
1268        }
1269        else {
1270            push @non_debugging, $key;
1271        }
1272    }
1273    @non_debugging=
1274        sort { length($a) <=> length($b) || $a cmp $b } @non_debugging;
1275    @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging;
1276    my %tree;
1277    foreach my $normal_expr (@non_debugging, @debugging) {
1278        my $all_line_data= $seen_normal{$normal_expr};
1279
1280        my $cond_frame=
1281            (length $normal_expr)
1282            ? $self->_flatten_cond([ [$normal_expr] ])
1283            : [];
1284        @$cond_frame= $self->_sort_terms(@$cond_frame);
1285        my $node= \%tree;
1286        my ($best, $rest)= $self->_best_path($node, $cond_frame);
1287        die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame
1288            unless @$best + @$rest == @$cond_frame;
1289
1290        foreach my $cond (@$best, @$rest) {
1291            $node= $node->{$cond} ||= {};
1292        }
1293        push @{ $node->{''} }, @$all_line_data;
1294    }
1295
1296    warn $self->dd(\%tree) if $self->{debug};
1297    $self->{tree}= \%tree;
1298    return \%tree;
1299}
1300
1301sub _recurse_group_content_tree {
1302    my ($self, $node, @path)= @_;
1303
1304    my @ret;
1305    local $self->{rgct_ret}= \@ret;
1306    local $self->{line_by_depth}= [];
1307
1308    $self->__recurse_group_content_tree($node, @path);
1309    return \@ret;
1310}
1311
1312# convert a tree of conditions constructed by _build_group_content_tree()
1313# and turn it into a set of HeaderLines that represents it. Performs the
1314# appropriate sets required to reconstitute an if/elif/elif/else sequence
1315# by calling _handle_else().
1316sub __recurse_group_content_tree {
1317    my ($self, $node, @path)= @_;
1318    my $depth= 0 + @path;
1319    my $ind= $self->indent_chars($depth);
1320    my $ret= $self->{rgct_ret};
1321    if ($node->{''}) {
1322        if (my $cb= $self->{post_process_grouped_content}) {
1323            $cb->($self, $node->{''}, \@path);
1324        }
1325        if (my $cb= $self->{post_process_content}) {
1326            $cb->($self, $_, \@path) for @{ $node->{''} };
1327        }
1328        push @$ret, map {
1329            HeaderLine->new(
1330                %$_,
1331                cond           => [@path],
1332                level          => $depth,
1333                start_line_num => 0 + @$ret
1334            )
1335        } @{ $node->{''} };
1336    }
1337
1338    my %skip;
1339    foreach my $expr (
1340        map  { $_->[0] }
1341        sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
1342        map  { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node
1343    ) {
1344        next unless length $expr;    # ignore payload
1345        my $not= $self->tidy_cond("!($expr)");
1346        if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) {
1347            next;
1348        }
1349        my $kid= $node->{$expr};
1350        while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) {
1351            my ($kid_key)= keys(%$kid);
1352            $expr= $self->tidy_cond("($expr) && ($kid_key)");
1353            $kid= $kid->{$kid_key};
1354            my $new_not= $self->tidy_cond("!($expr)");
1355            if ($node->{$new_not}) {
1356                $not= $new_not;
1357                $skip{$not}++;
1358            }
1359        }
1360        my $raw= "#${ind}if $expr\n";
1361        my $hl= HeaderLine->new(
1362            type           => "cond",
1363            sub_type       => "#if",
1364            raw            => $raw,
1365            line           => $raw,
1366            level          => $depth,
1367            cond           => [ @path, [$expr] ],
1368            start_line_num => 0 + @$ret,
1369        );
1370        $self->{line_by_depth}[$depth]= 0 + @$ret;
1371        push @$ret, $hl;
1372        $self->__recurse_group_content_tree($kid, @path, [$expr]);
1373        if ($node->{$not}) {
1374            $skip{$not}++;
1375            $self->_handle_else($not, $node->{$not}, $ind, $depth, @path,
1376                [$not]);
1377        }
1378
1379        # and finally the #endif
1380
1381        $raw= "#${ind}endif\n";
1382
1383        # we need to extract the condition information from the last line in @ret,
1384        # as we don't know which condition we are ending here. It could be an elsif
1385        # from deep in the parse tree for instance.
1386        # So we need to extract the last frame from the cond structure in the last
1387        # line-info in @ret.
1388        # BUT if this last line is itself an #endif, then we need to take the second
1389        # to last line instead, as the endif would have "popped" that frame off the
1390        # condition stack.
1391        my $last_ret= $ret->[-1];
1392        my $idx=
1393            ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif")
1394            ? -2
1395            : -1;
1396        my $end_line= HeaderLine->new(
1397            type           => "cond",
1398            sub_type       => "#endif",
1399            raw            => $raw,
1400            line           => $raw,
1401            level          => $depth,
1402            cond           => [ @path, $last_ret->{cond}[$idx] ],
1403            start_line_num => 0 + @$ret,
1404            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1405        );
1406        undef $self->{line_by_depth}[$depth];
1407        push @$ret, $end_line;
1408    }
1409    return $ret;
1410}
1411
1412# this handles the specific case of an else clause, detecting
1413# when an elif can be constructed, may recursively call itself
1414# to deal with if/elif/elif/else chains. Calls back into
1415# __recurse_group_content_tree().
1416sub _handle_else {
1417    my ($self, $not, $kid, $ind, $depth, @path)= @_;
1418
1419    # extract the first 3 keys - from this we can detect
1420    # which of the three scenarios we have to handle.
1421    my ($k1, $k2, $k3)=
1422        sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
1423    my $not_k1;
1424    if (length($k1) and defined($k2) and !defined($k3)) {
1425
1426        # if we do not have a payload (length($k1)) and we have exactly
1427        # two keys (defined($k2) and !defined($k3)) we need to compute
1428        # the inverse of $k1, which we will use later.
1429        $not_k1= $self->tidy_cond("!($k1)");
1430    }
1431    my $ret= $self->{rgct_ret};
1432    if (length($k1) and !defined($k2)) {
1433
1434        # only one child, no payload -> elsif $k1
1435        my $sub_expr;
1436        do {
1437            $sub_expr=
1438                 !$sub_expr
1439                ? $k1
1440                : $self->tidy_cond("($sub_expr) && ($k1)");
1441            $kid= $kid->{$k1};
1442            ($k1, $k2)=
1443                sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
1444        } while length($k1) and !defined $k2;
1445
1446        my $raw= "#${ind}elif $sub_expr\n";
1447        push @{ $path[-1] }, $sub_expr;
1448        my $hl= HeaderLine->new(
1449            type           => "cond",
1450            sub_type       => "#elif",
1451            raw            => $raw,
1452            line           => $raw,
1453            level          => $depth,
1454            cond           => [ map { [@$_] } @path ],
1455            start_line_num => 0 + @$ret,
1456            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1457        );
1458        $self->{line_by_depth}[$depth]= 0 + @$ret;
1459        push @$ret, $hl;
1460        $self->__recurse_group_content_tree($kid, @path);
1461    }
1462    elsif (defined($not_k1) and $not_k1 eq $k2) {
1463
1464        # two children which are complementary, no payload -> elif $k1 else..
1465        my $raw= "#${ind}elif $k1\n";
1466
1467        push @{ $path[-1] }, $k1;
1468        my $hl= HeaderLine->new(
1469            type           => "cond",
1470            sub_type       => "#elif",
1471            raw            => $raw,
1472            line           => $raw,
1473            level          => $depth,
1474            cond           => [ map { [@$_] } @path ],
1475            start_line_num => 0 + @$ret,
1476            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1477        );
1478        $self->{line_by_depth}[$depth]= 0 + @$ret;
1479        push @$ret, $hl;
1480        $self->__recurse_group_content_tree($kid->{$k1}, @path);
1481        $path[-1][-1]= $k2;
1482        $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path);
1483    }
1484    else {
1485        # payload, 3+ children, or 2 which are not complementary -> else
1486        my $raw= "#${ind}else\n";
1487        my $hl= HeaderLine->new(
1488            type           => "cond",
1489            sub_type       => "#else",
1490            raw            => $raw,
1491            line           => $raw,
1492            level          => $depth,
1493            cond           => [ map { [@$_] } @path ],
1494            start_line_num => 0 + @$ret,
1495            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1496        );
1497        $self->{line_by_depth}[$depth]= 0 + @$ret;
1498        push @$ret, $hl;
1499        $self->__recurse_group_content_tree($kid, @path);
1500    }
1501    return $ret;
1502}
1503
1504# group the content in lines by the condition that apply to them
1505# returns a set of lines representing the new structure
1506sub group_content {
1507    my ($self, $lines, $filter)= @_;
1508    $lines ||= $self->{lines};
1509    local $self->{filter_content}= $filter || $self->{filter_content};
1510    my $tree= $self->_build_group_content_tree($lines);
1511    return $self->_recurse_group_content_tree($tree);
1512}
1513
1514#read a file by name - opens the file and passes the fh into parse_fh().
1515sub read_file {
1516    my ($self, $file_name, $callback)= @_;
1517    $self= $self->new() unless ref $self;
1518    local $self->{parse_source}= $file_name;
1519    open my $fh, "<", $file_name
1520        or confess "Failed to open '$file_name' for read: $!";
1521    my $lines= $self->parse_fh($fh);
1522    if ($callback) {
1523        foreach my $line (@$lines) {
1524            $callback->($self, $line);
1525        }
1526    }
1527    return $self;
1528}
1529
1530# These are utility methods for the HeaderLine objects.
1531sub HeaderLine::new {
1532    my ($class, %self)= @_;
1533    return bless \%self, $class;
1534}
1535sub HeaderLine::cond        { $_[0]->{cond} }                             # AoA
1536sub HeaderLine::type        { $_[0]->{type} }
1537sub HeaderLine::type_is     { return $_[0]->type eq $_[1] ? 1 : 0 }
1538sub HeaderLine::sub_type    { $_[0]->{sub_type} }
1539sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 }
1540sub HeaderLine::raw         { $_[0]->{raw} }
1541sub HeaderLine::flat        { $_[0]->{flat} }
1542sub HeaderLine::line        { $_[0]->{line} }
1543sub HeaderLine::level       { $_[0]->{level} }
1544sub HeaderLine::is_content  { return $_[0]->type_is("content") }
1545sub HeaderLine::is_cond     { return $_[0]->type_is("cond") }
1546sub HeaderLine::is_define   { return $_[0]->sub_type_is("#define") }
1547sub HeaderLine::line_num    { $_[0]->{start_line_num} }
1548sub HeaderLine::inner_lines { $_[0]->{inner_lines} }
1549sub HeaderLine::n_lines     { $_[0]->{n_lines} }
1550sub HeaderLine::embed       { $_[0]->{embed} }
1551*HeaderLine::start_line_num= *HeaderLine::line_num;
1552
1553# these are methods for EmbedLine objects
1554*EmbedLine::new= *HeaderLine::new;
1555sub EmbedLine::flags       { $_[0]->{flags} }
1556sub EmbedLine::return_type { $_[0]->{return_type} }
1557sub EmbedLine::name        { $_[0]->{name} }
1558sub EmbedLine::args        { $_[0]->{args} }          # array ref
1559
15601;
1561
1562__END__
1563
1564=head1 NAME
1565
1566HeaderParser - A minimal header file parser that can be hooked by other porting
1567scripts.
1568
1569=head1 SYNOPSIS
1570
1571    my $o= HeaderParser->new();
1572    my $lines= $o->parse_fh($fh);
1573
1574=head1 DESCRIPTION
1575
1576HeaderParser is a tool to parse C preprocessor header files. The tool
1577understands the syntax of preprocessor conditions, and is capable of creating
1578a parse tree of the expressions involved, and normalizing them as well.
1579
1580C preprocessor files are a bit tricky to parse properly, especially with a
1581"line by line" model. There are two issues that must be dealt with:
1582
1583=over 4
1584
1585=item Line Continuations
1586
1587Any line ending in "\\\n" (that is backslash newline) is considered to be part
1588of a longer string which continues on the next line. Processors should replace
1589the "\\\n" typically with a space when converting to a "real" line.
1590
1591=item Comments Acting As A Line Continuation
1592
1593The rules for header files stipulates that C style comments are stripped
1594before processing other content, this means that comments can serve as a form
1595of line continuation:
1596
1597    #if defined(foo) /*
1598    */ && defined(bar)
1599
1600is the same as
1601
1602    #if defined(foo) && defined(bar)
1603
1604This type of comment usage is often overlooked by people writing header file
1605parsers for the first time.
1606
1607=item Indented pre processor directives.
1608
1609It is easy to forget that there may be multiple spaces between the "#"
1610character and the directive. It also easy to forget that there may be spaces
1611in *front* of the "#" character. Both of these cases are often overlooked.
1612
1613=back
1614
1615The main idea of this module is to provide a single framework for correctly
1616parsing the content of our header files in a consistent manner. A secondary
1617purpose it to make various tasks we want to do easier, such as normalizing
1618content or preprocessor expressions, or just extracting the real "content" of
1619the file properly.
1620
1621=head2 parse_fh
1622
1623This function parses a filehandle into a set of lines.  Each line is represented by a hash
1624based object which contains the following fields:
1625
1626    bless {
1627        cond     => [['defined(a)'],['defined(b)']],
1628        type     => "content",
1629        sub_type => undef,
1630        raw      => $raw_content_of_line,
1631        line     => $normalized_content_of_line,
1632        level    => $level,
1633        source         => $filename_or_string,
1634        start_line_num => $line_num_for_first_line,
1635        n_lines        => $line_num - $line_num_for_first_line,
1636    }, "HeaderLine"
1637
1638A "line" in this context is a logical line, and because of line continuations
1639and comments may contain more than one physical line, and thus more than
1640one newline, but will always include at least one and will always end with one
1641(unless there is no newline at the end of the file). Thus
1642
1643    before /*
1644     this is a comment
1645    */ after \
1646    and continues
1647
1648will be treated as a single logical line even though the content is
1649spread over four lines.
1650
1651=over 4
1652
1653=item cond
1654
1655An array of arrays containing the normalized expressions of any C preprocessor
1656conditional blocks which include the line. Each line has its own copy of the
1657conditions it was operated on currently, but that may change so dont alter
1658this data. The inner arrays may contain more than one element. If so then the
1659line is part of an "#else" or "#elsif" and the clauses should be considered to
1660be a conjuction when considering "when is this line included", however when
1661considered as part of an if/elsif/else, each added clause represents the most
1662recent condition. In the following you can see how:
1663
1664    before          /* cond => [ ]                      */
1665    #if A           /* cond => [ ['A'] ]                */
1666    do-a            /* cond => [ ['A'] ]                */
1667    #elif B         /* cond => [ ['!A', 'B'] ]          */
1668    do-b            /* cond => [ ['!A', 'B'] ]          */
1669    #else           /* cond => [ ['!A', '!B'] ]         */
1670    do-c            /* cond => [ ['!A', '!B'] ]         */
1671    # if D          /* cond => [ ['!A', '!B'], ['D'] ]  */
1672    do-d            /* cond => [ ['!A', '!B'], ['D'] ]  */
1673    # endif         /* cond => [ ['!A', '!B'], ['D'] ]  */
1674    #endif          /* cond => [ ['!A', '!B'] ]         */
1675    after           /* cond => [ ]                      */
1676
1677So in the above we can see how the three clauses of the if produce
1678a single "frame" in the cond array, but that frame "grows" and changes
1679as additional else clauses are added. When an entirely new if block
1680is started (D) it gets its own block. Each endif includes the clause
1681it terminates.
1682
1683=item type
1684
1685This value indicates the type of the line. This may be one of the following:
1686'content', 'cond', 'define', 'include' and 'error'. Several of the types
1687have a sub_type.
1688
1689=item sub_type
1690
1691This value gives more detail on the type of the line where necessary.
1692Not all types have a subtype.
1693
1694    Type    | Sub Type
1695    --------+----------
1696    content | text
1697            | include
1698            | define
1699            | error
1700    cond    | #if
1701            | #elif
1702            | #else
1703            | #endif
1704
1705Note that there are no '#ifdef' or '#elifndef' or similar expressions. All
1706expressions of that form are normalized into the '#if defined' form to
1707simplify processing.
1708
1709=item raw
1710
1711This was the raw original text before HeaderParser performed any modifications
1712to it.
1713
1714=item line
1715
1716This is the normalized and modified text after HeaderParser or any callbacks
1717have processed it.
1718
1719=item level
1720
1721This is the "indent level" of a line and corresponds to the number of blocks
1722that the line is within, not including any blocks that might be created by
1723the line itself.
1724
1725    before          /* level => 0 */
1726    #if A           /* level => 0 */
1727    do-a            /* level => 1 */
1728    #elif B         /* level => 0 */
1729    do-b            /* level => 1 */
1730    #else           /* level => 0 */
1731    do-c            /* level => 1 */
1732    # if D          /* level => 1 */
1733    do-d            /* level => 2 */
1734    # endif         /* level => 1 */
1735    #endif          /* level => 0 */
1736    after           /* level => 0 */
1737
1738=back
1739
1740parse_fh() will throw an exception if it encounters a malformed expression
1741or input it cannot handle.
1742
1743=head2 lines_as_str
1744
1745This function will return a string representation of the lines it is provided.
1746
1747=head2 group_content
1748
1749This function will group the text in the file by the conditions which contain
1750it. This is only useful for files where the content is essentially a list and
1751where changing the order that lines are output in will not break the resulting
1752file.
1753
1754Each content line will be grouped into a structure of nested if/else blocks
1755(elif will produce a new nested block) such that the content under the control
1756of a given set of normalized condition clauses are grouped together in the order
1757the occurred in the file, such that each combined conditional clause is output
1758only once.
1759
1760This means a file like this:
1761
1762    #if A
1763    A
1764    #elif K
1765    AK
1766    #else
1767    ZA
1768    #endif
1769    #if B && Q
1770    B
1771    #endif
1772    #if Q && B
1773    BC
1774    #endif
1775    #if A
1776    AD
1777    #endif
1778    #if !A
1779    ZZ
1780    #endif
1781
1782Will end up looking roughly like this:
1783
1784    #if A
1785    A
1786    AD
1787    #else
1788    ZZ
1789    # if K
1790    AK
1791    # else
1792    ZA
1793    # endif
1794    #endif
1795    #if B && Q
1796    B
1797    BC
1798    #endif
1799
1800Content at a given block level always goes before conditional clauses
1801at the same nesting level.
1802
1803=head2 HOOKS
1804
1805There are severals hooks that are available, C<pre_process_content> and
1806C<post_process_content>, and C<post_process_grouped_content>. All of these
1807hooks  will be called with the HeaderParser object as the first argument.
1808The "process_content" callbacks will be called with a line hash as the second
1809argument, and C<post_process_grouped_content> will be called with an
1810array of line hashes for the content in that group, so that the array may be
1811modified or sorted.  Callbacks called from inside of C<group_content()>
1812(that is C<post_process_content> and C<post_process_grouped_content> will be
1813called with an additional argument containing and array specifying the actual
1814conditional "path" to the content  (which may differ somewhat from the data in
1815a lines "cond" property).
1816
1817These hooks may do what they like, but generally they will modify the
1818"line" property of the line hash to change the final output returned
1819by C<lines_as_str()> or C<group_content()>.
1820
1821=head2 FORMATTING AND INDENTING
1822
1823Header parser tries hard to produce neat and readable output with a consistent
1824style and form. For example:
1825
1826    #if defined(FOO)
1827    # define HAS_FOO
1828    # if defined(BAR)
1829    #   define HAS_FOO_AND_BAR
1830    # else /* !defined(BAR) */
1831    #   define HAS_FOO_NO_BAR
1832    # endif /* !defined(BAR) */
1833    #endif /* defined(FOO) */
1834
1835HeaderParser uses two space tab stops for indenting C pre-processor
1836directives. It puts the spaces between the "#" and the directive. The "#" is
1837considered "part" of the indent, even though the space comes after it. This
1838means the first indent level "looks" like one space, and following indents
1839look like 2. This should match what a sensible editor would do with two space
1840tab stops. The C<indent_chars()> method can be used to convert an indent level
1841into a string that contains the appropriate number of spaces to go in between
1842the "#" and the directive.
1843
1844When emitting "#endif", "#elif" and "#else" directives comments will be
1845emitted also to show the conditions that apply. These comments may be wrapped
1846to cover multiple lines. Some effort is made to get these comments to line up
1847visually, but it uses heuristics which may not always produce the best result.
1848
1849=cut
1850