xref: /openbsd/gnu/usr.bin/perl/regen/HeaderParser.pm (revision f2a19305)
1package HeaderParser;
2use strict;
3use warnings;
4
5# these are required below in BEGIN statements, we cant have a
6# hard dependency on them as they might not be available when
7# we run as part of autodoc.pl
8#
9# use Data::Dumper;
10# use Storable qw(dclone);
11#
12use Carp       qw(confess);
13use Text::Tabs qw(expand unexpand);
14use Text::Wrap qw(wrap);
15
16# The style of this file is determined by:
17#
18# perltidy -w -ple -bbb -bbc -bbs -nolq -l=80 -noll -nola -nwls='=' \
19#   -isbc -nolc -otr -kis -ci=4 -se -sot -sct -nsbl -pt=2 -fs  \
20#   -fsb='#start-no-tidy' -fse='#end-no-tidy' -cpb -bfvt=2
21
22my (
23    %unop,              # unary operators and their precedence
24    %binop,             # binary operators and their precedence
25    %is_right_assoc,    # operators which are right associative
26    %precedence,        # precedence of all operators.
27    %associative,       # associative operators
28    %commutative,       # commutative operators
29    %cmpop,             # comparison operators
30    $unop_pat,          # pattern to match unary operators
31    $binop_pat,         # pattern to match binary operators
32    %op_names,          # map of op to description, used in error messages
33    $tokenize_pat       # a pattern which can tokenize an expression
34);
35
36BEGIN {
37    # this is initialization for the operator precedence expression parser
38    # we use for handling preprocessor conditions.
39    %op_names= (
40        '==' => 'equality',
41        '!=' => 'inequality',
42        '<<' => 'bit-shift-left',
43        '>>' => 'bit-shift-right',
44        '+'  => 'addition',
45        '-'  => 'subtraction',
46        '*'  => 'multiplication',
47        '/'  => 'division',
48        '%'  => 'modulo',
49        '||' => 'logical-or',       # Lowest precedence
50        '&&' => 'logical-and',
51        '|'  => 'binary-or',
52        '^'  => 'binary-xor',
53        '&'  => 'binary-and',
54        '<'  => 'less-than',        # split on spaces, all with equal precedence
55        '>'  => 'greater-than',
56        '<=' => 'less-than-or-equal',
57        '>=' => 'greater-than-or-equal',
58    );
59    my @cmpop= (
60        '== !=',                    # listed in lowest to highest precedence
61        '< > <= >=',                # split on spaces, all with equal precedence
62    );
63    my @binop= (
64        '||',                       # Lowest precedence
65        '&&',
66        '|',
67        '^',
68        '&',
69        @cmpop,    # include the numerical comparison operators.
70        '<< >>',
71        '+ -',
72        '* / %',    # highest prcedence operators.
73    );
74
75    my @unop= qw( ! ~ + - );
76    %unop= map  { $_ => 1 } @unop;
77    %cmpop= map { $_ => 1 } map { split /\s+/, $_ } @cmpop;
78    %binop= map { $_ => 1 } map { split /\s+/, $_ } @binop;
79
80    my $make_pat= sub {
81        my $pat= join "|", sort { length($b) <=> length($a) || $a cmp $b }
82            map quotemeta($_), @_;
83        return qr/$pat/;
84    };
85    $unop_pat= $make_pat->(@unop);
86    foreach my $ix (0 .. $#binop) {
87        my $sym= $binop[$ix];
88        $precedence{$_}= (1 + $ix) * 10 for split /\s+/, $sym;
89    }
90    $is_right_assoc{"?"}= 1;
91    $is_right_assoc{":"}= 1;
92    $precedence{"?"}= 1;
93    $precedence{":"}= 0;
94
95    $associative{$_}++
96        for qw( || && + *);    # we leave '==' out so we don't reorder terms
97    $commutative{$_}++ for qw( || && + *);
98
99    $binop_pat= $make_pat->(keys %precedence);
100    $tokenize_pat= qr/
101     ^(?:
102        (?<comment> \/\*.*?\*\/ )
103      | (?<ws>      \s+ )
104      | (?<term>
105            (?<literal>
106                (?<define> defined\(\w+\) )
107            |   (?<func>   \w+\s*\(\s*\w+(?:\s*,\s*\w+)*\s*\) )
108            |   (?<const>  (?:0x[a-fA-F0-9]+|\d+[LU]*|'.') )
109            |   (?<sym>    \w+ )
110            )
111        |   (?<op> $binop_pat | $unop_pat )
112        |   (?<paren> [\(\)] )
113        )
114      )
115    /xs;
116}
117
118# dump the arguments with dump. wraps loading Dumper
119# as we are executed by miniperl where Dumper isnt available
120sub dd {
121    my $self= shift;
122    local $self->{orig_content};
123    my $ret= "(dump not available)";
124    eval {
125        require Data::Dumper;
126        $ret= Data::Dumper->new(\@_)->Indent(1)->Sortkeys(1)->Useqq(1)->Dump();
127    };
128    return $ret;
129}
130
131my $has_storable;
132
133# same story here, in miniperl we use slow perl code,
134# in real perl we can use Storable and speed things up.
135BEGIN { eval "use Storable; \$has_storable=1;" }
136
137# recursively copy an AoAoA...
138sub copy_aoa {
139    my ($aoa)= @_;
140    if ($has_storable) {
141        return Storable::dclone($aoa);
142    }
143    else {
144        return _copy_aoa($aoa);
145    }
146}
147
148sub _copy_aoa {
149    my ($thing)= @_;
150    if (ref $thing) {
151        return [ map { ref($_) ? _copy_aoa($_) : $_ } @$thing ];
152    }
153    else {
154        return $thing;
155    }
156}
157
158# return the number characters that should go in between a '#' and
159# the name of a c preprocessor directive. Returns 0 spaces for level
160# 0, and 2 * ($level - 1) + 1 spaces for the rest. (1,3,5, etc)
161# This might sound weird, but consider these are tab *stops* and the
162# '#' is included in the total. which means indents of 2, 4, 6 etc.
163sub indent_chars {
164    my ($self, $level)= @_;
165    my $ind= "";
166    $ind .= " "                 if $level;
167    $ind .= "  " x ($level - 1) if $level > 1;
168    return $ind;
169}
170
171# we use OO to store state, etc.
172sub new {
173    my ($class, %args)= @_;
174    $args{add_commented_expr_after} //= 10;
175    $args{max_width} //= 78;
176    $args{min_break_width} //= 70;
177    return bless \%args,;
178}
179
180# this parses the expression into an array of tokens
181# this is somewhat crude, we could do this incrementally
182# if we wanted and avoid the overhead. but it makes it
183# easier to debug the tokenizer.
184sub _tokenize_expr {
185    my ($self, $expr)= @_;
186    delete $self->{tokens};
187    delete $self->{parse_tree};
188    $self->{original_expr}= $expr;
189
190    my @tokens;
191    while ($expr =~ s/$tokenize_pat//xs) {
192        push @tokens, {%+} if defined $+{'term'};
193    }
194    $self->{tokens}= \@tokens;
195    warn $self->dd($self) if $self->{debug};
196    if (length $expr) {
197        confess "Failed to tokenize_expr: $expr\n";
198    }
199    return \@tokens;
200}
201
202sub _count_ops {
203    my ($self, $term)= @_;
204    my $count= 0;
205    $count++ while $term =~ m/(?: \|\| | \&\& | \? )/gx;
206    return $count;
207}
208
209# sort terms in an expression in a way that puts things
210# in a sensible order. Anything starting with PERL_IN_
211# should be on the left in alphabetical order. Digits
212# should be on the right (eg 0), and ties are resolved
213# by stripping non-alpha-numerc, thus removing underbar
214# parens, spaces, logical operators, etc, and then by
215# lc comparison of the result.
216sub _sort_terms {
217    my $self= shift;
218    my (@terms)= map {
219        [
220            $_,                                # 0: raw
221            lc($_) =~ s/[^a-zA-Z0-9]//gr,      # 1: "_" stripped and caseless
222            $_     =~ m/PERL_IN_/  ? 1 : 0,    # 2: PERL_IN_ labeled define
223            $_     =~ m/^\d/       ? 1 : 0,    # 3: digit
224            $_     =~ m/DEBUGGING/ ? 1 : 0,    # 4: DEBUGGING?
225            $self->_count_ops($_),             # 5: Number of ops (||, &&)
226        ]
227    } @_;
228    my %seen;
229    #start-no-tidy
230    @terms= map { $seen{ $_->[0] }++ ? () : $_->[0] }
231        sort {
232            $a->[5] <=> $b->[5]         ||    # least number of ops
233            $b->[2] <=> $a->[2]         ||    # PERL_IN before others
234            $a->[3] <=> $b->[3]         ||    # digits after others
235            $a->[4] <=> $b->[4]         ||    # DEBUGGING after all else
236            $a->[1] cmp $b->[1]         ||    # stripped caseless cmp
237            lc($a->[0]) cmp lc($b->[0]) ||    # caseless cmp
238            $a->[0] cmp $b->[0]         ||    # exact cmp
239            0
240        } @terms;
241    #end-no-tidy
242    return @terms;
243}
244
245# normalize a condition expression by parsing it and then stringifying
246# the parse tree.
247sub tidy_cond {
248    my ($self, $expr)= @_;
249    my $ret= $self->{_tidy_cond_cache}{$expr} //= do {
250        $self->parse_expr($expr) if defined $expr;
251        my $text= $self->_pt_as_str();
252        $text;
253    };
254    $self->{last_expr}= $ret;
255    return $ret;
256}
257
258# convert a parse tree structure to a string recursively.
259#
260# Parse trees are currently made up of arrays, with the count
261# of items in the object determining the type of op it represents.
262# 1 argument:  literal value of some sort.
263# 2 arguments: unary operator: 0 slot is the operator, 1 is a parse tree
264#            : ternary: 0 slot holds '?', 1 is an array holding three
265#                       parse trees: cond, true, false
266# 3 arguments or more: binary operator. 0 slot is the op. 1..n are parse trees
267#                    : note, this is multigate for commutative operators like
268#                    : "+", "*", "&&" and "||", so an expr
269#                    : like "A && B && !C" would be represented as:
270#                    : [ "&&", ["A"], ["B"], [ "!",["C"] ] ]
271#
272sub _pt_as_str {
273    my ($self, $node, $parent_op, $depth)= @_;
274
275    $node ||= $self->{parse_tree}
276        or confess "No parse tree?";
277    $depth ||= 0;
278    if (@$node == 1) {
279
280        # its a literal
281        return $node->[0];
282    }
283    elsif (@$node == 2) {
284
285        # is this a ternary or an unop?
286        if ($node->[0] eq '?') {
287
288            # ternary, the three "parts" are tucked away in
289            # an array in the payload slot
290            my $expr=
291                  $self->_pt_as_str($node->[1][0], "?", $depth + 1) . " ? "
292                . $self->_pt_as_str($node->[1][1], "?", $depth + 1) . " : "
293                . $self->_pt_as_str($node->[1][2], "?", $depth + 1);
294
295            # stick parens on if this is a subexpression
296            $expr= "( " . $expr . " )" if $depth;
297            return $expr;
298        }
299        else {
300            if (    $node->[0] eq "!"
301                and @{ $node->[1] } == 2
302                and $node->[1][0] eq "!")
303            {
304                # normalize away !! in expressions.
305                return $self->_pt_as_str($node->[1][1], $parent_op, $depth);
306            }
307
308            # unop - the payload is a optree
309            return $node->[0]
310                . $self->_pt_as_str($node->[1], $node->[0], $depth + 1);
311        }
312    }
313
314    # if we get here we are dealing with a binary operator
315    # the nodes are not necessarily binary, as we "collect"
316    # the terms into a list, thus: A && B && C && D -> ['&&',A,B,C,D]
317    my ($op, @terms)= @$node;
318
319    # convert the terms to strings
320    @terms= map { $self->_pt_as_str($_, $op, $depth + 1) } @terms;
321
322    # sort them to normalize the subexpression
323    my $expr=
324        join " $op ", $associative{$op}
325        ? $self->_sort_terms(@terms)
326        : @terms;
327
328    # stick parens on if this is a subexpression
329    $expr= "( " . $expr . " )" if $depth and !$cmpop{$op};
330
331    # and we are done.
332    return $expr;
333}
334
335# Returns the precedence of an operator, returns 0 if there is no token
336# or the next token is not an op, or confesss if it encounters an op it does not
337# know.
338sub _precedence {
339    my $self= shift;
340    my $token= shift // return 0;
341
342    my $op= (ref $token ? $token->{op} : $token) // return 0;
343
344    return $precedence{$op} // confess "Unknown op '$op'";
345}
346
347# entry point into parsing the tokens, checks that we actually parsed everything
348# and didnt leave anything in the token stream (possible from a malformed expression)
349# Performs some minor textual cleanups using regexes, but then does a proper parse
350# of the expression.
351sub parse_expr {
352    my ($self, $expr)= @_;
353    if (defined $expr) {
354        $expr =~ s/\s*\\\n\s*/ /g;
355        $expr =~ s/defined\s+(\w+)/defined($1)/g;
356        $self->_tokenize_expr($expr);
357    }
358    my $ret= $self->_parse_expr();
359    if (@{ $self->{tokens} }) {
360
361        # if all was well with parsing we should not get here.
362        confess "Unparsed tokens: ", $self->dd($self->{tokens});
363    }
364    $self->{parse_tree}= $ret;
365    return $ret;
366}
367
368# this is just a wrapper around _parse_expr_assoc() which handles
369# parsing an arbitrary expression.
370sub _parse_expr {
371    my ($self)= @_;
372    return $self->_parse_expr_assoc($self->_parse_expr_primary(), 1);
373}
374
375# This handles extracting from the token stream
376#  - simple literals
377#  - unops (assumed to be right associative)
378#  - parens (which reset the precedence acceptable to the parser)
379#
380sub _parse_expr_primary {
381    my ($self)= @_;
382    my $tokens= $self->{tokens}
383        or confess "No tokens in _parse_expr_primary?";
384    my $first= $tokens->[0]
385        or confess "No primary?";
386    if ($first->{paren} and $first->{paren} eq "(") {
387        shift @$tokens;
388        my $expr= $self->_parse_expr();
389        $first= $tokens->[0];
390        if (!$first->{paren} or $first->{paren} ne ")") {
391            confess "Expecting close paren", $self->dd($tokens);
392        }
393        shift @$tokens;
394        return $expr;
395    }
396    elsif ($first->{op} and $unop{ $first->{op} }) {
397        my $op_token= shift @$tokens;
398        return [ $op_token->{op}, $self->_parse_expr_primary() ];
399    }
400    elsif (defined $first->{literal}) {
401        shift @$tokens;
402        return [ $first->{literal} ];
403    }
404    else {
405        die sprintf
406            "Unexpected token '%s', expecting literal, unary, or expression.\n",
407            $first->{term};
408    }
409}
410
411# This is the heart of the expression parser. It uses
412# a pair of nested loops to avoid excessive recursion during parsing,
413# which should be a bit faster than other strategies. It only should
414# recurse when the precedence level changes.
415sub _parse_expr_assoc {
416    my ($self, $lhs, $min_precedence)= @_;
417    my $tokens= $self->{tokens}
418        or confess "No tokens in _parse_expr_assoc";
419    my $la= $tokens->[0];                  # lookahead
420    my $la_pr= $self->_precedence($la);    # lookahead precedence
421    while ($la && $la_pr >= $min_precedence) {
422        my $op_token= shift @$tokens;
423        my $op_pr= $la_pr;                 # op precedence
424        if ($op_token->{op} eq "?") {
425            my $mid= $self->_parse_expr();
426            if (@$tokens and $tokens->[0]{op} and $tokens->[0]{op} eq ":") {
427                shift @$tokens;
428                my $tail= $self->_parse_expr();
429                return [ '?', [ $lhs, $mid, $tail ] ];
430            }
431            confess "Panic: expecting ':'", $self->dd($tokens);
432        }
433        my $rhs;
434        eval { $rhs= $self->_parse_expr_primary(); }
435            or die "Error in $op_names{$op_token->{op}} expression: $@";
436        $la= $tokens->[0];
437        $la_pr= $self->_precedence($la);
438        while (
439            $la_pr > $op_pr ||    # any and larger
440            (       $is_right_assoc{ $op_token->{op} }
441                and $la_pr == $op_pr)    # right and equal
442        ) {
443            my $new_precedence= $op_pr + ($la_pr > $op_pr ? 1 : 0);
444            $rhs= $self->_parse_expr_assoc($rhs, $new_precedence);
445            $la= $tokens->[0];
446            $la_pr= $self->_precedence($la);
447        }
448        if (   @$lhs >= 3
449            && $lhs->[0] eq $op_token->{op}
450            && $commutative{ $op_token->{op} })
451        {
452            push @$lhs, $rhs;
453        }
454        else {
455            my @lt= ($lhs);
456            my @rt= ($rhs);
457
458            # if we have '( a && b ) && ( c && d)'
459            # turn it into 'a && b && c && d'
460            if (@$lhs > 2 && $lhs->[0] eq $op_token->{op}) {
461                (undef,@lt)= @$lhs; # throw away op.
462            }
463            if (@$rhs > 2 && $rhs->[0] eq $op_token->{op}) {
464                (undef,@rt)= @$rhs; # throw away op.
465            }
466            $lhs= [ $op_token->{op}, @lt, @rt ];
467        }
468    }
469    return $lhs;
470}
471
472#entry point for normalizing and if/elif statements
473#returns the line and condition in normalized form.
474sub normalize_if_elif {
475    my ($self, $line, $line_info)= @_;
476    if (my $dat= $self->{cache_normalize_if_elif}{$line}) {
477        return $dat->{line}, $dat->{cond};
478    }
479    my ($cond);
480    eval {
481        ($line, $cond)= $self->_normalize_if_elif($line);
482        1;
483    } or die sprintf "Error at line %d\nLine %d: %s\n%s",
484        ($line_info->start_line_num()) x 2, $line, $@;
485    $self->{cache_normalize_if_elif}{$line}= { line => $line, cond => $cond };
486    return ($line, $cond);
487}
488
489#guts of the normalize_if_elif() - cleans up the line, extracts
490#the condition, and then tidies it with tidy_cond().
491sub _normalize_if_elif {
492    my ($self, $line)= @_;
493    my $nl= "";
494    $nl= $1 if $line =~ s/(\n+)\z//;
495    $line =~ s/\s+\z//;
496    my @comment;
497    push @comment, $1 while $line =~ s!\s*(/\*.*?\*/)\z!!;
498    $line =~ s/defined\s*\(\s*(\w+)\s*\)/defined($1)/g;
499    $line =~ s/!\s+defined/!defined/g;
500
501    if ($line =~ /^#((?:el)?if)(n?)def\s+(\w+)/) {
502        my $if= $1;
503        my $not= $2 ? "!" : "";
504        $line= "#$if ${not}defined($3)";
505    }
506    $line =~ s/#((?:el)?if)\s+//
507        or confess "Bad cond: $line";
508    my $if= $1;
509    $line =~ s/!\s+/!/g;
510
511    my $old_cond= $line;
512    my $cond= $self->tidy_cond($old_cond);
513
514    warn "cond - $old_cond\ncond + $cond\n"
515        if $old_cond ne $cond and $self->{debug};
516
517    $line= "#$if $cond";
518    $line .= "  " . join " ", reverse @comment if @comment;
519
520    $line .= $nl;
521    return ($line, $cond);
522}
523
524# parses a text buffer as though it was a file on disk
525# calls parse_fh()
526sub parse_text {
527    my ($self, $text)= @_;
528    local $self->{parse_source}= "(buffer)";
529    open my $fh, "<", \$text
530        or die "Failed to open buffer for read: $!";
531    return $self->parse_fh($fh);
532}
533
534# takes a readable filehandle and parses whatever contents is
535# returned by reading it. Returns an array of HeaderLine objects.
536# this is the main routing for parsing a header file.
537sub parse_fh {
538    my ($self, $fh)= @_;
539    my @lines;
540    my @cond;
541    my @cond_line;
542    my $last_cond;
543    local $self->{parse_source}= $self->{parse_source} || "(unknown)";
544    my $cb= $self->{pre_process_content};
545    $self->{orig_content}= "";
546    my $line_num= 1;
547
548    while (defined(my $line= readline($fh))) {
549        my $start_line_num= $line_num++;
550        $self->{orig_content} .= $line;
551        while ($line =~ /\\\n\z/ or $line =~ m</\*(?:(?!\*/).)*\s*\z>s) {
552            defined(my $read_line= readline($fh))
553                or last;
554            $self->{orig_content} .= $read_line;
555            $line_num++;
556            $line .= $read_line;
557        }
558        while ($line =~ m!/\*(.*?)(\*/|\z)!gs) {
559            my ($inner, $tail)= ($1, $2);
560            if ($tail ne "*/") {
561                confess
562                    "Unterminated comment starting at line $start_line_num\n";
563            }
564            elsif ($inner =~ m!/\*!) {
565                confess
566                    "Nested/broken comment starting at line $start_line_num\n";
567            }
568        }
569
570        my $raw= $line;
571        my $type= "content";
572        my $sub_type= "text";
573        my $level= @cond;
574        my $do_pop= 0;
575        my $flat= $line;
576        $flat =~ s/\s*\\\n\s*/ /g;
577        $flat =~ s!/\*.*?\*/! !gs;
578        $flat =~ s/\s+/ /g;
579        $flat =~ s/\s+\z//;
580        $flat =~ s/^\s*#\s*/#/g;
581
582        my $line_info=
583            HeaderLine->new(start_line_num => $start_line_num, raw => $raw);
584        my $do_cond_line;
585        if ($flat =~ /^#/) {
586            if ($flat =~ m/^(#(?:el)?if)(n?)def\s+(\w+)/) {
587                my $if= $1;
588                my $not= $2 ? "!" : "";
589                my $sym= $3;
590                $flat =~
591                    s/^(#(?:el)?if)(n?)def\s+(\w+)/$if ${not}defined($sym)/;
592            }
593            my $cond;    # used in various expressions below
594            if ($flat =~ /^#endif/) {
595                if (!@cond) {
596                    confess "Not expecting $flat";
597                }
598                $do_pop= 1;
599                $level--;
600                $type= "cond";
601                $sub_type= "#endif";
602            }
603            elsif ($flat =~ /^#if\b/) {
604                ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
605                push @cond,      [$cond];
606                push @cond_line, $line_info;
607                $type= "cond";
608                $sub_type= "#if";
609            }
610            elsif ($flat =~ /^#elif\b/) {
611                if (!@cond) {
612                    confess "No if for $flat";
613                }
614                $level--;
615                ($flat, $cond)= $self->normalize_if_elif($flat, $line_info);
616                $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
617                $cond_line[-1]= $line_info;
618                push @{ $cond[-1] }, $cond;
619                $type= "cond";
620                $sub_type= "#elif";
621            }
622            elsif ($flat =~ /^#else\b/) {
623                if (!@cond) {
624                    confess "No if for $flat";
625                }
626                $level--;
627                $cond[-1][-1]= $self->tidy_cond("!($cond[-1][-1])");
628                $cond_line[-1]= $line_info;
629                $type= "cond";
630                $sub_type= "#else";
631            }
632            elsif ($flat =~ /#undef/) {
633                $type= "content";
634                $sub_type= "#undef";
635            }
636            elsif ($flat =~ /#pragma\b/) {
637                $type= "content";
638                $sub_type= "#pragma";
639            }
640            elsif ($flat =~ /#include\b/) {
641                $type= "content";
642                $sub_type= "#include";
643            }
644            elsif ($flat =~ /#define\b/) {
645                $type= "content";
646                $sub_type= "#define";
647            }
648            elsif ($flat =~ /#error\b/) {
649                $type= "content";
650                $sub_type= "#error";
651            }
652            else {
653                confess "Do not know what to do with $line";
654            }
655            if ($type eq "cond") {
656
657                # normalize conditional lines
658                $line= $flat;
659                $last_cond= $line_info;
660            }
661        }
662        $line =~ s/\n?\z/\n/;
663
664        %$line_info= (
665            cond           => copy_aoa(\@cond),
666            type           => $type,
667            sub_type       => $sub_type,
668            raw            => $raw,
669            flat           => $flat,
670            line           => $line,
671            level          => $level,
672            source         => $self->{parse_source},
673            start_line_num => $start_line_num,
674            n_lines        => $line_num - $start_line_num,
675        );
676
677        push @lines, $line_info;
678        if ($do_pop) {
679            $line_info->{inner_lines}=
680                $line_info->start_line_num - $cond_line[-1]->start_line_num;
681            pop @cond;
682            pop @cond_line;
683        }
684        if ($type eq "content" and $cb) {
685            $cb->($self, $lines[-1]);
686        }
687    }
688    if (@cond_line) {
689        my $msg= "Unterminated conditional block starting line "
690            . $cond_line[-1]->start_line_num();
691        $msg .=
692            " with last conditional operation at line "
693            . $last_cond->start_line_num()
694            if $cond_line[-1] != $last_cond;
695        confess $msg;
696    }
697    $self->{lines}= \@lines;
698    return \@lines;
699}
700
701# returns the last lines we parsed.
702sub lines { $_[0]->{lines} }
703
704# assuming a line looks like an embed.fnc entry parse it
705# and normalize it, and create and EmbedLine object from it.
706sub tidy_embed_fnc_entry {
707    my ($self, $line_data)= @_;
708    my $line= $line_data->{line};
709    return $line if $line =~ /^\s*:/;
710    return $line unless $line_data->{type} eq "content";
711    return $line unless $line =~ /\|/;
712
713    $line =~ s/\s*\\\n/ /g;
714    $line =~ s/\s+\z//;
715    ($line)= expand($line);
716    my ($flags, $ret, $name, @args)= split /\s*\|\s*/, $line;
717    my %flag_seen;
718    $flags= join "", grep !$flag_seen{$_}++, sort split //, $flags;
719    if ($flags =~ s/^#//) {
720        $flags .= "#";
721    }
722    if ($flags eq "#") {
723        die "Not allowed to use only '#' for flags"
724            . "in 'embed.fnc' at line $line_data->{start_line_num}";
725    }
726    if (!$flags) {
727        die "Missing flags in function definition"
728            . " in 'embed.fnc' at line $line_data->{start_line_num}\n"
729            . "Did you a forget a line continuation on the previous line?\n";
730    }
731    for ($ret, @args) {
732        s/(\w)\*/$1 */g;
733        s/\*\s+(\w)/*$1/g;
734        s/\*const/* const/g;
735    }
736    my $head= sprintf "%-8s|%-7s", $flags, $ret;
737    $head .= sprintf "|%*s", -(31 - length($head)), $name;
738    if (@args and length($head) > 32) {
739        $head .= "\\\n";
740        $head .= " " x 32;
741    }
742    foreach my $ix (0 .. $#args) {
743        my $arg= $args[$ix];
744        $head .= "|$arg";
745        $head .= "\\\n" . (" " x 32) if $ix < $#args;
746    }
747    $line= $head . "\n";
748
749    if ($line =~ /\\\n/) {
750        my @lines= split /\s*\\\n/, $line;
751        my $len= length($lines[0]);
752        $len < length($_) and $len= length($_) for @lines;
753        $len= int(($len + 7) / 8) * 8;
754        $len= 72 if $len < 72;
755        $line= join("\\\n",
756            (map { sprintf "%*s", -$len, $_ } @lines[ 0 .. $#lines - 1 ]),
757            $lines[-1]);
758    }
759    ($line)= unexpand($line);
760
761    $line_data->{embed}= EmbedLine->new(
762        flags       => $flags,
763        return_type => $ret,
764        name        => $name,
765        args        => \@args,
766    );
767    $line =~ s/\s+\z/\n/;
768    $line_data->{line}= $line;
769    return $line;
770}
771
772# line up the text in a multiline string by a given $fragment
773# of text, inserting whitespace in front or behind the $fragment
774# to get the text to line up. Returns the text. This is wrapped
775# by line_up() and is used to wrap long conditions and comments
776# in the generated code.
777sub _line_up_frag {
778    my ($self, $str, $fragment)= @_;
779    die "has tabs?!" if $str =~ /\t/;
780    my @lines= split /\n/, $str;
781    my $changed= 1;
782    while ($changed) {
783        $changed= 0;
784        foreach my $ix (0 .. $#lines - 1) {
785            my $f_index= 0;
786            my $n_index= 0;
787            while (1) {
788                $f_index= index($lines[$ix],       $fragment, $f_index);
789                $n_index= index($lines[ $ix + 1 ], $fragment, $n_index);
790                if ($f_index == -1 or $n_index == -1) {
791                    last;
792                }
793                if ($f_index < $n_index) {
794                    my $f_idx= $f_index;
795                    $f_idx-- while substr($lines[$ix], $f_idx, 1) ne " ";
796                    substr($lines[$ix], $f_idx, 0, " " x ($n_index - $f_index));
797                    $changed++;
798                    last;
799                }
800                elsif ($n_index < $f_index) {
801                    my $n_idx= $n_index;
802                    $n_idx-- while substr($lines[ $ix + 1 ], $n_idx, 1) ne " ";
803                    substr($lines[ $ix + 1 ],
804                        $n_idx, 0, " " x ($f_index - $n_index));
805                    $changed++;
806                    last;
807                }
808                $f_index++;
809                $n_index++;
810            }
811        }
812    }
813    my $ret= join "", map { "$_\n" } @lines;
814    return $ret;
815}
816
817sub _fixup_indent {
818    my ($self, $line)= @_;
819    my @lines= split /\n/, $line;
820    if ($lines[0]=~/^(#\s*\w+(?:\s*\/\*)?\s)(\s+)/) {
821        my $first_left_len = length $1;
822
823        while (1) {
824            my $ok = 1;
825            for (@lines) {
826                /^.{$first_left_len} /
827                    or do { $ok = 0; last; };
828            }
829            if ($ok) {
830                s/^(.{$first_left_len}) /$1/ for @lines;
831            } else {
832                last;
833            }
834        }
835    }
836
837    if ($lines[0]=~/^(#\s*\w+\s+)\(/) {
838        my $len = length($1);
839        for my $idx (1..$#lines) {
840            $lines[$idx]=~s/^([ ]{$len})(\s+)(\()/$1$3$2/;
841        }
842    }
843    my $ret= join "", map { "$_\n" } @lines;
844    return $ret;
845}
846
847# this is the workhorse for _break_line_at_op().
848sub __break_line_at_op {
849    my ($self, $limit, $line, $blank_prefix)= @_;
850    my @lines= ("");
851    while (length $line) {
852        my $part;
853        if ($line =~ s/^(.*?(?:\|\||&&)\s+)//) {
854            $part= $1;
855        }
856        else {
857            $part= $line;
858            $line= "";
859        }
860        if (length($lines[-1]) + length($part) < $limit) {
861            $lines[-1] .= $part;
862        }
863        else {
864            push @lines, $blank_prefix . $part;
865        }
866    }
867    return \@lines;
868}
869
870# Break a condition line into parts, while trying to keep the last
871# token on each line being an operator like || or && or ? or : We try
872# to keep each line at $limit characters, however, we also try to
873# ensure that each line has the same number of operators on it such
874# that across all the lines there are only two counts of operators (eg,
875# we either way each line to have two operators on it, or 0, or 1 or 0,
876# or 2 or 1, and so on.) If we cannot meet this requirement we reduce
877# the limit by 1 and try again, until we meet the objective, or the
878# limit ends up at 70 chars or less.
879sub _break_line_at_op {
880    my ($self, $limit, $line, $blank_prefix)= @_;
881    my $lines;
882    while (1) {
883        $lines= $self->__break_line_at_op($limit, $line, $blank_prefix);
884        my %op_counts;
885        foreach my $line_idx (0 .. $#$lines) {
886            my $line= $lines->[$line_idx];
887            my $count= 0;
888            $count++ while $line =~ /(\|\||&&|\?|:)/g;
889            $op_counts{$count}++;
890
891        }
892        if ($limit <= $self->{min_break_width} || keys(%op_counts) <= 2) {
893            last;
894        }
895        $limit--;
896    }
897
898    s/\s*\z/\n/ for @$lines;
899    return join "", @$lines;
900}
901
902sub _max { # cant use Scalar::Util so we roll our own
903    my $max= shift;
904    $max < $_ and $max= $_ for @_;
905    return $max;
906}
907
908# take a condition, split into $type and $rest
909# wrap it, and try to line up operators and defined() functions
910# that it contains. This is rather horrible code, but it does a
911# reasonable job applying the heuristics we need to lay our the
912# conditions in a reasonable way.
913sub _wrap_and_line_up_cond {
914    my ($self, $type, $rest)= @_;
915
916    my $limit= $self->{max_width};
917
918    # extract the expression part of the line, and normalize it, we do
919    # this here even though it might be duplicative as it is possible
920    # that the caller code has munged the expression in some way, and we
921    # might want to simplify the expression first. Eg:
922    # 'defined(FOO) && (defined(BAR) && defined(BAZ))' should be turned into
923    # 'defined(FOO) && defined(BAR) && defined(BAZ)' if possible.
924    my $rest_head= "";
925    my $rest_tail= "";
926    if ($rest =~ s!(if\s+)!!) {
927        $rest_head= $1;
928    }
929    if ($rest =~ s!(\s*/\*.*?\*/)\s*\z!! || $rest =~ s!(\s*\*/\s*)\z!!) {
930        $rest_tail= $1;
931    }
932    if ($rest) {
933        $rest= $self->tidy_cond($rest);
934        $rest= $rest_head . $rest . $rest_tail;
935    }
936
937    my $l= length($type);
938    my $line= $type;
939    $line .= $rest if length($rest);
940    my $blank_prefix= " " x $l;
941
942    # at this point we have a single line with the entire expression on it
943    # if it fits on one line we are done, we can return it right away.
944    if (length($line) <= $limit) {
945        $line =~ s/\s*\z/\n/;
946        return $line;
947    }
948    my $rest_copy= $rest;
949    my @fragments;
950    my $op_pat= qr/(?:\|\||&&|[?:])/;
951
952    # does the $rest contain a parenthesized group? If it does then
953    # there are a mixture of different ops being used, as if it was all
954    # the same opcode there would not be a parenthesized group.
955    # If it does then we handle it differently, and try to put the
956    # different parts of the expression on their own line.
957    if ($rest_copy =~ /$op_pat\s*\(/) {
958        my @parts;
959        while (length $rest_copy) {
960            if ($rest_copy =~ s/^(.*?$op_pat)(\s*!?\()/$2/) {
961                push @parts, $1;
962            } else {
963                #$rest_copy=~s/^\s+//;
964                push @parts, $rest_copy;
965                last;
966            }
967        }
968        $parts[0]= $type . $parts[0];
969        $parts[$_]= $blank_prefix . $parts[$_] for 1 .. $#parts;
970        foreach my $line (@parts) {
971            if (length($line) > $limit) {
972                $line= $self->_break_line_at_op($limit, $line, $blank_prefix);
973            }
974        }
975        s/\s*\z/\n/ for @parts;
976        $line= join "", @parts;
977        @fragments= ("defined", "||");
978    }
979    else {
980        # the expression consists of just one opcode type, so we can use
981        # simpler logic to break it apart with the objective of ensuring
982        # that the lines are similarly formed with trailing operators on
983        # each line but the last.
984        @fragments= ("||", "defined");
985        $line= $self->_break_line_at_op($limit, $type . $rest, $blank_prefix);
986    }
987
988    # try to line up the text on different lines. We stop after
989    # the first $fragment that modifies the text. The order
990    # of fragments we try is determined above based on the type
991    # of condition this is.
992    my $pre_line= $line;
993    foreach my $fragment (@fragments) {
994        $line= $self->_line_up_frag($line, $fragment);
995        last if $line ne $pre_line;
996    }
997
998    # if we have lined up by "defined" in _line_up_frag()
999    # then we may have " ||        defined(...)" type expressions
1000    # convert these to "        || defined(...)" as it looks better.
1001    $line =~ s/( )(\|\||&&|[()?:])([ ]{2,})(!?defined)/$3$2$1$4/g;
1002    $line =~ s/(\|\||&&|[()?:])[ ]{10,}/$1 /g;
1003
1004    # add back the line continuations. this is all pretty inefficient,
1005    # but it works nicely.
1006    my @lines= split /\n/, $line;
1007    my $last= pop @lines;
1008    my $max_len= _max(map { length $_ } @lines);
1009    $_= sprintf "%*s \\\n", -$max_len, $_ for @lines;
1010    $last .= "\n";
1011
1012    $line= join "", @lines, $last;
1013
1014    # remove line continuations that are inside of a comment,
1015    # we may have a variable number of lines of the expression
1016    # or parts of lines of the expression in a comment, so
1017    # we do this last.
1018    $line =~ s!/\* (.*) \*/
1019              !"/*"._strip_line_cont("$1")."*/"!xsge;
1020
1021    return $self->_fixup_indent($line);
1022}
1023
1024#remove line continuations from the argument.
1025sub _strip_line_cont {
1026    my ($string)= @_;
1027    $string =~ s/\s*\\\n/\n/g;
1028    return $string;
1029}
1030
1031# Takes an array of HeaderLines objects produced by parse_fh()
1032# or by group_content(), and turn it into a string.
1033sub lines_as_str {
1034    my ($self, $lines, $post_process_content)= @_;
1035    $lines ||= $self->{lines};
1036    my $ret;
1037    $post_process_content ||= $self->{post_process_content};
1038    my $filter= $self->{filter_content};
1039    my $last_line= "";
1040
1041    #warn $self->dd($lines);
1042    foreach my $line_data (@$lines) {
1043        my $line= $line_data->{line};
1044        if ($line_data->{type} ne "content" or $line_data->{sub_type} ne "text")
1045        {
1046            my $level= $line_data->{level};
1047            my $ind= $self->indent_chars($level);
1048            $line =~ s/^#(\s*)/#$ind/;
1049        }
1050        if ($line_data->{type} eq "cond") {
1051            my $add_commented_expr_after= $self->{add_commented_expr_after};
1052            if ($line_data->{sub_type} =~ /#(?:else|endif)/) {
1053                my $joined= join " && ",
1054                    map { "($_)" } @{ $line_data->{cond}[-1] };
1055                my $cond_txt= $self->tidy_cond($joined);
1056                $cond_txt= "if $cond_txt" if $line_data->{sub_type} eq "#else";
1057                $line =~ s!\s*\z! /* $cond_txt */\n!
1058                    if $line_data->{inner_lines} >= $add_commented_expr_after;
1059            }
1060            elsif ($line_data->{sub_type} eq "#elif") {
1061                my $last_frame= $line_data->{cond}[-1];
1062                my $joined= join " && ",
1063                    map { "($_)" } @$last_frame[ 0 .. ($#$last_frame - 1) ];
1064                my $cond_txt= $self->tidy_cond($joined);
1065                $line =~ s!\s*\z! /* && $cond_txt */\n!
1066                    if $line_data->{inner_lines} >= $add_commented_expr_after;
1067            }
1068        }
1069        $line =~ s/\s*\z/\n/;
1070        if ($last_line eq "\n" and $line eq "\n") {
1071            next;
1072        }
1073        $last_line= $line;
1074        if ($line_data->{type} eq "cond") {
1075            $line =~ m!(^\s*#\s*\w+[ ]*)([^/].*?\s*)?(/\*.*)?\n\z!
1076                or die "Failed to split cond line: $line";
1077            my ($type, $cond, $comment)= ($1, $2, $3);
1078            $comment //= "";
1079            $cond    //= "";
1080            my $new_line;
1081            if (!length($cond) and $comment) {
1082                $comment =~ s!^(/\*\s+)!!
1083                    and $type .= $1;
1084            }
1085
1086            $line= $self->_wrap_and_line_up_cond($type, $cond . $comment);
1087        }
1088        $line_data->{line}= $line;
1089        if ($post_process_content and $line_data->{type} eq "content") {
1090            $post_process_content->($self, $line_data);
1091        }
1092        if ($filter and $line_data->{type} eq "content") {
1093            $filter->($self, $line_data) or next;
1094        }
1095        $ret .= $line_data->{line};
1096    }
1097    return $ret;
1098}
1099
1100# Text::Wrap::wrap has an odd api, so hide it behind a wrapper
1101# sub which sets things up properly.
1102sub _my_wrap {
1103    my ($head, $rest, $line)= @_;
1104    local $Text::Wrap::unexpand= 0;
1105    local $Text::Wrap::huge= "overflow";
1106    local $Text::Wrap::columns= 78;
1107    unless (length $line) { return $head }
1108    $line= wrap $head, $rest, $line;
1109    return $line;
1110}
1111
1112# recursively extract the && expressions from a parse tree,
1113# returning the result as strings.
1114# if $node is not a '&&' op then it returns $node as a string,
1115# otherwise it returns the string form of the arguments to the
1116# '&&' op, recursively flattening any '&&' nodes that it might
1117# contain.
1118sub _and_clauses {
1119    my ($self, $node)= @_;
1120
1121    my @ret;
1122    if (@$node < 3 or $node->[0] ne "&&") {
1123        return $self->_pt_as_str($node);
1124    }
1125    foreach my $idx (1 .. $#$node) {
1126        push @ret, $self->_and_clauses($node->[$idx]);
1127    }
1128    return @ret;
1129}
1130
1131# recursively walk the a parse tree, and return the literal
1132# terms it contains, ignoring any operators in the optree.
1133sub _terms {
1134    my ($self, $node)= @_;
1135    if (@$node == 1) {
1136        return $self->_pt_as_str($node);
1137    }
1138    my @ret;
1139    if (@$node == 2) {
1140        if ($node->[0] eq "?") {
1141            push @ret, map { $self->_terms($_) } @{ $node->[1] };
1142        }
1143        else {
1144            push @ret, $self->_terms($node->[1]);
1145        }
1146    }
1147    else {
1148        foreach my $i (1 .. $#$node) {
1149            push @ret, $self->_terms($node->[$i]);
1150        }
1151    }
1152    return @ret;
1153}
1154
1155# takes a HeaderLine "cond" AoA and flattens it into
1156# a single expression, and then extracts all the and clauses
1157# it contains. Thus [['defined(A)'],['defined(B)']] and
1158# [['defined(A) && defined(B)']], end up as ['defined(A)','defined(B)']
1159sub _flatten_cond {
1160    my ($self, $cond_ary)= @_;
1161
1162    my $expr= join " && ", map {
1163        map { "($_)" }
1164            @$_
1165    } @$cond_ary;
1166    return [] unless $expr;
1167    my $tree= $self->parse_expr($expr);
1168    my %seen;
1169    my @and_clause= grep { !$seen{$_}++ } $self->_and_clauses($tree);
1170    return \@and_clause;
1171}
1172
1173# Find the best path into a tree of conditions, such that
1174# we reuse the maximum number of existing branches. Returning
1175# two arrays, the first contain the parts of $cond_array that
1176# make up the best path, in the best path order, and a second array
1177# with the remaining items in the initial order they were provided.
1178# Thus if we have previously stored only the path "A", "B", "C"
1179# into the tree, and want to find the best path for
1180# ["E","D","C","B","A"] we should return: ["A","B","C"],["E","D"],
1181#
1182# This used to reduce the number of conditions in the grouped content,
1183# and is especially helpful with dealing with DEBUGGING related
1184# functionality. It is coupled with careful control over the order
1185# that we add paths and conditions to the tree.
1186sub _best_path {
1187    my ($self, $tree_node, $cond_array, @path)= @_;
1188    my $best= \@path;
1189    my $rest= $cond_array;
1190    foreach my $cond (@$cond_array) {
1191        if ($tree_node->{$cond}) {
1192            my ($new_best, $new_rest)=
1193                $self->_best_path($tree_node->{$cond},
1194                [ grep $_ ne $cond, @$cond_array ],
1195                @path, $cond);
1196            if (@$new_best > @$best) {
1197                ($best, $rest)= ($new_best, $new_rest);
1198            }
1199        }
1200    }
1201    if (@$best == @path) {
1202        foreach my $cond (@$cond_array) {
1203            my $not_cond= $self->tidy_cond("!($cond)");
1204            if ($tree_node->{$not_cond}) {
1205                $best= [ @path, $cond ];
1206                $rest= [ grep $_ ne $cond, @$cond_array ];
1207                last;
1208            }
1209        }
1210    }
1211    return ($best, $rest);
1212}
1213
1214# This builds a group content tree from a set of lines. each content line in
1215# the original file is added to the file based on the conditions that apply to
1216# the content.
1217#
1218# The tree is made up of nested HoH's with keys in the HoH being normalized
1219# clauses from the {cond} data in the HeaderLine objects.
1220#
1221# Care is taken to minimize the number of pathways and to reorder clauses to
1222# reuse existing pathways and minimize the total number of conditions in the
1223# file.
1224#
1225# The '' key of a hash contains an array of the lines that are part of the
1226# condition that lead to that key. Thus lines with no conditions are in
1227# @{$tree{''}}, lines with the condition "defined(A) && defined(B)" would be
1228# in $tree{"defined(A)"}{"defined(B)"}{""}.
1229#
1230# The result of this sub is normally passed into __recurse_group_content_tree()
1231# which converts it back into a set of HeaderLine objects.
1232#
1233sub _build_group_content_tree {
1234    my ($self, $lines)= @_;
1235    $lines ||= $self->{lines};
1236    my $filter= $self->{filter_content};
1237    my %seen_normal;
1238    foreach my $line_data (@$lines) {
1239        next if $line_data->{type} ne "content";
1240        next if $filter and !$filter->($self, $line_data);
1241        my $cond_frames= $line_data->{cond};
1242        my $cond_frame= $self->_flatten_cond($cond_frames);
1243        my $flat_merged= join " && ", map "($_)", @$cond_frame;
1244        my $normalized;
1245        if (@$cond_frame) {
1246            $normalized= $self->tidy_cond($flat_merged);
1247        }
1248        else {
1249            $normalized= $flat_merged;    # empty string
1250        }
1251        push @{ $seen_normal{$normalized} }, $line_data;
1252    }
1253    my @debugging;
1254    my @non_debugging;
1255    foreach my $key (keys %seen_normal) {
1256        if ($key =~ /DEBUGGING/) {
1257            push @debugging, $key;
1258        }
1259        else {
1260            push @non_debugging, $key;
1261        }
1262    }
1263    @non_debugging=
1264        sort { length($a) <=> length($b) || $a cmp $b } @non_debugging;
1265    @debugging= sort { length($b) <=> length($a) || $a cmp $b } @debugging;
1266    my %tree;
1267    foreach my $normal_expr (@non_debugging, @debugging) {
1268        my $all_line_data= $seen_normal{$normal_expr};
1269
1270        my $cond_frame=
1271            (length $normal_expr)
1272            ? $self->_flatten_cond([ [$normal_expr] ])
1273            : [];
1274        @$cond_frame= $self->_sort_terms(@$cond_frame);
1275        my $node= \%tree;
1276        my ($best, $rest)= $self->_best_path($node, $cond_frame);
1277        die sprintf "Woah: %d %d %d", 0 + @$best, 0 + @$rest, 0 + @$cond_frame
1278            unless @$best + @$rest == @$cond_frame;
1279
1280        foreach my $cond (@$best, @$rest) {
1281            $node= $node->{$cond} ||= {};
1282        }
1283        push @{ $node->{''} }, @$all_line_data;
1284    }
1285
1286    warn $self->dd(\%tree) if $self->{debug};
1287    $self->{tree}= \%tree;
1288    return \%tree;
1289}
1290
1291sub _recurse_group_content_tree {
1292    my ($self, $node, @path)= @_;
1293
1294    my @ret;
1295    local $self->{rgct_ret}= \@ret;
1296    local $self->{line_by_depth}= [];
1297
1298    $self->__recurse_group_content_tree($node, @path);
1299    return \@ret;
1300}
1301
1302# convert a tree of conditions constructed by _build_group_content_tree()
1303# and turn it into a set of HeaderLines that represents it. Performs the
1304# appropriate sets required to reconstitute an if/elif/elif/else sequence
1305# by calling _handle_else().
1306sub __recurse_group_content_tree {
1307    my ($self, $node, @path)= @_;
1308    my $depth= 0 + @path;
1309    my $ind= $self->indent_chars($depth);
1310    my $ret= $self->{rgct_ret};
1311    if ($node->{''}) {
1312        if (my $cb= $self->{post_process_grouped_content}) {
1313            $cb->($self, $node->{''}, \@path);
1314        }
1315        if (my $cb= $self->{post_process_content}) {
1316            $cb->($self, $_, \@path) for @{ $node->{''} };
1317        }
1318        push @$ret, map {
1319            HeaderLine->new(
1320                %$_,
1321                cond           => [@path],
1322                level          => $depth,
1323                start_line_num => 0 + @$ret
1324            )
1325        } @{ $node->{''} };
1326    }
1327
1328    my %skip;
1329    foreach my $expr (
1330        map  { $_->[0] }
1331        sort { $a->[1] cmp $b->[1] || $a->[0] cmp $b->[0] }
1332        map  { [ $_, lc($_) =~ s/[^A-Za-z0-9]+//gr ] } keys %$node
1333    ) {
1334        next unless length $expr;    # ignore payload
1335        my $not= $self->tidy_cond("!($expr)");
1336        if ($skip{$expr} or ($not !~ /^!/ and $node->{$not})) {
1337            next;
1338        }
1339        my $kid= $node->{$expr};
1340        while (!$node->{$not} and keys(%$kid) == 1 and !$kid->{''}) {
1341            my ($kid_key)= keys(%$kid);
1342            $expr= $self->tidy_cond("($expr) && ($kid_key)");
1343            $kid= $kid->{$kid_key};
1344            my $new_not= $self->tidy_cond("!($expr)");
1345            if ($node->{$new_not}) {
1346                $not= $new_not;
1347                $skip{$not}++;
1348            }
1349        }
1350        my $raw= "#${ind}if $expr\n";
1351        my $hl= HeaderLine->new(
1352            type           => "cond",
1353            sub_type       => "#if",
1354            raw            => $raw,
1355            line           => $raw,
1356            level          => $depth,
1357            cond           => [ @path, [$expr] ],
1358            start_line_num => 0 + @$ret,
1359        );
1360        $self->{line_by_depth}[$depth]= 0 + @$ret;
1361        push @$ret, $hl;
1362        $self->__recurse_group_content_tree($kid, @path, [$expr]);
1363        if ($node->{$not}) {
1364            $skip{$not}++;
1365            $self->_handle_else($not, $node->{$not}, $ind, $depth, @path,
1366                [$not]);
1367        }
1368
1369        # and finally the #endif
1370
1371        $raw= "#${ind}endif\n";
1372
1373        # we need to extract the condition information from the last line in @ret,
1374        # as we don't know which condition we are ending here. It could be an elsif
1375        # from deep in the parse tree for instance.
1376        # So we need to extract the last frame from the cond structure in the last
1377        # line-info in @ret.
1378        # BUT if this last line is itself an #endif, then we need to take the second
1379        # to last line instead, as the endif would have "popped" that frame off the
1380        # condition stack.
1381        my $last_ret= $ret->[-1];
1382        my $idx=
1383            ($last_ret->{type} eq "cond" && $last_ret->{sub_type} eq "#endif")
1384            ? -2
1385            : -1;
1386        my $end_line= HeaderLine->new(
1387            type           => "cond",
1388            sub_type       => "#endif",
1389            raw            => $raw,
1390            line           => $raw,
1391            level          => $depth,
1392            cond           => [ @path, $last_ret->{cond}[$idx] ],
1393            start_line_num => 0 + @$ret,
1394            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1395        );
1396        undef $self->{line_by_depth}[$depth];
1397        push @$ret, $end_line;
1398    }
1399    return $ret;
1400}
1401
1402# this handles the specific case of an else clause, detecting
1403# when an elif can be constructed, may recursively call itself
1404# to deal with if/elif/elif/else chains. Calls back into
1405# __recurse_group_content_tree().
1406sub _handle_else {
1407    my ($self, $not, $kid, $ind, $depth, @path)= @_;
1408
1409    # extract the first 3 keys - from this we can detect
1410    # which of the three scenarios we have to handle.
1411    my ($k1, $k2, $k3)=
1412        sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
1413    my $not_k1;
1414    if (length($k1) and defined($k2) and !defined($k3)) {
1415
1416        # if we do not have a payload (length($k1)) and we have exactly
1417        # two keys (defined($k2) and !defined($k3)) we need to compute
1418        # the inverse of $k1, which we will use later.
1419        $not_k1= $self->tidy_cond("!($k1)");
1420    }
1421    my $ret= $self->{rgct_ret};
1422    if (length($k1) and !defined($k2)) {
1423
1424        # only one child, no payload -> elsif $k1
1425        my $sub_expr;
1426        do {
1427            $sub_expr=
1428                 !$sub_expr
1429                ? $k1
1430                : $self->tidy_cond("($sub_expr) && ($k1)");
1431            $kid= $kid->{$k1};
1432            ($k1, $k2)=
1433                sort { length($a) <=> length($b) || $a cmp $b } keys %$kid;
1434        } while length($k1) and !defined $k2;
1435
1436        my $raw= "#${ind}elif $sub_expr\n";
1437        push @{ $path[-1] }, $sub_expr;
1438        my $hl= HeaderLine->new(
1439            type           => "cond",
1440            sub_type       => "#elif",
1441            raw            => $raw,
1442            line           => $raw,
1443            level          => $depth,
1444            cond           => [ map { [@$_] } @path ],
1445            start_line_num => 0 + @$ret,
1446            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1447        );
1448        $self->{line_by_depth}[$depth]= 0 + @$ret;
1449        push @$ret, $hl;
1450        $self->__recurse_group_content_tree($kid, @path);
1451    }
1452    elsif (defined($not_k1) and $not_k1 eq $k2) {
1453
1454        # two children which are complementary, no payload -> elif $k1 else..
1455        my $raw= "#${ind}elif $k1\n";
1456
1457        push @{ $path[-1] }, $k1;
1458        my $hl= HeaderLine->new(
1459            type           => "cond",
1460            sub_type       => "#elif",
1461            raw            => $raw,
1462            line           => $raw,
1463            level          => $depth,
1464            cond           => [ map { [@$_] } @path ],
1465            start_line_num => 0 + @$ret,
1466            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1467        );
1468        $self->{line_by_depth}[$depth]= 0 + @$ret;
1469        push @$ret, $hl;
1470        $self->__recurse_group_content_tree($kid->{$k1}, @path);
1471        $path[-1][-1]= $k2;
1472        $self->_handle_else($k2, $kid->{$k2}, $ind, $depth, @path);
1473    }
1474    else {
1475        # payload, 3+ children, or 2 which are not complementary -> else
1476        my $raw= "#${ind}else\n";
1477        my $hl= HeaderLine->new(
1478            type           => "cond",
1479            sub_type       => "#else",
1480            raw            => $raw,
1481            line           => $raw,
1482            level          => $depth,
1483            cond           => [ map { [@$_] } @path ],
1484            start_line_num => 0 + @$ret,
1485            inner_lines    => @$ret - $self->{line_by_depth}[$depth],
1486        );
1487        $self->{line_by_depth}[$depth]= 0 + @$ret;
1488        push @$ret, $hl;
1489        $self->__recurse_group_content_tree($kid, @path);
1490    }
1491    return $ret;
1492}
1493
1494# group the content in lines by the condition that apply to them
1495# returns a set of lines representing the new structure
1496sub group_content {
1497    my ($self, $lines, $filter)= @_;
1498    $lines ||= $self->{lines};
1499    local $self->{filter_content}= $filter || $self->{filter_content};
1500    my $tree= $self->_build_group_content_tree($lines);
1501    return $self->_recurse_group_content_tree($tree);
1502}
1503
1504#read a file by name - opens the file and passes the fh into parse_fh().
1505sub read_file {
1506    my ($self, $file_name, $callback)= @_;
1507    $self= $self->new() unless ref $self;
1508    local $self->{parse_source}= $file_name;
1509    open my $fh, "<", $file_name
1510        or confess "Failed to open '$file_name' for read: $!";
1511    my $lines= $self->parse_fh($fh);
1512    if ($callback) {
1513        foreach my $line (@$lines) {
1514            $callback->($self, $line);
1515        }
1516    }
1517    return $self;
1518}
1519
1520# These are utility methods for the HeaderLine objects.
1521sub HeaderLine::new {
1522    my ($class, %self)= @_;
1523    return bless \%self, $class;
1524}
1525sub HeaderLine::cond        { $_[0]->{cond} }                             # AoA
1526sub HeaderLine::type        { $_[0]->{type} }
1527sub HeaderLine::type_is     { return $_[0]->type eq $_[1] ? 1 : 0 }
1528sub HeaderLine::sub_type    { $_[0]->{sub_type} }
1529sub HeaderLine::sub_type_is { return $_[0]->sub_type eq $_[1] ? 1 : 0 }
1530sub HeaderLine::raw         { $_[0]->{raw} }
1531sub HeaderLine::flat        { $_[0]->{flat} }
1532sub HeaderLine::line        { $_[0]->{line} }
1533sub HeaderLine::level       { $_[0]->{level} }
1534sub HeaderLine::is_content  { return $_[0]->type_is("content") }
1535sub HeaderLine::is_cond     { return $_[0]->type_is("cond") }
1536sub HeaderLine::is_define   { return $_[0]->sub_type_is("#define") }
1537sub HeaderLine::line_num    { $_[0]->{start_line_num} }
1538sub HeaderLine::inner_lines { $_[0]->{inner_lines} }
1539sub HeaderLine::n_lines     { $_[0]->{n_lines} }
1540sub HeaderLine::embed       { $_[0]->{embed} }
1541*HeaderLine::start_line_num= *HeaderLine::line_num;
1542
1543# these are methods for EmbedLine objects
1544*EmbedLine::new= *HeaderLine::new;
1545sub EmbedLine::flags       { $_[0]->{flags} }
1546sub EmbedLine::return_type { $_[0]->{return_type} }
1547sub EmbedLine::name        { $_[0]->{name} }
1548sub EmbedLine::args        { $_[0]->{args} }          # array ref
1549
15501;
1551
1552__END__
1553
1554=head1 NAME
1555
1556HeaderParser - A minimal header file parser that can be hooked by other porting
1557scripts.
1558
1559=head1 SYNOPSIS
1560
1561    my $o= HeaderParser->new();
1562    my $lines= $o->parse_fh($fh);
1563
1564=head1 DESCRIPTION
1565
1566HeaderParser is a tool to parse C preprocessor header files. The tool
1567understands the syntax of preprocessor conditions, and is capable of creating
1568a parse tree of the expressions involved, and normalizing them as well.
1569
1570C preprocessor files are a bit tricky to parse properly, especially with a
1571"line by line" model. There are two issues that must be dealt with:
1572
1573=over 4
1574
1575=item Line Continuations
1576
1577Any line ending in "\\\n" (that is backslash newline) is considered to be part
1578of a longer string which continues on the next line. Processors should replace
1579the "\\\n" typically with a space when converting to a "real" line.
1580
1581=item Comments Acting As A Line Continuation
1582
1583The rules for header files stipulates that C style comments are stripped
1584before processing other content, this means that comments can serve as a form
1585of line continuation:
1586
1587    #if defined(foo) /*
1588    */ && defined(bar)
1589
1590is the same as
1591
1592    #if defined(foo) && defined(bar)
1593
1594This type of comment usage is often overlooked by people writing header file
1595parsers for the first time.
1596
1597=item Indented pre processor directives.
1598
1599It is easy to forget that there may be multiple spaces between the "#"
1600character and the directive. It also easy to forget that there may be spaces
1601in *front* of the "#" character. Both of these cases are often overlooked.
1602
1603=back
1604
1605The main idea of this module is to provide a single framework for correctly
1606parsing the content of our header files in a consistent manner. A secondary
1607purpose it to make various tasks we want to do easier, such as normalizing
1608content or preprocessor expressions, or just extracting the real "content" of
1609the file properly.
1610
1611=head2 parse_fh
1612
1613This function parses a filehandle into a set of lines.  Each line is represented by a hash
1614based object which contains the following fields:
1615
1616    bless {
1617        cond     => [['defined(a)'],['defined(b)']],
1618        type     => "content",
1619        sub_type => undef,
1620        raw      => $raw_content_of_line,
1621        line     => $normalized_content_of_line,
1622        level    => $level,
1623        source         => $filename_or_string,
1624        start_line_num => $line_num_for_first_line,
1625        n_lines        => $line_num - $line_num_for_first_line,
1626    }, "HeaderLine"
1627
1628A "line" in this context is a logical line, and because of line continuations
1629and comments may contain more than one physical line, and thus more than
1630one newline, but will always include at least one and will always end with one
1631(unless there is no newline at the end of the file). Thus
1632
1633    before /*
1634     this is a comment
1635    */ after \
1636    and continues
1637
1638will be treated as a single logical line even though the content is
1639spread over four lines.
1640
1641=over 4
1642
1643=item cond
1644
1645An array of arrays containing the normalized expressions of any C preprocessor
1646conditional blocks which include the line. Each line has its own copy of the
1647conditions it was operated on currently, but that may change so dont alter
1648this data. The inner arrays may contain more than one element. If so then the
1649line is part of an "#else" or "#elsif" and the clauses should be considered to
1650be a conjuction when considering "when is this line included", however when
1651considered as part of an if/elsif/else, each added clause represents the most
1652recent condition. In the following you can see how:
1653
1654    before          /* cond => [ ]                      */
1655    #if A           /* cond => [ ['A'] ]                */
1656    do-a            /* cond => [ ['A'] ]                */
1657    #elif B         /* cond => [ ['!A', 'B'] ]          */
1658    do-b            /* cond => [ ['!A', 'B'] ]          */
1659    #else           /* cond => [ ['!A', '!B'] ]         */
1660    do-c            /* cond => [ ['!A', '!B'] ]         */
1661    # if D          /* cond => [ ['!A', '!B'], ['D'] ]  */
1662    do-d            /* cond => [ ['!A', '!B'], ['D'] ]  */
1663    # endif         /* cond => [ ['!A', '!B'], ['D'] ]  */
1664    #endif          /* cond => [ ['!A', '!B'] ]         */
1665    after           /* cond => [ ]                      */
1666
1667So in the above we can see how the three clauses of the if produce
1668a single "frame" in the cond array, but that frame "grows" and changes
1669as additional else clauses are added. When an entirely new if block
1670is started (D) it gets its own block. Each endif includes the clause
1671it terminates.
1672
1673=item type
1674
1675This value indicates the type of the line. This may be one of the following:
1676'content', 'cond', 'define', 'include' and 'error'. Several of the types
1677have a sub_type.
1678
1679=item sub_type
1680
1681This value gives more detail on the type of the line where necessary.
1682Not all types have a subtype.
1683
1684    Type    | Sub Type
1685    --------+----------
1686    content | text
1687            | include
1688            | define
1689            | error
1690    cond    | #if
1691            | #elif
1692            | #else
1693            | #endif
1694
1695Note that there are no '#ifdef' or '#elifndef' or similar expressions. All
1696expressions of that form are normalized into the '#if defined' form to
1697simplify processing.
1698
1699=item raw
1700
1701This was the raw original text before HeaderParser performed any modifications
1702to it.
1703
1704=item line
1705
1706This is the normalized and modified text after HeaderParser or any callbacks
1707have processed it.
1708
1709=item level
1710
1711This is the "indent level" of a line and corresponds to the number of blocks
1712that the line is within, not including any blocks that might be created by
1713the line itself.
1714
1715    before          /* level => 0 */
1716    #if A           /* level => 0 */
1717    do-a            /* level => 1 */
1718    #elif B         /* level => 0 */
1719    do-b            /* level => 1 */
1720    #else           /* level => 0 */
1721    do-c            /* level => 1 */
1722    # if D          /* level => 1 */
1723    do-d            /* level => 2 */
1724    # endif         /* level => 1 */
1725    #endif          /* level => 0 */
1726    after           /* level => 0 */
1727
1728=back
1729
1730parse_fh() will throw an exception if it encounters a malformed expression
1731or input it cannot handle.
1732
1733=head2 lines_as_str
1734
1735This function will return a string representation of the lines it is provided.
1736
1737=head2 group_content
1738
1739This function will group the text in the file by the conditions which contain
1740it. This is only useful for files where the content is essentially a list and
1741where changing the order that lines are output in will not break the resulting
1742file.
1743
1744Each content line will be grouped into a structure of nested if/else blocks
1745(elif will produce a new nested block) such that the content under the control
1746of a given set of normalized condition clauses are grouped together in the order
1747the occurred in the file, such that each combined conditional clause is output
1748only once.
1749
1750This means a file like this:
1751
1752    #if A
1753    A
1754    #elif K
1755    AK
1756    #else
1757    ZA
1758    #endif
1759    #if B && Q
1760    B
1761    #endif
1762    #if Q && B
1763    BC
1764    #endif
1765    #if A
1766    AD
1767    #endif
1768    #if !A
1769    ZZ
1770    #endif
1771
1772Will end up looking roughly like this:
1773
1774    #if A
1775    A
1776    AD
1777    #else
1778    ZZ
1779    # if K
1780    AK
1781    # else
1782    ZA
1783    # endif
1784    #endif
1785    #if B && Q
1786    B
1787    BC
1788    #endif
1789
1790Content at a given block level always goes before conditional clauses
1791at the same nesting level.
1792
1793=head2 HOOKS
1794
1795There are severals hooks that are available, C<pre_process_content> and
1796C<post_process_content>, and C<post_process_grouped_content>. All of these
1797hooks  will be called with the HeaderParser object as the first argument.
1798The "process_content" callbacks will be called with a line hash as the second
1799argument, and C<post_process_grouped_content> will be called with an
1800array of line hashes for the content in that group, so that the array may be
1801modified or sorted.  Callbacks called from inside of C<group_content()>
1802(that is C<post_process_content> and C<post_process_grouped_content> will be
1803called with an additional argument containing and array specifying the actual
1804conditional "path" to the content  (which may differ somewhat from the data in
1805a lines "cond" property).
1806
1807These hooks may do what they like, but generally they will modify the
1808"line" property of the line hash to change the final output returned
1809by C<lines_as_str()> or C<group_content()>.
1810
1811=head2 FORMATTING AND INDENTING
1812
1813Header parser tries hard to produce neat and readable output with a consistent
1814style and form. For example:
1815
1816    #if defined(FOO)
1817    # define HAS_FOO
1818    # if defined(BAR)
1819    #   define HAS_FOO_AND_BAR
1820    # else /* !defined(BAR) */
1821    #   define HAS_FOO_NO_BAR
1822    # endif /* !defined(BAR) */
1823    #endif /* defined(FOO) */
1824
1825HeaderParser uses two space tab stops for indenting C pre-processor
1826directives. It puts the spaces between the "#" and the directive. The "#" is
1827considered "part" of the indent, even though the space comes after it. This
1828means the first indent level "looks" like one space, and following indents
1829look like 2. This should match what a sensible editor would do with two space
1830tab stops. The C<indent_chars()> method can be used to convert an indent level
1831into a string that contains the appropriate number of spaces to go in between
1832the "#" and the directive.
1833
1834When emitting "#endif", "#elif" and "#else" directives comments will be
1835emitted also to show the conditions that apply. These comments may be wrapped
1836to cover multiple lines. Some effort is made to get these comments to line up
1837visually, but it uses heuristics which may not always produce the best result.
1838
1839=cut
1840