xref: /openbsd/gnu/usr.bin/perl/regen/regcharclass.pl (revision a6445c1d)
1#!perl
2package CharClass::Matcher;
3use strict;
4use 5.008;
5use warnings;
6use warnings FATAL => 'all';
7no warnings 'experimental::autoderef';
8use Data::Dumper;
9$Data::Dumper::Useqq= 1;
10our $hex_fmt= "0x%02X";
11
12sub DEBUG () { 0 }
13$|=1 if DEBUG;
14
15sub ASCII_PLATFORM { (ord('A') == 65) }
16
17require 'regen/regen_lib.pl';
18
19=head1 NAME
20
21CharClass::Matcher -- Generate C macros that match character classes efficiently
22
23=head1 SYNOPSIS
24
25    perl Porting/regcharclass.pl
26
27=head1 DESCRIPTION
28
29Dynamically generates macros for detecting special charclasses
30in latin-1, utf8, and codepoint forms. Macros can be set to return
31the length (in bytes) of the matched codepoint, and/or the codepoint itself.
32
33To regenerate F<regcharclass.h>, run this script from perl-root. No arguments
34are necessary.
35
36Using WHATEVER as an example the following macros can be produced, depending
37on the input parameters (how to get each is described by internal comments at
38the C<__DATA__> line):
39
40=over 4
41
42=item C<is_WHATEVER(s,is_utf8)>
43
44=item C<is_WHATEVER_safe(s,e,is_utf8)>
45
46Do a lookup as appropriate based on the C<is_utf8> flag. When possible
47comparisons involving octect<128 are done before checking the C<is_utf8>
48flag, hopefully saving time.
49
50The version without the C<_safe> suffix should be used only when the input is
51known to be well-formed.
52
53=item C<is_WHATEVER_utf8(s)>
54
55=item C<is_WHATEVER_utf8_safe(s,e)>
56
57Do a lookup assuming the string is encoded in (normalized) UTF8.
58
59The version without the C<_safe> suffix should be used only when the input is
60known to be well-formed.
61
62=item C<is_WHATEVER_latin1(s)>
63
64=item C<is_WHATEVER_latin1_safe(s,e)>
65
66Do a lookup assuming the string is encoded in latin-1 (aka plan octets).
67
68The version without the C<_safe> suffix should be used only when it is known
69that C<s> contains at least one character.
70
71=item C<is_WHATEVER_cp(cp)>
72
73Check to see if the string matches a given codepoint (hypothetically a
74U32). The condition is constructed as to "break out" as early as
75possible if the codepoint is out of range of the condition.
76
77IOW:
78
79  (cp==X || (cp>X && (cp==Y || (cp>Y && ...))))
80
81Thus if the character is X+1 only two comparisons will be done. Making
82matching lookups slower, but non-matching faster.
83
84=item C<what_len_WHATEVER_FOO(arg1, ..., len)>
85
86A variant form of each of the macro types described above can be generated, in
87which the code point is returned by the macro, and an extra parameter (in the
88final position) is added, which is a pointer for the macro to set the byte
89length of the returned code point.
90
91These forms all have a C<what_len> prefix instead of the C<is_>, for example
92C<what_len_WHATEVER_safe(s,e,is_utf8,len)> and
93C<what_len_WHATEVER_utf8(s,len)>.
94
95These forms should not be used I<except> on small sets of mostly widely
96separated code points; otherwise the code generated is inefficient.  For these
97cases, it is best to use the C<is_> forms, and then find the code point with
98C<utf8_to_uvchr_buf>().  This program can fail with a "deep recursion"
99message on the worst of the inappropriate sets.  Examine the generated macro
100to see if it is acceptable.
101
102=item C<what_WHATEVER_FOO(arg1, ...)>
103
104A variant form of each of the C<is_> macro types described above can be generated, in
105which the code point and not the length is returned by the macro.  These have
106the same caveat as L</what_len_WHATEVER_FOO(arg1, ..., len)>, plus they should
107not be used where the set contains a NULL, as 0 is returned for two different
108cases: a) the set doesn't include the input code point; b) the set does
109include it, and it is a NULL.
110
111=back
112
113=head2 CODE FORMAT
114
115perltidy  -st -bt=1 -bbt=0 -pt=0 -sbt=1 -ce -nwls== "%f"
116
117
118=head1 AUTHOR
119
120Author: Yves Orton (demerphq) 2007.  Maintained by Perl5 Porters.
121
122=head1 BUGS
123
124No tests directly here (although the regex engine will fail tests
125if this code is broken). Insufficient documentation and no Getopts
126handler for using the module as a script.
127
128=head1 LICENSE
129
130You may distribute under the terms of either the GNU General Public
131License or the Artistic License, as specified in the README file.
132
133=cut
134
135# Sub naming convention:
136# __func : private subroutine, can not be called as a method
137# _func  : private method, not meant for external use
138# func   : public method.
139
140# private subs
141#-------------------------------------------------------------------------------
142#
143# ($cp,$n,$l,$u)=__uni_latin($str);
144#
145# Return a list of arrays, each of which when interpreted correctly
146# represent the string in some given encoding with specific conditions.
147#
148# $cp - list of codepoints that make up the string.
149# $n  - list of octets that make up the string if all codepoints are invariant
150#       regardless of if the string is in UTF-8 or not.
151# $l  - list of octets that make up the string in latin1 encoding if all
152#       codepoints < 256, and at least one codepoint is UTF-8 variant.
153# $u  - list of octets that make up the string in utf8 if any codepoint is
154#       UTF-8 variant
155#
156#   High CP | Defined
157#-----------+----------
158#   0 - 127 : $n            (127/128 are the values for ASCII platforms)
159# 128 - 255 : $l, $u
160# 256 - ... : $u
161#
162
163sub __uni_latin1 {
164    my $str= shift;
165    my $max= 0;
166    my @cp;
167    my @cp_high;
168    my $only_has_invariants = 1;
169    for my $ch ( split //, $str ) {
170        my $cp= ord $ch;
171        push @cp, $cp;
172        push @cp_high, $cp if $cp > 255;
173        $max= $cp if $max < $cp;
174        if (! ASCII_PLATFORM && $only_has_invariants) {
175            if ($cp > 255) {
176                $only_has_invariants = 0;
177            }
178            else {
179                my $temp = chr($cp);
180                utf8::upgrade($temp);
181                my @utf8 = unpack "U0C*", $temp;
182                $only_has_invariants = (@utf8 == 1 && $utf8[0] == $cp);
183            }
184        }
185    }
186    my ( $n, $l, $u );
187    $only_has_invariants = $max < 128 if ASCII_PLATFORM;
188    if ($only_has_invariants) {
189        $n= [@cp];
190    } else {
191        $l= [@cp] if $max && $max < 256;
192
193        $u= $str;
194        utf8::upgrade($u);
195        $u= [ unpack "U0C*", $u ] if defined $u;
196    }
197    return ( \@cp, \@cp_high, $n, $l, $u );
198}
199
200#
201# $clean= __clean($expr);
202#
203# Cleanup a ternary expression, removing unnecessary parens and apply some
204# simplifications using regexes.
205#
206
207sub __clean {
208    my ( $expr )= @_;
209
210    #return $expr;
211
212    our $parens;
213    $parens= qr/ (?> \( (?> (?: (?> [^()]+ ) | (??{ $parens }) )* ) \) ) /x;
214
215    ## remove redundant parens
216    1 while $expr =~ s/ \( \s* ( $parens ) \s* \) /$1/gx;
217
218
219    # repeatedly simplify conditions like
220    #       ( (cond1) ? ( (cond2) ? X : Y ) : Y )
221    # into
222    #       ( ( (cond1) && (cond2) ) ? X : Y )
223    # Also similarly handles expressions like:
224    #       : (cond1) ? ( (cond2) ? X : Y ) : Y )
225    # Note the inclusion of the close paren in ([:()]) and the open paren in ([()]) is
226    # purely to ensure we have a balanced set of parens in the expression which makes
227    # it easier to understand the pattern in an editor that understands paren's, we do
228    # not expect either of these cases to actually fire. - Yves
229    1 while $expr =~ s/
230        ([:()])  \s*
231            ($parens) \s*
232            \? \s*
233                \( \s* ($parens) \s*
234                    \? \s* ($parens|[^()?:\s]+?) \s*
235                    :  \s* ($parens|[^()?:\s]+?) \s*
236                \) \s*
237            : \s* \5 \s*
238        ([()])
239    /$1 ( $2 && $3 ) ? $4 : $5 $6/gx;
240    #$expr=~s/\(\(U8\*\)s\)\[(\d+)\]/S$1/g if length $expr > 8000;
241    #$expr=~s/\s+//g if length $expr > 8000;
242
243    die "Expression too long" if length $expr > 8000;
244
245    return $expr;
246}
247
248#
249# $text= __macro(@args);
250# Join args together by newlines, and then neatly add backslashes to the end
251# of every  line as expected by the C pre-processor for #define's.
252#
253
254sub __macro {
255    my $str= join "\n", @_;
256    $str =~ s/\s*$//;
257    my @lines= map { s/\s+$//; s/\t/        /g; $_ } split /\n/, $str;
258    my $last= pop @lines;
259    $str= join "\n", ( map { sprintf "%-76s\\", $_ } @lines ), $last;
260    1 while $str =~ s/^(\t*) {8}/$1\t/gm;
261    return $str . "\n";
262}
263
264#
265# my $op=__incrdepth($op);
266#
267# take an 'op' hashref and add one to it and all its childrens depths.
268#
269
270sub __incrdepth {
271    my $op= shift;
272    return unless ref $op;
273    $op->{depth} += 1;
274    __incrdepth( $op->{yes} );
275    __incrdepth( $op->{no} );
276    return $op;
277}
278
279# join two branches of an opcode together with a condition, incrementing
280# the depth on the yes branch when we do so.
281# returns the new root opcode of the tree.
282sub __cond_join {
283    my ( $cond, $yes, $no )= @_;
284    if (ref $yes) {
285        return {
286            test  => $cond,
287            yes   => __incrdepth( $yes ),
288            no    => $no,
289            depth => 0,
290        };
291    }
292    else {
293        return {
294            test  => $cond,
295            yes   => $yes,
296            no    => __incrdepth($no),
297            depth => 0,
298        };
299    }
300}
301
302# Methods
303
304# constructor
305#
306# my $obj=CLASS->new(op=>'SOMENAME',title=>'blah',txt=>[..]);
307#
308# Create a new CharClass::Matcher object by parsing the text in
309# the txt array. Currently applies the following rules:
310#
311# Element starts with C<0x>, line is evaled the result treated as
312# a number which is passed to chr().
313#
314# Element starts with C<">, line is evaled and the result treated
315# as a string.
316#
317# Each string is then stored in the 'strs' subhash as a hash record
318# made up of the results of __uni_latin1, using the keynames
319# 'low','latin1','utf8', as well as the synthesized 'LATIN1', 'high', and
320# 'UTF8' which hold a merge of 'low' and their lowercase equivalents.
321#
322# Size data is tracked per type in the 'size' subhash.
323#
324# Return an object
325#
326sub new {
327    my $class= shift;
328    my %opt= @_;
329    for ( qw(op txt) ) {
330        die "in " . __PACKAGE__ . " constructor '$_;' is a mandatory field"
331          if !exists $opt{$_};
332    }
333
334    my $self= bless {
335        op    => $opt{op},
336        title => $opt{title} || '',
337    }, $class;
338    foreach my $txt ( @{ $opt{txt} } ) {
339        my $str= $txt;
340        if ( $str =~ /^[""]/ ) {
341            $str= eval $str;
342        } elsif ($str =~ / - /x ) { # A range:  Replace this element on the
343                                    # list with its expansion
344            my ($lower, $upper) = $str =~ / 0x (.+?) \s* - \s* 0x (.+) /x;
345            die "Format must be like '0xDEAD - 0xBEAF'; instead was '$str'" if ! defined $lower || ! defined $upper;
346            foreach my $cp (hex $lower .. hex $upper) {
347                push @{$opt{txt}}, sprintf "0x%X", $cp;
348            }
349            next;
350        } elsif ($str =~ s/ ^ N (?= 0x ) //x ) {
351            # Otherwise undocumented, a leading N means is already in the
352            # native character set; don't convert.
353            $str= chr eval $str;
354        } elsif ( $str =~ /^0x/ ) {
355            $str= eval $str;
356
357            # Convert from Unicode/ASCII to native, if necessary
358            $str = utf8::unicode_to_native($str) if ! ASCII_PLATFORM
359                                                    && $str <= 0xFF;
360            $str = chr $str;
361        } elsif ( $str =~ / \s* \\p \{ ( .*? ) \} /x) {
362            my $property = $1;
363            use Unicode::UCD qw(prop_invlist);
364
365            my @invlist = prop_invlist($property, '_perl_core_internal_ok');
366            if (! @invlist) {
367
368                # An empty return could mean an unknown property, or merely
369                # that it is empty.  Call in scalar context to differentiate
370                my $count = prop_invlist($property, '_perl_core_internal_ok');
371                die "$property not found" unless defined $count;
372            }
373
374            # Replace this element on the list with the property's expansion
375            for (my $i = 0; $i < @invlist; $i += 2) {
376                foreach my $cp ($invlist[$i] .. $invlist[$i+1] - 1) {
377
378                    # prop_invlist() returns native values; add leading 'N'
379                    # to indicate that.
380                    push @{$opt{txt}}, sprintf "N0x%X", $cp;
381                }
382            }
383            next;
384        } elsif ($str =~ / ^ do \s+ ( .* ) /x) {
385            die "do '$1' failed: $!$@" if ! do $1 or $@;
386            next;
387        } elsif ($str =~ / ^ & \s* ( .* ) /x) { # user-furnished sub() call
388            my @results = eval "$1";
389            die "eval '$1' failed: $@" if $@;
390            push @{$opt{txt}}, @results;
391            next;
392        } else {
393            die "Unparsable line: $txt\n";
394        }
395        my ( $cp, $cp_high, $low, $latin1, $utf8 )= __uni_latin1( $str );
396        my $UTF8= $low   || $utf8;
397        my $LATIN1= $low || $latin1;
398        my $high = (scalar grep { $_ < 256 } @$cp) ? 0 : $utf8;
399        #die Dumper($txt,$cp,$low,$latin1,$utf8)
400        #    if $txt=~/NEL/ or $utf8 and @$utf8>3;
401
402        @{ $self->{strs}{$str} }{qw( str txt low utf8 latin1 high cp cp_high UTF8 LATIN1 )}=
403          ( $str, $txt, $low, $utf8, $latin1, $high, $cp, $cp_high, $UTF8, $LATIN1 );
404        my $rec= $self->{strs}{$str};
405        foreach my $key ( qw(low utf8 latin1 high cp cp_high UTF8 LATIN1) ) {
406            $self->{size}{$key}{ 0 + @{ $self->{strs}{$str}{$key} } }++
407              if $self->{strs}{$str}{$key};
408        }
409        $self->{has_multi} ||= @$cp > 1;
410        $self->{has_ascii} ||= $latin1 && @$latin1;
411        $self->{has_low}   ||= $low && @$low;
412        $self->{has_high}  ||= !$low && !$latin1;
413    }
414    $self->{val_fmt}= $hex_fmt;
415    $self->{count}= 0 + keys %{ $self->{strs} };
416    return $self;
417}
418
419# my $trie = make_trie($type,$maxlen);
420#
421# using the data stored in the object build a trie of a specific type,
422# and with specific maximum depth. The trie is made up the elements of
423# the given types array for each string in the object (assuming it is
424# not too long.)
425#
426# returns the trie, or undef if there was no relevant data in the object.
427#
428
429sub make_trie {
430    my ( $self, $type, $maxlen )= @_;
431
432    my $strs= $self->{strs};
433    my %trie;
434    foreach my $rec ( values %$strs ) {
435        die "panic: unknown type '$type'"
436          if !exists $rec->{$type};
437        my $dat= $rec->{$type};
438        next unless $dat;
439        next if $maxlen && @$dat > $maxlen;
440        my $node= \%trie;
441        foreach my $elem ( @$dat ) {
442            $node->{$elem} ||= {};
443            $node= $node->{$elem};
444        }
445        $node->{''}= $rec->{str};
446    }
447    return 0 + keys( %trie ) ? \%trie : undef;
448}
449
450sub pop_count ($) {
451    my $word = shift;
452
453    # This returns a list of the positions of the bits in the input word that
454    # are 1.
455
456    my @positions;
457    my $position = 0;
458    while ($word) {
459        push @positions, $position if $word & 1;
460        $position++;
461        $word >>= 1;
462    }
463    return @positions;
464}
465
466# my $optree= _optree()
467#
468# recursively convert a trie to an optree where every node represents
469# an if else branch.
470#
471#
472
473sub _optree {
474    my ( $self, $trie, $test_type, $ret_type, $else, $depth )= @_;
475    return unless defined $trie;
476    if ( $self->{has_multi} and $ret_type =~ /cp|both/ ) {
477        die "Can't do 'cp' optree from multi-codepoint strings";
478    }
479    $ret_type ||= 'len';
480    $else= 0  unless defined $else;
481    $depth= 0 unless defined $depth;
482
483    # if we have an empty string as a key it means we are in an
484    # accepting state and unless we can match further on should
485    # return the value of the '' key.
486    if (exists $trie->{''} ) {
487        # we can now update the "else" value, anything failing to match
488        # after this point should return the value from this.
489        if ( $ret_type eq 'cp' ) {
490            $else= $self->{strs}{ $trie->{''} }{cp}[0];
491            $else= sprintf "$self->{val_fmt}", $else if $else > 9;
492        } elsif ( $ret_type eq 'len' ) {
493            $else= $depth;
494        } elsif ( $ret_type eq 'both') {
495            $else= $self->{strs}{ $trie->{''} }{cp}[0];
496            $else= sprintf "$self->{val_fmt}", $else if $else > 9;
497            $else= "len=$depth, $else";
498        }
499    }
500    # extract the meaningful keys from the trie, filter out '' as
501    # it means we are an accepting state (end of sequence).
502    my @conds= sort { $a <=> $b } grep { length $_ } keys %$trie;
503
504    # if we haven't any keys there is no further we can match and we
505    # can return the "else" value.
506    return $else if !@conds;
507
508    # Assuming Perl is being released from an ASCII platform, the below makes
509    # it work for non-UTF-8 out-of-the box when porting to non-ASCII, by
510    # adding a translation back to ASCII.  This is the wrong thing to do for
511    # UTF-EBCDIC, as that is different from UTF-8.  But the intent here is
512    # that this regen should be run on the target system, which will omit the
513    # translation, and generate the correct UTF-EBCDIC.  On ASCII systems, the
514    # translation macros expand to just their argument, so there is no harm
515    # done nor performance penalty by including them.
516    my $test;
517    if ($test_type =~ /^cp/) {
518        $test = "cp";
519        $test = "NATIVE_TO_UNI($test)" if ASCII_PLATFORM;
520    }
521    else {
522        $test = "((U8*)s)[$depth]";
523        $test = "NATIVE_TO_LATIN1($test)" if ASCII_PLATFORM;
524    }
525
526    # first we loop over the possible keys/conditions and find out what they
527    # look like; we group conditions with the same optree together.
528    my %dmp_res;
529    my @res_order;
530    local $Data::Dumper::Sortkeys=1;
531    foreach my $cond ( @conds ) {
532
533        # get the optree for this child/condition
534        my $res= $self->_optree( $trie->{$cond}, $test_type, $ret_type, $else, $depth + 1 );
535        # convert it to a string with Dumper
536        my $res_code= Dumper( $res );
537
538        push @{$dmp_res{$res_code}{vals}}, $cond;
539        if (!$dmp_res{$res_code}{optree}) {
540            $dmp_res{$res_code}{optree}= $res;
541            push @res_order, $res_code;
542        }
543    }
544
545    # now that we have deduped the optrees we construct a new optree containing the merged
546    # results.
547    my %root;
548    my $node= \%root;
549    foreach my $res_code_idx (0 .. $#res_order) {
550        my $res_code= $res_order[$res_code_idx];
551        $node->{vals}= $dmp_res{$res_code}{vals};
552        $node->{test}= $test;
553        $node->{yes}= $dmp_res{$res_code}{optree};
554        $node->{depth}= $depth;
555        if ($res_code_idx < $#res_order) {
556            $node= $node->{no}= {};
557        } else {
558            $node->{no}= $else;
559        }
560    }
561
562    # return the optree.
563    return \%root;
564}
565
566# my $optree= optree(%opts);
567#
568# Convert a trie to an optree, wrapper for _optree
569
570sub optree {
571    my $self= shift;
572    my %opt= @_;
573    my $trie= $self->make_trie( $opt{type}, $opt{max_depth} );
574    $opt{ret_type} ||= 'len';
575    my $test_type= $opt{type} =~ /^cp/ ? 'cp' : 'depth';
576    return $self->_optree( $trie, $test_type, $opt{ret_type}, $opt{else}, 0 );
577}
578
579# my $optree= generic_optree(%opts);
580#
581# build a "generic" optree out of the three 'low', 'latin1', 'utf8'
582# sets of strings, including a branch for handling the string type check.
583#
584
585sub generic_optree {
586    my $self= shift;
587    my %opt= @_;
588
589    $opt{ret_type} ||= 'len';
590    my $test_type= 'depth';
591    my $else= $opt{else} || 0;
592
593    my $latin1= $self->make_trie( 'latin1', $opt{max_depth} );
594    my $utf8= $self->make_trie( 'utf8',     $opt{max_depth} );
595
596    $_= $self->_optree( $_, $test_type, $opt{ret_type}, $else, 0 )
597      for $latin1, $utf8;
598
599    if ( $utf8 ) {
600        $else= __cond_join( "( is_utf8 )", $utf8, $latin1 || $else );
601    } elsif ( $latin1 ) {
602        $else= __cond_join( "!( is_utf8 )", $latin1, $else );
603    }
604    if ($opt{type} eq 'generic') {
605        my $low= $self->make_trie( 'low', $opt{max_depth} );
606        if ( $low ) {
607            $else= $self->_optree( $low, $test_type, $opt{ret_type}, $else, 0 );
608        }
609    }
610
611    return $else;
612}
613
614# length_optree()
615#
616# create a string length guarded optree.
617#
618
619sub length_optree {
620    my $self= shift;
621    my %opt= @_;
622    my $type= $opt{type};
623
624    die "Can't do a length_optree on type 'cp', makes no sense."
625      if $type =~ /^cp/;
626
627    my $else= ( $opt{else} ||= 0 );
628
629    my $method = $type =~ /generic/ ? 'generic_optree' : 'optree';
630    if ($method eq 'optree' && scalar keys %{$self->{size}{$type}} == 1) {
631
632        # Here is non-generic output (meaning that we are only generating one
633        # type), and all things that match have the same number ('size') of
634        # bytes.  The length guard is simply that we have that number of
635        # bytes.
636        my @size = keys %{$self->{size}{$type}};
637        my $cond= "((e) - (s)) >= $size[0]";
638        my $optree = $self->$method(%opt);
639        $else= __cond_join( $cond, $optree, $else );
640    }
641    elsif ($self->{has_multi}) {
642        my @size;
643
644        # Here, there can be a match of a multiple character string.  We use
645        # the traditional method which is to have a branch for each possible
646        # size (longest first) and test for the legal values for that size.
647        my %sizes= (
648            %{ $self->{size}{low}    || {} },
649            %{ $self->{size}{latin1} || {} },
650            %{ $self->{size}{utf8}   || {} }
651        );
652        if ($method eq 'generic_optree') {
653            @size= sort { $a <=> $b } keys %sizes;
654        } else {
655            @size= sort { $a <=> $b } keys %{ $self->{size}{$type} };
656        }
657        for my $size ( @size ) {
658            my $optree= $self->$method( %opt, type => $type, max_depth => $size );
659            my $cond= "((e)-(s) > " . ( $size - 1 ).")";
660            $else= __cond_join( $cond, $optree, $else );
661        }
662    }
663    else {
664        my $utf8;
665
666        # Here, has more than one possible size, and only matches a single
667        # character.  For non-utf8, the needed length is 1; for utf8, it is
668        # found by array lookup 'UTF8SKIP'.
669
670        # If want just the code points above 255, set up to look for those;
671        # otherwise assume will be looking for all non-UTF-8-invariant code
672        # poiints.
673        my $trie_type = ($type eq 'high') ? 'high' : 'utf8';
674
675        # If we do want more than the 0-255 range, find those, and if they
676        # exist...
677        if ($opt{type} !~ /latin1/i && ($utf8 = $self->make_trie($trie_type, 0))) {
678
679            # ... get them into an optree, and set them up as the 'else' clause
680            $utf8 = $self->_optree( $utf8, 'depth', $opt{ret_type}, 0, 0 );
681
682            # We could make this
683            #   UTF8_IS_START(*s) && ((e) - (s)) >= UTF8SKIP(s))";
684            # to avoid doing the UTF8SKIP and subsequent branches for invariants
685            # that don't match.  But the current macros that get generated
686            # have only a few things that can match past this, so I (khw)
687            # don't think it is worth it.  (Even better would be to use
688            # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
689            # if it saves a bunch.
690            my $cond = "(((e) - (s)) >= UTF8SKIP(s))";
691            $else = __cond_join($cond, $utf8, $else);
692
693            # For 'generic', we also will want the latin1 UTF-8 variants for
694            # the case where the input isn't UTF-8.
695            my $latin1;
696            if ($method eq 'generic_optree') {
697                $latin1 = $self->make_trie( 'latin1', 1);
698                $latin1= $self->_optree( $latin1, 'depth', $opt{ret_type}, 0, 0 );
699            }
700
701            # If we want the UTF-8 invariants, get those.
702            my $low;
703            if ($opt{type} !~ /non_low|high/
704                && ($low= $self->make_trie( 'low', 1)))
705            {
706                $low= $self->_optree( $low, 'depth', $opt{ret_type}, 0, 0 );
707
708                # Expand out the UTF-8 invariants as a string so that we
709                # can use them as the conditional
710                $low = $self->_cond_as_str( $low, 0, \%opt);
711
712                # If there are Latin1 variants, add a test for them.
713                if ($latin1) {
714                    $else = __cond_join("(! is_utf8 )", $latin1, $else);
715                }
716                elsif ($method eq 'generic_optree') {
717
718                    # Otherwise for 'generic' only we know that what
719                    # follows must be valid for just UTF-8 strings,
720                    $else->{test} = "( is_utf8 && $else->{test} )";
721                }
722
723                # If the invariants match, we are done; otherwise we have
724                # to go to the 'else' clause.
725                $else = __cond_join($low, 1, $else);
726            }
727            elsif ($latin1) {   # Here, didn't want or didn't have invariants,
728                                # but we do have latin variants
729                $else = __cond_join("(! is_utf8)", $latin1, $else);
730            }
731
732            # We need at least one byte available to start off the tests
733            $else = __cond_join("((e) > (s))", $else, 0);
734        }
735        else {  # Here, we don't want or there aren't any variants.  A single
736                # byte available is enough.
737            my $cond= "((e) > (s))";
738            my $optree = $self->$method(%opt);
739            $else= __cond_join( $cond, $optree, $else );
740        }
741    }
742
743    return $else;
744}
745
746sub calculate_mask(@) {
747    # Look at the input list of byte values.  This routine returns an array of
748    # mask/base pairs to generate that list.
749
750    my @list = @_;
751    my $list_count = @list;
752
753    # Consider a set of byte values, A, B, C ....  If we want to determine if
754    # <c> is one of them, we can write c==A || c==B || c==C ....  If the
755    # values are consecutive, we can shorten that to A<=c && c<=Z, which uses
756    # far fewer branches.  If only some of them are consecutive we can still
757    # save some branches by creating range tests for just those that are
758    # consecutive. _cond_as_str() does this work for looking for ranges.
759    #
760    # Another approach is to look at the bit patterns for A, B, C .... and see
761    # if they have some commonalities.  That's what this function does.  For
762    # example, consider a set consisting of the bytes
763    # 0xF0, 0xF1, 0xF2, and 0xF3.  We could write:
764    #   0xF0 <= c && c <= 0xF4
765    # But the following mask/compare also works, and has just one test:
766    #   (c & 0xFC) == 0xF0
767    # The reason it works is that the set consists of exactly those bytes
768    # whose first 4 bits are 1, and the next two are 0.  (The value of the
769    # other 2 bits is immaterial in determining if a byte is in the set or
770    # not.)  The mask masks out those 2 irrelevant bits, and the comparison
771    # makes sure that the result matches all bytes which match those 6
772    # material bits exactly.  In other words, the set of bytes contains
773    # exactly those whose bottom two bit positions are either 0 or 1.  The
774    # same principle applies to bit positions that are not necessarily
775    # adjacent.  And it can be applied to bytes that differ in 1 through all 8
776    # bit positions.  In order to be a candidate for this optimization, the
777    # number of bytes in the set must be a power of 2.
778    #
779    # Consider a different example, the set 0x53, 0x54, 0x73, and 0x74.  That
780    # requires 4 tests using either ranges or individual values, and even
781    # though the number in the set is a power of 2, it doesn't qualify for the
782    # mask optimization described above because the number of bits that are
783    # different is too large for that.  However, the set can be expressed as
784    # two branches with masks thusly:
785    #   (c & 0xDF) == 0x53 || (c & 0xDF) == 0x54
786    # a branch savings of 50%.  This is done by splitting the set into two
787    # subsets each of which has 2 elements, and within each set the values
788    # differ by 1 byte.
789    #
790    # This function attempts to find some way to save some branches using the
791    # mask technique.  If not, it returns an empty list; if so, it
792    # returns a list consisting of
793    #   [ [compare1, mask1], [compare2, mask2], ...
794    #     [compare_n, undef], [compare_m, undef], ...
795    #   ]
796    # The <mask> is undef in the above for those bytes that must be tested
797    # for individually.
798    #
799    # This function does not attempt to find the optimal set.  To do so would
800    # probably require testing all possible combinations, and keeping track of
801    # the current best one.
802    #
803    # There are probably much better algorithms, but this is the one I (khw)
804    # came up with.  We start with doing a bit-wise compare of every byte in
805    # the set with every other byte.  The results are sorted into arrays of
806    # all those that differ by the same bit positions.  These are stored in a
807    # hash with the each key being the bits they differ in.  Here is the hash
808    # for the 0x53, 0x54, 0x73, 0x74 set:
809    # {
810    #    4 => {
811    #            "0,1,2,5" => [
812    #                            83,
813    #                            116,
814    #                            84,
815    #                            115
816    #                        ]
817    #        },
818    #    3 => {
819    #            "0,1,2" => [
820    #                        83,
821    #                        84,
822    #                        115,
823    #                        116
824    #                        ]
825    #        }
826    #    1 => {
827    #            5 => [
828    #                    83,
829    #                    115,
830    #                    84,
831    #                    116
832    #                ]
833    #        },
834    # }
835    #
836    # The set consisting of values which differ in the 4 bit positions 0, 1,
837    # 2, and 5 from some other value in the set consists of all 4 values.
838    # Likewise all 4 values differ from some other value in the 3 bit
839    # positions 0, 1, and 2; and all 4 values differ from some other value in
840    # the single bit position 5.  The keys at the uppermost level in the above
841    # hash, 1, 3, and 4, give the number of bit positions that each sub-key
842    # below it has.  For example, the 4 key could have as its value an array
843    # consisting of "0,1,2,5", "0,1,2,6", and "3,4,6,7", if the inputs were
844    # such.  The best optimization will group the most values into a single
845    # mask.  The most values will be the ones that differ in the most
846    # positions, the ones with the largest value for the topmost key.  These
847    # keys, are thus just for convenience of sorting by that number, and do
848    # not have any bearing on the core of the algorithm.
849    #
850    # We start with an element from largest number of differing bits.  The
851    # largest in this case is 4 bits, and there is only one situation in this
852    # set which has 4 differing bits, "0,1,2,5".  We look for any subset of
853    # this set which has 16 values that differ in these 4 bits.  There aren't
854    # any, because there are only 4 values in the entire set.  We then look at
855    # the next possible thing, which is 3 bits differing in positions "0,1,2".
856    # We look for a subset that has 8 values that differ in these 3 bits.
857    # Again there are none.  So we go to look for the next possible thing,
858    # which is a subset of 2**1 values that differ only in bit position 5.  83
859    # and 115 do, so we calculate a mask and base for those and remove them
860    # from every set.  Since there is only the one set remaining, we remove
861    # them from just this one.  We then look to see if there is another set of
862    # 2 values that differ in bit position 5.  84 and 116 do, so we calculate
863    # a mask and base for those and remove them from every set (again only
864    # this set remains in this example).  The set is now empty, and there are
865    # no more sets to look at, so we are done.
866
867    if ($list_count == 256) {   # All 256 is trivially masked
868        return (0, 0);
869    }
870
871    my %hash;
872
873    # Generate bits-differing lists for each element compared against each
874    # other element
875    for my $i (0 .. $list_count - 2) {
876        for my $j ($i + 1 .. $list_count - 1) {
877            my @bits_that_differ = pop_count($list[$i] ^ $list[$j]);
878            my $differ_count = @bits_that_differ;
879            my $key = join ",", @bits_that_differ;
880            push @{$hash{$differ_count}{$key}}, $list[$i] unless grep { $_ == $list[$i] } @{$hash{$differ_count}{$key}};
881            push @{$hash{$differ_count}{$key}}, $list[$j];
882        }
883    }
884
885    print STDERR __LINE__, ": calculate_mask() called:  List of values grouped by differing bits: ", Dumper \%hash if DEBUG;
886
887    my @final_results;
888    foreach my $count (reverse sort { $a <=> $b } keys %hash) {
889        my $need = 2 ** $count;     # Need 8 values for 3 differing bits, etc
890        foreach my $bits (sort keys $hash{$count}) {
891
892            print STDERR __LINE__, ": For $count bit(s) difference ($bits), need $need; have ", scalar @{$hash{$count}{$bits}}, "\n" if DEBUG;
893
894            # Look only as long as there are at least as many elements in the
895            # subset as are needed
896            while ((my $cur_count = @{$hash{$count}{$bits}}) >= $need) {
897
898                print STDERR __LINE__, ": Looking at bit positions ($bits): ", Dumper $hash{$count}{$bits} if DEBUG;
899
900                # Start with the first element in it
901                my $try_base = $hash{$count}{$bits}[0];
902                my @subset = $try_base;
903
904                # If it succeeds, we return a mask and a base to compare
905                # against the masked value.  That base will be the AND of
906                # every element in the subset.  Initialize to the one element
907                # we have so far.
908                my $compare = $try_base;
909
910                # We are trying to find a subset of this that has <need>
911                # elements that differ in the bit positions given by the
912                # string $bits, which is comma separated.
913                my @bits = split ",", $bits;
914
915                TRY: # Look through the remainder of the list for other
916                     # elements that differ only by these bit positions.
917
918                for (my $i = 1; $i < $cur_count; $i++) {
919                    my $try_this = $hash{$count}{$bits}[$i];
920                    my @positions = pop_count($try_base ^ $try_this);
921
922                    print STDERR __LINE__, ": $try_base vs $try_this: is (", join(',', @positions), ") a subset of ($bits)?" if DEBUG;;
923
924                    foreach my $pos (@positions) {
925                        unless (grep { $pos == $_ } @bits) {
926                            print STDERR "  No\n" if DEBUG;
927                            my $remaining = $cur_count - $i - 1;
928                            if ($remaining && @subset + $remaining < $need) {
929                                print STDERR __LINE__, ": Can stop trying $try_base, because even if all the remaining $remaining values work, they wouldn't add up to the needed $need when combined with the existing ", scalar @subset, " ones\n" if DEBUG;
930                                last TRY;
931                            }
932                            next TRY;
933                        }
934                    }
935
936                    print STDERR "  Yes\n" if DEBUG;
937                    push @subset, $try_this;
938
939                    # Add this to the mask base, in case it ultimately
940                    # succeeds,
941                    $compare &= $try_this;
942                }
943
944                print STDERR __LINE__, ": subset (", join(", ", @subset), ") has ", scalar @subset, " elements; needs $need\n" if DEBUG;
945
946                if (@subset < $need) {
947                    shift @{$hash{$count}{$bits}};
948                    next;   # Try with next value
949                }
950
951                # Create the mask
952                my $mask = 0;
953                foreach my $position (@bits) {
954                    $mask |= 1 << $position;
955                }
956                $mask = ~$mask & 0xFF;
957                push @final_results, [$compare, $mask];
958
959                printf STDERR "%d: Got it: compare=%d=0x%X; mask=%X\n", __LINE__, $compare, $compare, $mask if DEBUG;
960
961                # These values are now spoken for.  Remove them from future
962                # consideration
963                foreach my $remove_count (sort keys %hash) {
964                    foreach my $bits (sort keys %{$hash{$remove_count}}) {
965                        foreach my $to_remove (@subset) {
966                            @{$hash{$remove_count}{$bits}} = grep { $_ != $to_remove } @{$hash{$remove_count}{$bits}};
967                        }
968                    }
969                }
970            }
971        }
972    }
973
974    # Any values that remain in the list are ones that have to be tested for
975    # individually.
976    my @individuals;
977    foreach my $count (reverse sort { $a <=> $b } keys %hash) {
978        foreach my $bits (sort keys $hash{$count}) {
979            foreach my $remaining (@{$hash{$count}{$bits}}) {
980
981                # If we already know about this value, just ignore it.
982                next if grep { $remaining == $_ } @individuals;
983
984                # Otherwise it needs to be returned as something to match
985                # individually
986                push @final_results, [$remaining, undef];
987                push @individuals, $remaining;
988            }
989        }
990    }
991
992    # Sort by increasing numeric value
993    @final_results = sort { $a->[0] <=> $b->[0] } @final_results;
994
995    print STDERR __LINE__, ": Final return: ", Dumper \@final_results if DEBUG;
996
997    return @final_results;
998}
999
1000# _cond_as_str
1001# turn a list of conditions into a text expression
1002# - merges ranges of conditions, and joins the result with ||
1003sub _cond_as_str {
1004    my ( $self, $op, $combine, $opts_ref )= @_;
1005    my $cond= $op->{vals};
1006    my $test= $op->{test};
1007    my $is_cp_ret = $opts_ref->{ret_type} eq "cp";
1008    return "( $test )" if !defined $cond;
1009
1010    # rangify the list.
1011    my @ranges;
1012    my $Update= sub {
1013        # We skip this if there are optimizations that
1014        # we can apply (below) to the individual ranges
1015        if ( ($is_cp_ret || $combine) && @ranges && ref $ranges[-1]) {
1016            if ( $ranges[-1][0] == $ranges[-1][1] ) {
1017                $ranges[-1]= $ranges[-1][0];
1018            } elsif ( $ranges[-1][0] + 1 == $ranges[-1][1] ) {
1019                $ranges[-1]= $ranges[-1][0];
1020                push @ranges, $ranges[-1] + 1;
1021            }
1022        }
1023    };
1024    for my $condition ( @$cond ) {
1025        if ( !@ranges || $condition != $ranges[-1][1] + 1 ) {
1026            $Update->();
1027            push @ranges, [ $condition, $condition ];
1028        } else {
1029            $ranges[-1][1]++;
1030        }
1031    }
1032    $Update->();
1033
1034    return $self->_combine( $test, @ranges )
1035      if $combine;
1036
1037    if ($is_cp_ret) {
1038        @ranges= map {
1039            ref $_
1040            ? sprintf(
1041                "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1042                @$_ )
1043            : sprintf( "$self->{val_fmt} == $test", $_ );
1044        } @ranges;
1045
1046        return "( " . join( " || ", @ranges ) . " )";
1047    }
1048
1049    # If the input set has certain characteristics, we can optimize tests
1050    # for it.  This doesn't apply if returning the code point, as we want
1051    # each element of the set individually.  The code above is for this
1052    # simpler case.
1053
1054    return 1 if @$cond == 256;  # If all bytes match, is trivially true
1055
1056    my @masks;
1057    if (@ranges > 1) {
1058
1059        # See if the entire set shares optimizable characteristics, and if so,
1060        # return the optimization.  We delay checking for this on sets with
1061        # just a single range, as there may be better optimizations available
1062        # in that case.
1063        @masks = calculate_mask(@$cond);
1064
1065        # Stringify the output of calculate_mask()
1066        if (@masks) {
1067            my @return;
1068            foreach my $mask_ref (@masks) {
1069                if (defined $mask_ref->[1]) {
1070                    push @return, sprintf "( ( $test & $self->{val_fmt} ) == $self->{val_fmt} )", $mask_ref->[1], $mask_ref->[0];
1071                }
1072                else {  # An undefined mask means to use the value as-is
1073                    push @return, sprintf "$test == $self->{val_fmt}", $mask_ref->[0];
1074                }
1075            }
1076
1077            # The best possible case below for specifying this set of values via
1078            # ranges is 1 branch per range.  If our mask method yielded better
1079            # results, there is no sense trying something that is bound to be
1080            # worse.
1081            if (@return < @ranges) {
1082                return "( " . join( " || ", @return ) . " )";
1083            }
1084
1085            @masks = @return;
1086        }
1087    }
1088
1089    # Here, there was no entire-class optimization that was clearly better
1090    # than doing things by ranges.  Look at each range.
1091    my $range_count_extra = 0;
1092    for (my $i = 0; $i < @ranges; $i++) {
1093        if (! ref $ranges[$i]) {    # Trivial case: no range
1094            $ranges[$i] = sprintf "$self->{val_fmt} == $test", $ranges[$i];
1095        }
1096        elsif ($ranges[$i]->[0] == $ranges[$i]->[1]) {
1097            $ranges[$i] =           # Trivial case: single element range
1098                    sprintf "$self->{val_fmt} == $test", $ranges[$i]->[0];
1099        }
1100        else {
1101            my $output = "";
1102
1103            # Well-formed UTF-8 continuation bytes on ascii platforms must be
1104            # in the range 0x80 .. 0xBF.  If we know that the input is
1105            # well-formed (indicated by not trying to be 'safe'), we can omit
1106            # tests that verify that the input is within either of these
1107            # bounds.  (No legal UTF-8 character can begin with anything in
1108            # this range, so we don't have to worry about this being a
1109            # continuation byte or not.)
1110            if (ASCII_PLATFORM
1111                && ! $opts_ref->{safe}
1112                && $opts_ref->{type} =~ / ^ (?: utf8 | high ) $ /xi)
1113            {
1114                my $lower_limit_is_80 = ($ranges[$i]->[0] == 0x80);
1115                my $upper_limit_is_BF = ($ranges[$i]->[1] == 0xBF);
1116
1117                # If the range is the entire legal range, it matches any legal
1118                # byte, so we can omit both tests.  (This should happen only
1119                # if the number of ranges is 1.)
1120                if ($lower_limit_is_80 && $upper_limit_is_BF) {
1121                    return 1;
1122                }
1123                elsif ($lower_limit_is_80) { # Just use the upper limit test
1124                    $output = sprintf("( $test <= $self->{val_fmt} )",
1125                                        $ranges[$i]->[1]);
1126                }
1127                elsif ($upper_limit_is_BF) { # Just use the lower limit test
1128                    $output = sprintf("( $test >= $self->{val_fmt} )",
1129                                    $ranges[$i]->[0]);
1130                }
1131            }
1132
1133            # If we didn't change to omit a test above, see if the number of
1134            # elements is a power of 2 (only a single bit in the
1135            # representation of its count will be set) and if so, it may be
1136            # that a mask/compare optimization is possible.
1137            if ($output eq ""
1138                && pop_count($ranges[$i]->[1] - $ranges[$i]->[0] + 1) == 1)
1139            {
1140                my @list;
1141                push @list, $_  for ($ranges[$i]->[0] .. $ranges[$i]->[1]);
1142                my @this_masks = calculate_mask(@list);
1143
1144                # Use the mask if there is just one for the whole range.
1145                # Otherwise there is no savings over the two branches that can
1146                # define the range.
1147                if (@this_masks == 1 && defined $this_masks[0][1]) {
1148                    $output = sprintf "( $test & $self->{val_fmt} ) == $self->{val_fmt}", $this_masks[0][1], $this_masks[0][0];
1149                }
1150            }
1151
1152            if ($output ne "") {  # Prefer any optimization
1153                $ranges[$i] = $output;
1154            }
1155            else {
1156                # No optimization happened.  We need a test that the code
1157                # point is within both bounds.  But, if the bounds are
1158                # adjacent code points, it is cleaner to say
1159                # 'first == test || second == test'
1160                # than it is to say
1161                # 'first <= test && test <= second'
1162
1163                $range_count_extra++;   # This range requires 2 branches to
1164                                        # represent
1165                if ($ranges[$i]->[0] + 1 == $ranges[$i]->[1]) {
1166                    $ranges[$i] = "( "
1167                                .  join( " || ", ( map
1168                                    { sprintf "$self->{val_fmt} == $test", $_ }
1169                                    @{$ranges[$i]} ) )
1170                                . " )";
1171                }
1172                else {  # Full bounds checking
1173                    $ranges[$i] = sprintf("( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )", $ranges[$i]->[0], $ranges[$i]->[1]);
1174                }
1175            }
1176        }
1177    }
1178
1179    # We have generated the list of bytes in two ways; one trying to use masks
1180    # to cut the number of branches down, and the other to look at individual
1181    # ranges (some of which could be cut down by using a mask for just it).
1182    # We return whichever method uses the fewest branches.
1183    return "( "
1184           . join( " || ", (@masks && @masks < @ranges + $range_count_extra)
1185                            ? @masks
1186                            : @ranges)
1187           . " )";
1188}
1189
1190# _combine
1191# recursively turn a list of conditions into a fast break-out condition
1192# used by _cond_as_str() for 'cp' type macros.
1193sub _combine {
1194    my ( $self, $test, @cond )= @_;
1195    return if !@cond;
1196    my $item= shift @cond;
1197    my ( $cstr, $gtv );
1198    if ( ref $item ) {  # @item should be a 2-element array giving range start
1199                        # and end
1200        if ($item->[0] == 0) {  # UV's are never negative, so skip "0 <= "
1201                                # test which could generate a compiler warning
1202                                # that test is always true
1203            $cstr= sprintf( "$test <= $self->{val_fmt}", $item->[1] );
1204        }
1205        else {
1206            $cstr=
1207          sprintf( "( $self->{val_fmt} <= $test && $test <= $self->{val_fmt} )",
1208                   @$item );
1209        }
1210        $gtv= sprintf "$self->{val_fmt}", $item->[1];
1211    } else {
1212        $cstr= sprintf( "$self->{val_fmt} == $test", $item );
1213        $gtv= sprintf "$self->{val_fmt}", $item;
1214    }
1215    if ( @cond ) {
1216        my $combine= $self->_combine( $test, @cond );
1217        if (@cond >1) {
1218            return "( $cstr || ( $gtv < $test &&\n"
1219                   . $combine . " ) )";
1220        } else {
1221            return "( $cstr || $combine )";
1222        }
1223    } else {
1224        return $cstr;
1225    }
1226}
1227
1228# _render()
1229# recursively convert an optree to text with reasonably neat formatting
1230sub _render {
1231    my ( $self, $op, $combine, $brace, $opts_ref, $def, $submacros )= @_;
1232    return 0 if ! defined $op;  # The set is empty
1233    if ( !ref $op ) {
1234        return $op;
1235    }
1236    my $cond= $self->_cond_as_str( $op, $combine, $opts_ref );
1237    #no warnings 'recursion';   # This would allow really really inefficient
1238                                # code to be generated.  See pod
1239    my $yes= $self->_render( $op->{yes}, $combine, 1, $opts_ref, $def, $submacros );
1240    return $yes if $cond eq '1';
1241
1242    my $no= $self->_render( $op->{no},   $combine, 0, $opts_ref, $def, $submacros );
1243    return "( $cond )" if $yes eq '1' and $no eq '0';
1244    my ( $lb, $rb )= $brace ? ( "( ", " )" ) : ( "", "" );
1245    return "$lb$cond ? $yes : $no$rb"
1246      if !ref( $op->{yes} ) && !ref( $op->{no} );
1247    my $ind1= " " x 4;
1248    my $ind= "\n" . ( $ind1 x $op->{depth} );
1249
1250    if ( ref $op->{yes} ) {
1251        $yes= $ind . $ind1 . $yes;
1252    } else {
1253        $yes= " " . $yes;
1254    }
1255
1256    my $str= "$lb$cond ?$yes$ind: $no$rb";
1257    if (length $str > 6000) {
1258        push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $yes_idx= 0+@$submacros), $yes;
1259        push @$submacros, sprintf "#define $def\n( %s )", "_part" . (my $no_idx= 0+@$submacros), $no;
1260        return sprintf "%s%s ? $def : $def%s", $lb, $cond, "_part$yes_idx", "_part$no_idx", $rb;
1261    }
1262    return $str;
1263}
1264
1265# $expr=render($op,$combine)
1266#
1267# convert an optree to text with reasonably neat formatting. If $combine
1268# is true then the condition is created using "fast breakouts" which
1269# produce uglier expressions that are more efficient for common case,
1270# longer lists such as that resulting from type 'cp' output.
1271# Currently only used for type 'cp' macros.
1272sub render {
1273    my ( $self, $op, $combine, $opts_ref, $def_fmt )= @_;
1274
1275    my @submacros;
1276    my $macro= sprintf "#define $def_fmt\n( %s )", "", $self->_render( $op, $combine, 0, $opts_ref, $def_fmt, \@submacros );
1277
1278    return join "\n\n", map { "/*** GENERATED CODE ***/\n" . __macro( __clean( $_ ) ) } @submacros, $macro;
1279}
1280
1281# make_macro
1282# make a macro of a given type.
1283# calls into make_trie and (generic_|length_)optree as needed
1284# Opts are:
1285# type     : 'cp','cp_high', 'generic','high','low','latin1','utf8','LATIN1','UTF8'
1286# ret_type : 'cp' or 'len'
1287# safe     : add length guards to macro
1288#
1289# type defaults to 'generic', and ret_type to 'len' unless type is 'cp'
1290# in which case it defaults to 'cp' as well.
1291#
1292# It is illegal to do a type 'cp' macro on a pattern with multi-codepoint
1293# sequences in it, as the generated macro will accept only a single codepoint
1294# as an argument.
1295#
1296# It is also illegal to do a non-safe macro on a pattern with multi-codepoint
1297# sequences in it, as even if it is known to be well-formed, we need to not
1298# run off the end of the buffer when say the buffer ends with the first two
1299# characters, but three are looked at by the macro.
1300#
1301# returns the macro.
1302
1303
1304sub make_macro {
1305    my $self= shift;
1306    my %opts= @_;
1307    my $type= $opts{type} || 'generic';
1308    if ($self->{has_multi}) {
1309        if ($type =~ /^cp/) {
1310            die "Can't do a 'cp' on multi-codepoint character class '$self->{op}'"
1311        }
1312        elsif (! $opts{safe}) {
1313            die "'safe' is required on multi-codepoint character class '$self->{op}'"
1314        }
1315    }
1316    my $ret_type= $opts{ret_type} || ( $opts{type} =~ /^cp/ ? 'cp' : 'len' );
1317    my $method;
1318    if ( $opts{safe} ) {
1319        $method= 'length_optree';
1320    } elsif ( $type =~ /generic/ ) {
1321        $method= 'generic_optree';
1322    } else {
1323        $method= 'optree';
1324    }
1325    my @args= $type =~ /^cp/ ? 'cp' : 's';
1326    push @args, "e" if $opts{safe};
1327    push @args, "is_utf8" if $type =~ /generic/;
1328    push @args, "len" if $ret_type eq 'both';
1329    my $pfx= $ret_type eq 'both'    ? 'what_len_' :
1330             $ret_type eq 'cp'      ? 'what_'     : 'is_';
1331    my $ext= $type     =~ /generic/ ? ''          : '_' . lc( $type );
1332    $ext .= '_non_low' if $type eq 'generic_non_low';
1333    $ext .= "_safe" if $opts{safe};
1334    my $argstr= join ",", @args;
1335    my $def_fmt="$pfx$self->{op}$ext%s($argstr)";
1336    my $optree= $self->$method( %opts, type => $type, ret_type => $ret_type );
1337    return $self->render( $optree, ($type =~ /^cp/) ? 1 : 0, \%opts, $def_fmt );
1338}
1339
1340# if we aren't being used as a module (highly likely) then process
1341# the __DATA__ below and produce macros in regcharclass.h
1342# if an argument is provided to the script then it is assumed to
1343# be the path of the file to output to, if the arg is '-' outputs
1344# to STDOUT.
1345if ( !caller ) {
1346    $|++;
1347    my $path= shift @ARGV || "regcharclass.h";
1348    my $out_fh;
1349    if ( $path eq '-' ) {
1350        $out_fh= \*STDOUT;
1351    } else {
1352	$out_fh = open_new( $path );
1353    }
1354    print $out_fh read_only_top( lang => 'C', by => $0,
1355				 file => 'regcharclass.h', style => '*',
1356				 copyright => [2007, 2011],
1357                                 final => <<EOF,
1358WARNING: These macros are for internal Perl core use only, and may be
1359changed or removed without notice.
1360EOF
1361    );
1362    print $out_fh "\n#ifndef H_REGCHARCLASS   /* Guard against nested #includes */\n#define H_REGCHARCLASS 1\n\n";
1363
1364    my ( $op, $title, @txt, @types, %mods );
1365    my $doit= sub {
1366        return unless $op;
1367
1368        # Skip if to compile on a different platform.
1369        return if delete $mods{only_ascii_platform} && ! ASCII_PLATFORM;
1370        return if delete $mods{only_ebcdic_platform} && ord 'A' != 193;
1371
1372        print $out_fh "/*\n\t$op: $title\n\n";
1373        print $out_fh join "\n", ( map { "\t$_" } @txt ), "*/", "";
1374        my $obj= __PACKAGE__->new( op => $op, title => $title, txt => \@txt );
1375
1376        #die Dumper(\@types,\%mods);
1377
1378        my @mods;
1379        push @mods, 'safe' if delete $mods{safe};
1380        unshift @mods, 'fast' if delete $mods{fast} || ! @mods; # Default to 'fast'
1381                                                                # do this one
1382                                                                # first, as
1383                                                                # traditional
1384        if (%mods) {
1385            die "Unknown modifiers: ", join ", ", map { "'$_'" } sort keys %mods;
1386        }
1387
1388        foreach my $type_spec ( @types ) {
1389            my ( $type, $ret )= split /-/, $type_spec;
1390            $ret ||= 'len';
1391            foreach my $mod ( @mods ) {
1392                delete $mods{$mod};
1393                my $macro= $obj->make_macro(
1394                    type     => $type,
1395                    ret_type => $ret,
1396                    safe     => $mod eq 'safe' && $type !~ /^cp/,
1397                );
1398                print $out_fh $macro, "\n";
1399            }
1400        }
1401    };
1402
1403    while ( <DATA> ) {
1404        s/^ \s* (?: \# .* ) ? $ //x;    # squeeze out comment and blanks
1405        next unless /\S/;
1406        chomp;
1407        if ( /^[A-Z]/ ) {
1408            $doit->();  # This starts a new definition; do the previous one
1409            ( $op, $title )= split /\s*:\s*/, $_, 2;
1410            @txt= ();
1411        } elsif ( s/^=>// ) {
1412            my ( $type, $modifier )= split /:/, $_;
1413            @types= split ' ', $type;
1414            undef %mods;
1415            map { $mods{$_} = 1 } split ' ',  $modifier;
1416        } else {
1417            push @txt, "$_";
1418        }
1419    }
1420    $doit->();
1421
1422    print $out_fh "\n#endif /* H_REGCHARCLASS */\n";
1423
1424    if($path eq '-') {
1425	print $out_fh "/* ex: set ro: */\n";
1426    } else {
1427	read_only_bottom_close_and_rename($out_fh)
1428    }
1429}
1430
1431# The form of the input is a series of definitions to make macros for.
1432# The first line gives the base name of the macro, followed by a colon, and
1433# then text to be used in comments associated with the macro that are its
1434# title or description.  In all cases the first (perhaps only) parameter to
1435# the macro is a pointer to the first byte of the code point it is to test to
1436# see if it is in the class determined by the macro.  In the case of non-UTF8,
1437# the code point consists only of a single byte.
1438#
1439# The second line must begin with a '=>' and be followed by the types of
1440# macro(s) to be generated; these are specified below.  A colon follows the
1441# types, followed by the modifiers, also specified below.  At least one
1442# modifier is required.
1443#
1444# The subsequent lines give what code points go into the class defined by the
1445# macro.  Multiple characters may be specified via a string like "\x0D\x0A",
1446# enclosed in quotes.  Otherwise the lines consist of one of:
1447#   1)  a single Unicode code point, prefaced by 0x
1448#   2)  a single range of Unicode code points separated by a minus (and
1449#       optional space)
1450#   3)  a single Unicode property specified in the standard Perl form
1451#       "\p{...}"
1452#   4)  a line like 'do path'.  This will do a 'do' on the file given by
1453#       'path'.  It is assumed that this does nothing but load subroutines
1454#       (See item 5 below).  The reason 'require path' is not used instead is
1455#       because 'do' doesn't assume that path is in @INC.
1456#   5)  a subroutine call
1457#           &pkg::foo(arg1, ...)
1458#       where pkg::foo was loaded by a 'do' line (item 4).  The subroutine
1459#       returns an array of entries of forms like items 1-3 above.  This
1460#       allows more complex inputs than achievable from the other input types.
1461#
1462# A blank line or one whose first non-blank character is '#' is a comment.
1463# The definition of the macro is terminated by a line unlike those described.
1464#
1465# Valid types:
1466#   low         generate a macro whose name is 'is_BASE_low' and defines a
1467#               class that includes only ASCII-range chars.  (BASE is the
1468#               input macro base name.)
1469#   latin1      generate a macro whose name is 'is_BASE_latin1' and defines a
1470#               class that includes only upper-Latin1-range chars.  It is not
1471#               designed to take a UTF-8 input parameter.
1472#   high        generate a macro whose name is 'is_BASE_high' and defines a
1473#               class that includes all relevant code points that are above
1474#               the Latin1 range.  This is for very specialized uses only.
1475#               It is designed to take only an input UTF-8 parameter.
1476#   utf8        generate a macro whose name is 'is_BASE_utf8' and defines a
1477#               class that includes all relevant characters that aren't ASCII.
1478#               It is designed to take only an input UTF-8 parameter.
1479#   LATIN1      generate a macro whose name is 'is_BASE_latin1' and defines a
1480#               class that includes both ASCII and upper-Latin1-range chars.
1481#               It is not designed to take a UTF-8 input parameter.
1482#   UTF8        generate a macro whose name is 'is_BASE_utf8' and defines a
1483#               class that can include any code point, adding the 'low' ones
1484#               to what 'utf8' works on.  It is designed to take only an input
1485#               UTF-8 parameter.
1486#   generic     generate a macro whose name is 'is_BASE".  It has a 2nd,
1487#               boolean, parameter which indicates if the first one points to
1488#               a UTF-8 string or not.  Thus it works in all circumstances.
1489#   generic_non_low generate a macro whose name is 'is_BASE_non_low".  It has
1490#               a 2nd, boolean, parameter which indicates if the first one
1491#               points to a UTF-8 string or not.  It excludes any ASCII-range
1492#               matches, but otherwise it works in all circumstances.
1493#   cp          generate a macro whose name is 'is_BASE_cp' and defines a
1494#               class that returns true if the UV parameter is a member of the
1495#               class; false if not.
1496#   cp_high     like cp, but it is assumed that it is known that the UV
1497#               parameter is above Latin1.  The name of the generated macro is
1498#               'is_BASE_cp_high'.  This is different from high-cp, derived
1499#               below.
1500# A macro of the given type is generated for each type listed in the input.
1501# The default return value is the number of octets read to generate the match.
1502# Append "-cp" to the type to have it instead return the matched codepoint.
1503#               The macro name is changed to 'what_BASE...'.  See pod for
1504#               caveats
1505# Appending '-both" instead adds an extra parameter to the end of the argument
1506#               list, which is a pointer as to where to store the number of
1507#               bytes matched, while also returning the code point.  The macro
1508#               name is changed to 'what_len_BASE...'.  See pod for caveats
1509#
1510# Valid modifiers:
1511#   safe        The input string is not necessarily valid UTF-8.  In
1512#               particular an extra parameter (always the 2nd) to the macro is
1513#               required, which points to one beyond the end of the string.
1514#               The macro will make sure not to read off the end of the
1515#               string.  In the case of non-UTF8, it makes sure that the
1516#               string has at least one byte in it.  The macro name has
1517#               '_safe' appended to it.
1518#   fast        The input string is valid UTF-8.  No bounds checking is done,
1519#               and the macro can make assumptions that lead to faster
1520#               execution.
1521#   only_ascii_platform   Skip this definition if this program is being run on
1522#               a non-ASCII platform.
1523#   only_ebcdic_platform  Skip this definition if this program is being run on
1524#               a non-EBCDIC platform.
1525# No modifier need be specified; fast is assumed for this case.  If both
1526# 'fast', and 'safe' are specified, two macros will be created for each
1527# 'type'.
1528#
1529# If run on a non-ASCII platform will automatically convert the Unicode input
1530# to native.  The documentation above is slightly wrong in this case.  'low'
1531# actually refers to code points whose UTF-8 representation is the same as the
1532# non-UTF-8 version (invariants); and 'latin1' refers to all the rest of the
1533# code points less than 256.
1534
15351; # in the unlikely case we are being used as a module
1536
1537__DATA__
1538# This is no longer used, but retained in case it is needed some day.
1539# TRICKYFOLD: Problematic fold case letters.  When adding to this list, also should add them to regcomp.c and fold_grind.t
1540# => generic cp generic-cp generic-both :fast safe
1541# 0x00DF	# LATIN SMALL LETTER SHARP S
1542# 0x0390	# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
1543# 0x03B0	# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
1544# 0x1E9E  # LATIN CAPITAL LETTER SHARP S, because maps to same as 00DF
1545# 0x1FD3  # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA; maps same as 0390
1546# 0x1FE3  # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA; maps same as 03B0
1547
1548LNBREAK: Line Break: \R
1549=> generic UTF8 LATIN1 : safe
1550"\x0D\x0A"      # CRLF - Network (Windows) line ending
1551\p{VertSpace}
1552
1553HORIZWS: Horizontal Whitespace: \h \H
1554=> high cp_high : fast
1555\p{HorizSpace}
1556
1557VERTWS: Vertical Whitespace: \v \V
1558=> high cp_high : fast
1559\p{VertSpace}
1560
1561XDIGIT: Hexadecimal digits
1562=> high cp_high : fast
1563\p{XDigit}
1564
1565XPERLSPACE: \p{XPerlSpace}
1566=> high cp_high : fast
1567\p{XPerlSpace}
1568
1569REPLACEMENT: Unicode REPLACEMENT CHARACTER
1570=> UTF8 :safe
15710xFFFD
1572
1573NONCHAR: Non character code points
1574=> UTF8 :fast
1575\p{Nchar}
1576
1577SURROGATE: Surrogate characters
1578=> UTF8 :fast
1579\p{Gc=Cs}
1580
1581GCB_L: Grapheme_Cluster_Break=L
1582=> UTF8 :fast
1583\p{_X_GCB_L}
1584
1585GCB_LV_LVT_V: Grapheme_Cluster_Break=(LV or LVT or V)
1586=> UTF8 :fast
1587\p{_X_LV_LVT_V}
1588
1589GCB_Prepend: Grapheme_Cluster_Break=Prepend
1590=> UTF8 :fast
1591\p{_X_GCB_Prepend}
1592
1593GCB_RI: Grapheme_Cluster_Break=RI
1594=> UTF8 :fast
1595\p{_X_RI}
1596
1597GCB_SPECIAL_BEGIN_START: Grapheme_Cluster_Break=special_begin_starts
1598=> UTF8 :fast
1599\p{_X_Special_Begin_Start}
1600
1601GCB_T: Grapheme_Cluster_Break=T
1602=> UTF8 :fast
1603\p{_X_GCB_T}
1604
1605GCB_V: Grapheme_Cluster_Break=V
1606=> UTF8 :fast
1607\p{_X_GCB_V}
1608
1609# This program was run with this enabled, and the results copied to utf8.h;
1610# then this was commented out because it takes so long to figure out these 2
1611# million code points.  The results would not change unless utf8.h decides it
1612# wants a maximum other than 4 bytes, or this program creates better
1613# optimizations
1614#UTF8_CHAR: Matches utf8 from 1 to 4 bytes
1615#=> UTF8 :safe only_ascii_platform
1616#0x0 - 0x1FFFFF
1617
1618# This hasn't been commented out, because we haven't an EBCDIC platform to run
1619# it on, and the 3 types of EBCDIC allegedly supported by Perl would have
1620# different results
1621UTF8_CHAR: Matches utf8 from 1 to 5 bytes
1622=> UTF8 :safe only_ebcdic_platform
16230x0 - 0x3FFFFF:
1624
1625QUOTEMETA: Meta-characters that \Q should quote
1626=> high :fast
1627\p{_Perl_Quotemeta}
1628
1629MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1630=> UTF8 :safe
1631do regen/regcharclass_multi_char_folds.pl
1632
1633# 1 => All folds
1634&regcharclass_multi_char_folds::multi_char_folds(1)
1635
1636MULTI_CHAR_FOLD: multi-char strings that are folded to by a single character
1637=> LATIN1 : safe
1638
1639&regcharclass_multi_char_folds::multi_char_folds(0)
1640# 0 => Latin1-only
1641
1642FOLDS_TO_MULTI: characters that fold to multi-char strings
1643=> UTF8 :fast
1644\p{_Perl_Folds_To_Multi_Char}
1645
1646PROBLEMATIC_LOCALE_FOLD : characters whose fold is problematic under locale
1647=> UTF8 cp :fast
1648\p{_Perl_Problematic_Locale_Folds}
1649
1650PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are problematic under locale
1651=> UTF8 cp :fast
1652\p{_Perl_Problematic_Locale_Foldeds_Start}
1653
1654PATWS: pattern white space
1655=> generic generic_non_low cp : safe
1656\p{PatWS}
1657