1package Regexp::Assemble::Compressed;
2
3use strict;
4use warnings;
5our $VERSION = '0.02';
6use base qw(Regexp::Assemble);
7
8# Note: maybe handle \U,\L more smartly
9our $char = qr/
10    (?:\\u|\\l|)                                 # \u, \l acts on one char or char group
11    (?:
12          \\Q.+?\\E                              # capture \Q..\E completely
13        | \[:[^:]+:\]                            # posix char class
14        | \\[UL].+?(?:\\E|\Z)                    # capture \U..\E and \L..\E completely
15        | \\x(?:\{[\dA-Fa-f]+\}|[\dA-Fa-f]{1,2}) # \x.. or \x{...}
16        | \\\d{1,3}                              # \000 - octal
17        | \\N\{[^{]+\}                           # unicode char
18        | \\[pP]\{[^{]+\}                        # unicode character class
19        | \\c.                                   # control char \cX
20        | \\.                                    # \t \n \s ...
21        | .                                      # any char
22     )
23/xo;
24
25sub as_string {
26    my $self = shift;
27    my $string = $self->SUPER::as_string;
28    $string =~ s{(?<!\\)\[(\^|)((?:\[:[^:]+:\]|.)+?)(?<!\\)\]}{ "[" . $1 . _compress($2) . "]" }eg;
29    return $string;
30}
31
32sub _compress {
33    my $string = shift;
34    my @characters = sort $string =~ m{ ( $char\-$char | $char ) }sgx;
35    #warn "[ ".join('|', @characters)." ]";
36    my @stack = ();
37    my @skipped = ();
38    my $last;
39    for my $char (@characters) {
40        if ( length($char) == 1 ) {
41            my $num = ord $char;
42            if (defined $last and $num - $last == 0) { next }
43            if (defined $last and @skipped and $num >= ord $skipped[0] and $num <= ord $skipped[-1]) { next }
44            if (defined $last and $num - $last == 1) {
45                push @skipped, $char;
46                $last = $num;
47                next;
48            }
49            elsif (@skipped) {
50                push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
51                @skipped = ();
52            }
53            push @stack, $char;
54            $last = $num;
55        }
56        elsif (length $char == 3 and $char =~ /^([^\\])-([^\\])$/) {
57            my ($beg,$end) = ($1,$2);
58            my $num = ord $beg;
59            my $enn = ord $end;
60            if (defined $last and @skipped and $num + 1 >= ord $skipped[0] and $num <= ord $skipped[-1]) {
61                if ($enn <= ord $skipped[-1]) { next }
62                else {
63                    my $next = $skipped[-1];
64                    ++$next;
65                    push @skipped, $next..$end;
66                    $last = $enn;
67                    next;
68                }
69            }
70            if (defined $last and $num - $last == 1) {
71                push @skipped, $beg..$end;
72                $last = $enn;
73                next;
74            }
75            elsif (@skipped) {
76                push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
77                @skipped = ();
78            }
79            push @stack, $beg;
80            push @skipped, ++$beg..$end;
81            $last = $enn;
82        }
83        else {
84            if (@skipped) {
85                push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
86                @skipped = ();
87            }
88            push @stack, $char;
89        }
90    }
91    if (@skipped) {
92        push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]);
93    }
94    return join '', @stack;
95}
96
971;
98__END__
99
100=head1 NAME
101
102Regexp::Assemble::Compressed - Assemble more compressed Regular Expression
103
104=head1 SYNOPSIS
105
106 use Regexp::Assemble::Compressed;
107
108 my $ra = Regexp::Assemble::Compressed->new;
109 my @cctlds = qw(ma mc md me mf mg mh mk ml mm mn mo mp
110                 mq mr ms mt mu mv mw mx my mz);
111 for my $tld ( @cctlds ) {
112     $ra->add( $tld );
113 }
114 print $ra->re; # prints m[ac-hk-z].
115                # Regexp::Assemble prints m[acdefghklmnopqrstuvwxyz]
116
117=head1 DESCRIPTION
118
119Regexp::Assemble::Compressed is a subclass of Regexp::Assemble.
120It assembles more compressed regular expressions.
121
122=head1 AUTHOR
123
124Koichi Taniguchi E<lt>taniguchi@livedoor.jpE<gt>
125
126=head1 LICENSE
127
128This library is free software; you can redistribute it and/or modify
129it under the same terms as Perl itself.
130
131=head1 SEE ALSO
132
133L<Regexp::Assemble>
134
135=cut
136