1package Regexp::Assemble::Compressed; 2 3use strict; 4use warnings; 5our $VERSION = '0.02'; 6use base qw(Regexp::Assemble); 7 8# Note: maybe handle \U,\L more smartly 9our $char = qr/ 10 (?:\\u|\\l|) # \u, \l acts on one char or char group 11 (?: 12 \\Q.+?\\E # capture \Q..\E completely 13 | \[:[^:]+:\] # posix char class 14 | \\[UL].+?(?:\\E|\Z) # capture \U..\E and \L..\E completely 15 | \\x(?:\{[\dA-Fa-f]+\}|[\dA-Fa-f]{1,2}) # \x.. or \x{...} 16 | \\\d{1,3} # \000 - octal 17 | \\N\{[^{]+\} # unicode char 18 | \\[pP]\{[^{]+\} # unicode character class 19 | \\c. # control char \cX 20 | \\. # \t \n \s ... 21 | . # any char 22 ) 23/xo; 24 25sub as_string { 26 my $self = shift; 27 my $string = $self->SUPER::as_string; 28 $string =~ s{(?<!\\)\[(\^|)((?:\[:[^:]+:\]|.)+?)(?<!\\)\]}{ "[" . $1 . _compress($2) . "]" }eg; 29 return $string; 30} 31 32sub _compress { 33 my $string = shift; 34 my @characters = sort $string =~ m{ ( $char\-$char | $char ) }sgx; 35 #warn "[ ".join('|', @characters)." ]"; 36 my @stack = (); 37 my @skipped = (); 38 my $last; 39 for my $char (@characters) { 40 if ( length($char) == 1 ) { 41 my $num = ord $char; 42 if (defined $last and $num - $last == 0) { next } 43 if (defined $last and @skipped and $num >= ord $skipped[0] and $num <= ord $skipped[-1]) { next } 44 if (defined $last and $num - $last == 1) { 45 push @skipped, $char; 46 $last = $num; 47 next; 48 } 49 elsif (@skipped) { 50 push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]); 51 @skipped = (); 52 } 53 push @stack, $char; 54 $last = $num; 55 } 56 elsif (length $char == 3 and $char =~ /^([^\\])-([^\\])$/) { 57 my ($beg,$end) = ($1,$2); 58 my $num = ord $beg; 59 my $enn = ord $end; 60 if (defined $last and @skipped and $num + 1 >= ord $skipped[0] and $num <= ord $skipped[-1]) { 61 if ($enn <= ord $skipped[-1]) { next } 62 else { 63 my $next = $skipped[-1]; 64 ++$next; 65 push @skipped, $next..$end; 66 $last = $enn; 67 next; 68 } 69 } 70 if (defined $last and $num - $last == 1) { 71 push @skipped, $beg..$end; 72 $last = $enn; 73 next; 74 } 75 elsif (@skipped) { 76 push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]); 77 @skipped = (); 78 } 79 push @stack, $beg; 80 push @skipped, ++$beg..$end; 81 $last = $enn; 82 } 83 else { 84 if (@skipped) { 85 push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]); 86 @skipped = (); 87 } 88 push @stack, $char; 89 } 90 } 91 if (@skipped) { 92 push @stack, @skipped < 2 ? @skipped : ('-', $skipped[-1]); 93 } 94 return join '', @stack; 95} 96 971; 98__END__ 99 100=head1 NAME 101 102Regexp::Assemble::Compressed - Assemble more compressed Regular Expression 103 104=head1 SYNOPSIS 105 106 use Regexp::Assemble::Compressed; 107 108 my $ra = Regexp::Assemble::Compressed->new; 109 my @cctlds = qw(ma mc md me mf mg mh mk ml mm mn mo mp 110 mq mr ms mt mu mv mw mx my mz); 111 for my $tld ( @cctlds ) { 112 $ra->add( $tld ); 113 } 114 print $ra->re; # prints m[ac-hk-z]. 115 # Regexp::Assemble prints m[acdefghklmnopqrstuvwxyz] 116 117=head1 DESCRIPTION 118 119Regexp::Assemble::Compressed is a subclass of Regexp::Assemble. 120It assembles more compressed regular expressions. 121 122=head1 AUTHOR 123 124Koichi Taniguchi E<lt>taniguchi@livedoor.jpE<gt> 125 126=head1 LICENSE 127 128This library is free software; you can redistribute it and/or modify 129it under the same terms as Perl itself. 130 131=head1 SEE ALSO 132 133L<Regexp::Assemble> 134 135=cut 136