1#!/usr/local/bin/perl 2use strict; 3use warnings; 4use Data::Dumper; 5use Time::HiRes qw(time); 6use blib; 7use Text::Prefix::XS; 8use Log::Fu; 9use Getopt::Long; 10use Text::Match::FastAlternatives; 11use Dir::Self; 12use lib __DIR__ . '/t'; 13use Benchmark qw(:all); 14 15require 'txs_gendata.pm'; 16 17GetOptions( 18 'pp' => \my $UsePP, 19 'xs' => \my $UseXS, 20 'xs_multi' => \my $UseXS_multi, 21 'xs_op' => \my $UseXS_OP, 22 're' => \my $UseRE, 23 're2' => \my $UseRE2, 24 25 're2_cap' => \my $UseRE2_CAP, 26 're_cap' => \my $UseRE_CAP, 27 28 'tmfa' => \my $UseTMFA, 29 'cached' => \my $UseCached, 30 'cycles=i' => \my $Cycles, 31 'count=i' => \my $StringCount, 32 'min=i' => \my $TermMin, 33 'max=i' => \my $TermMax, 34 'terms=i' => \my $TermCount, 35 'bench' => \my $DoBench 36); 37 38$Cycles ||= 0; 39$StringCount ||= 2_000_000; 40 41my $matches = 0; 42 43txs_gendata::GenData( { 44 StringCount => $StringCount, 45 TermCount => $TermCount ||= 20, 46 MinLength => $TermMin ||= 5, 47 MaxLength => $TermMax ||= 20 48 }, 49 \my @terms, 50 \my @strings); 51 52printf("Generated INPUT=%d TERMS=%d TERM_MIN=%d TERM_MAX=%d\n", 53 $StringCount, scalar @terms, $TermMin, $TermMax); 54 55 56sub search_PP { 57 my $match_first_pass = 0; 58 my $not_filtered = 0; 59 my %index; 60 my %fullmatch; 61 my $MIN_INDEX = 100; 62 foreach my $term (@terms) { 63 if(length($term) < $MIN_INDEX) { 64 $MIN_INDEX = length($term); 65 } 66 my @chars = split(//, $term); 67 while(@chars) { 68 $index{join("", @chars)} = 1; 69 pop @chars; 70 } 71 $fullmatch{$term} = 1; 72 } 73 74 CHECK_TERM: 75 foreach my $str (@strings) { 76 my $j = 1; 77 while($j <= $MIN_INDEX) { 78 if(!exists $index{substr($str,0,$j)}){ 79 next CHECK_TERM; 80 } 81 $j++; 82 } 83 $not_filtered++; 84 #The prefix matches 85 foreach my $term (@terms) { 86 if(substr($str,0,length($term)) eq $term) { 87 $matches++; 88 next CHECK_TERM; 89 } 90 } 91 }; 92 return $matches; 93} 94 95#Try large regex version.. 96sub gen_big_re { 97 my ($is_cap,$is_re2) = @_; 98 99 my $ret; 100 $ret = join '|', map quotemeta $_, @terms; 101 if($is_cap) { 102 $ret = qr/^($ret)/; 103 } else { 104 $ret = qr/^(?:$ret)/; 105 } 106} 107 108sub search_Perl_RE { 109 110 my $re = gen_big_re(); 111 foreach my $str (@strings) { 112 if($str =~ $re) { 113 $matches++; 114 } 115 } 116 return $matches; 117} 118 119sub search_Perl_RE_cap { 120 my $re = gen_big_re(1, 0); 121 foreach my $str (@strings) { 122 my ($match) = ( $str =~ $re ); 123 if($match) { 124 $matches++; 125 } 126 } 127 return $matches; 128} 129 130 131sub search_TMFA { 132 my $tmfa = Text::Match::FastAlternatives->new(@terms); 133 foreach my $str (@strings) { 134 if($tmfa->match_at($str, 0)) { 135 $matches++; 136 } 137 } 138 return $matches; 139} 140 141sub search_XS { 142 my $xs_begin_time = time(); 143 my $xs_search = prefix_search_create(@terms); 144 my $xs_duration = time() - $xs_begin_time; 145 if($ENV{TEXT_XS_DUMP}) { 146 printf("Creating search took %0.3f sec\n", $xs_duration); 147 } 148 foreach my $str (@strings) { 149 if(my $result = prefix_search $xs_search, $str) { 150 $matches++; 151 } 152 } 153 if($ENV{TEXT_XS_DUMP}) { 154 Text::Prefix::XS::prefix_search_dump($xs_search); 155 } 156 return $matches; 157} 158 159sub search_XS_multi { 160 my $xs_search = prefix_search_create(@terms); 161 my $match_hash = prefix_search_multi($xs_search, @strings); 162 while (my ($pfix,$mch) = each %$match_hash) { 163 $matches += scalar @$mch; 164 } 165 if($ENV{TEXT_XS_DUMP}) { 166 Text::Prefix::XS::prefix_search_dump($xs_search); 167 } 168 return $matches; 169} 170 171sub search_XS_op { 172 my $xs_search = prefix_search_create(@terms); 173 foreach my $str (@strings) { 174 if(my $result = psearch($xs_search,$str)) { 175 $matches++; 176 } 177 } 178} 179 180if(!($UsePP||$UseXS||$UseRE||$UseRE2 181 ||$UseRE2_CAP||$UseRE_CAP||$UseXS_multi|| 182 $UseXS_OP)) { 183 $UsePP = 1; 184 $UseXS = 1; 185 $UseXS_OP = 1; 186 $UseXS_multi = 1; 187 $UseRE = 1; 188 $UseRE2 = 1; 189 $UseTMFA = 1; 190 $UseRE2_CAP = 1; 191 $UseRE_CAP = 1; 192} 193 194my $can_have_re2; 195eval { 196 require 're2_test.pm'; 197 $can_have_re2 = 1; 198}; 199 200my @fn_maps = ( 201 #[$UsePP, 202 # "[Y] Perl-Trie", \&search_PP], 203 204 [$UseTMFA, 205 "[N] TMFA", \&search_TMFA], 206 [$UseRE, 207 "[N] perl-re", \&search_Perl_RE], 208 [$UseRE2 && $can_have_re2, 209 "[N] RE2", sub { re2_test::search_RE2(\@terms, \@strings) }], 210 [$UseRE_CAP, 211 '[Y] perl-re', \&search_Perl_RE_cap], 212 [$UseRE2_CAP && $can_have_re2, 213 '[Y] RE2', sub { re2_test::search_RE2_CAP(\@terms, \@strings) }], 214 215 [$UseXS, "[Y] TXS", \&search_XS], 216 [$UseXS_multi, '[Y] TXS-Multi', \&search_XS_multi], 217 218 #[$UseXS_OP, '[Y] TXS-OP', \&search_XS_op], 219); 220 221printf("%-5s %-10s %3s\t%s\n", 222 'CAP', 'NAME', 'DUR', 'MATCH'); 223 224my $cycle_print = $Cycles; 225 226foreach my $cycle (0..$Cycles) { 227 if($Cycles) { 228 print "Cycle: $cycle\n"; 229 } 230 foreach (@fn_maps) { 231 my ($enabled,$title,$fn) = @$_; 232 if(!$enabled) { 233 printf("%-15s SKIP\n", $title); 234 next; 235 } 236 $matches = 0; 237 my $begin_time = time(); 238 my $matches = $fn->(); 239 my $duration = time() - $begin_time; 240 printf("%-15s\t%0.2fs\tM=%d\n", 241 $title, $duration, $matches); 242 } 243} 2441; 245