1#!/usr/local/bin/perl
2use strict;
3use warnings;
4use Data::Dumper;
5use Time::HiRes qw(time);
6use blib;
7use Text::Prefix::XS;
8use Log::Fu;
9use Getopt::Long;
10use Text::Match::FastAlternatives;
11use Dir::Self;
12use lib __DIR__ . '/t';
13use Benchmark qw(:all);
14
15require 'txs_gendata.pm';
16
17GetOptions(
18    'pp'            => \my $UsePP,
19    'xs'            => \my $UseXS,
20    'xs_multi'      => \my $UseXS_multi,
21    'xs_op'         => \my $UseXS_OP,
22    're'            => \my $UseRE,
23    're2'           => \my $UseRE2,
24
25    're2_cap'       => \my $UseRE2_CAP,
26    're_cap'        => \my $UseRE_CAP,
27
28    'tmfa'          => \my $UseTMFA,
29    'cached'        => \my $UseCached,
30    'cycles=i'      => \my $Cycles,
31    'count=i'       => \my $StringCount,
32    'min=i'         => \my $TermMin,
33    'max=i'         => \my $TermMax,
34    'terms=i'       => \my $TermCount,
35    'bench'         => \my $DoBench
36);
37
38$Cycles ||= 0;
39$StringCount ||= 2_000_000;
40
41my $matches = 0;
42
43txs_gendata::GenData( {
44        StringCount => $StringCount,
45        TermCount => $TermCount ||= 20,
46        MinLength => $TermMin ||= 5,
47        MaxLength => $TermMax ||= 20
48    },
49    \my @terms,
50    \my @strings);
51
52printf("Generated INPUT=%d TERMS=%d TERM_MIN=%d TERM_MAX=%d\n",
53       $StringCount, scalar @terms, $TermMin, $TermMax);
54
55
56sub search_PP {
57    my $match_first_pass = 0;
58    my $not_filtered = 0;
59    my %index;
60    my %fullmatch;
61    my $MIN_INDEX = 100;
62    foreach my $term (@terms) {
63        if(length($term) < $MIN_INDEX) {
64            $MIN_INDEX = length($term);
65        }
66        my @chars = split(//, $term);
67        while(@chars) {
68            $index{join("", @chars)} = 1;
69            pop @chars;
70        }
71        $fullmatch{$term} = 1;
72    }
73
74    CHECK_TERM:
75    foreach my $str (@strings) {
76        my $j = 1;
77        while($j <= $MIN_INDEX) {
78            if(!exists $index{substr($str,0,$j)}){
79                next CHECK_TERM;
80            }
81            $j++;
82        }
83        $not_filtered++;
84        #The prefix matches
85        foreach my $term (@terms) {
86            if(substr($str,0,length($term)) eq $term) {
87                $matches++;
88                next CHECK_TERM;
89            }
90        }
91    };
92    return $matches;
93}
94
95#Try large regex version..
96sub gen_big_re {
97    my ($is_cap,$is_re2) = @_;
98
99    my $ret;
100    $ret = join '|',  map quotemeta $_, @terms;
101    if($is_cap) {
102        $ret = qr/^($ret)/;
103    } else {
104        $ret = qr/^(?:$ret)/;
105    }
106}
107
108sub search_Perl_RE {
109
110    my $re = gen_big_re();
111    foreach my $str (@strings) {
112        if($str =~ $re) {
113            $matches++;
114        }
115    }
116    return $matches;
117}
118
119sub search_Perl_RE_cap {
120    my $re = gen_big_re(1, 0);
121    foreach my $str (@strings) {
122        my ($match) = ( $str =~ $re );
123        if($match) {
124            $matches++;
125        }
126    }
127    return $matches;
128}
129
130
131sub search_TMFA {
132    my $tmfa = Text::Match::FastAlternatives->new(@terms);
133    foreach my $str (@strings) {
134        if($tmfa->match_at($str, 0)) {
135            $matches++;
136        }
137    }
138    return $matches;
139}
140
141sub search_XS {
142    my $xs_begin_time = time();
143    my $xs_search = prefix_search_create(@terms);
144    my $xs_duration = time() - $xs_begin_time;
145    if($ENV{TEXT_XS_DUMP}) {
146        printf("Creating search took %0.3f sec\n", $xs_duration);
147    }
148    foreach my $str (@strings) {
149        if(my $result = prefix_search $xs_search, $str) {
150            $matches++;
151        }
152    }
153    if($ENV{TEXT_XS_DUMP}) {
154        Text::Prefix::XS::prefix_search_dump($xs_search);
155    }
156    return $matches;
157}
158
159sub search_XS_multi {
160    my $xs_search = prefix_search_create(@terms);
161    my $match_hash = prefix_search_multi($xs_search, @strings);
162    while (my ($pfix,$mch) = each %$match_hash) {
163        $matches += scalar @$mch;
164    }
165    if($ENV{TEXT_XS_DUMP}) {
166        Text::Prefix::XS::prefix_search_dump($xs_search);
167    }
168    return $matches;
169}
170
171sub search_XS_op {
172    my $xs_search = prefix_search_create(@terms);
173    foreach my $str (@strings) {
174        if(my $result = psearch($xs_search,$str)) {
175            $matches++;
176        }
177    }
178}
179
180if(!($UsePP||$UseXS||$UseRE||$UseRE2
181     ||$UseRE2_CAP||$UseRE_CAP||$UseXS_multi||
182     $UseXS_OP)) {
183    $UsePP = 1;
184    $UseXS = 1;
185    $UseXS_OP = 1;
186    $UseXS_multi = 1;
187    $UseRE = 1;
188    $UseRE2 = 1;
189    $UseTMFA = 1;
190    $UseRE2_CAP = 1;
191    $UseRE_CAP = 1;
192}
193
194my $can_have_re2;
195eval {
196    require 're2_test.pm';
197    $can_have_re2 = 1;
198};
199
200my @fn_maps = (
201    #[$UsePP,
202    # "[Y] Perl-Trie", \&search_PP],
203
204    [$UseTMFA,
205     "[N] TMFA", \&search_TMFA],
206    [$UseRE,
207     "[N] perl-re", \&search_Perl_RE],
208    [$UseRE2 && $can_have_re2,
209     "[N] RE2", sub { re2_test::search_RE2(\@terms, \@strings) }],
210    [$UseRE_CAP,
211     '[Y] perl-re', \&search_Perl_RE_cap],
212    [$UseRE2_CAP && $can_have_re2,
213     '[Y] RE2', sub { re2_test::search_RE2_CAP(\@terms, \@strings) }],
214
215    [$UseXS, "[Y] TXS", \&search_XS],
216    [$UseXS_multi, '[Y] TXS-Multi', \&search_XS_multi],
217
218    #[$UseXS_OP, '[Y] TXS-OP', \&search_XS_op],
219);
220
221printf("%-5s %-10s %3s\t%s\n",
222       'CAP', 'NAME', 'DUR', 'MATCH');
223
224my $cycle_print = $Cycles;
225
226foreach my $cycle (0..$Cycles) {
227    if($Cycles) {
228        print "Cycle: $cycle\n";
229    }
230    foreach (@fn_maps) {
231        my ($enabled,$title,$fn) = @$_;
232        if(!$enabled) {
233            printf("%-15s SKIP\n", $title);
234            next;
235        }
236        $matches = 0;
237        my $begin_time = time();
238        my $matches = $fn->();
239        my $duration = time() - $begin_time;
240        printf("%-15s\t%0.2fs\tM=%d\n",
241                  $title, $duration, $matches);
242    }
243}
2441;
245