1#!/usr/bin/perl
2
3#########################
4# Usage: ./benchmark-stemmers.pl voc.txt
5#
6# Runs speed (words/second) tests using randomly selected
7# words from a word list.
8#
9
10use strict;
11use warnings;
12
13use lib qw(../lib);
14
15use Lingua::Stem qw ();
16use Lingua::Stem::En qw ();
17use Time::HiRes qw(gettimeofday tv_interval);
18
19my $snowball = eval { require Lingua::Stem::Snowball; };
20if ($@) {
21    $snowball = 0;
22}
23
24my $loops   = 100;
25my $n_words = 3000;
26
27my @word = grep chomp, <ARGV>;
28
29#################################################
30# Preload word list so we have identical runs
31my @word_list = ();
32my $s = $n_words;
33print "Generating base word list...\n";
34for (1..$s) {
35  my $result;
36  my $w = @word[rand(scalar(@word))];
37  push (@word_list,$w);
38}
39print "Generating long word list...\n";
40my $n = $n_words * $loops;
41my @big_word_list = ();
42foreach my $count (1..$loops) {
43    push (@big_word_list, @word_list);
44}
45print "$n_words words repeated $loops times\n\n";
46# Word by word, Lingua::Stem::Snowball
47if ($snowball) {
48    my $start_time = [gettimeofday];
49    for (my $i = 0; $i < $loops; $i++) {
50        foreach my $w (@word_list) {
51            my ($result) = Lingua::Stem::Snowball::stem('en', $w);
52        }
53    }
54    my $elapsed = tv_interval($start_time);
55    printf  "Lingua::Stem::Snowball, one word at a time, no caching: %8s words/second\n", int($n/$elapsed);
56}
57
58# Processed in batches, Lingua::Stem::Snowball
59if ($snowball) {
60    eval {
61        my $start_time = [gettimeofday];
62        for (my $i = 0; $i < $loops; $i++) {
63            my @results = Lingua::Stem::Snowball::stem('en',\@word_list);
64            my $n_words_returned = @results;
65            if ($n_words_returned <= 1) {
66                use Data::Dumper; warn(Dumper(\@results));
67                die(sprintf("Lingua::Stem::Snowball, %6s word batches, no caching:         failed\n", $n_words));
68            }
69        }
70        my $elapsed = tv_interval($start_time);
71        printf  "Lingua::Stem::Snowball, %6s word batches, no caching:%8s words/second\n", $n_words, int($n/$elapsed);
72    };
73    if ($@) {
74        print "$@";
75    }
76 }
77
78# Processed in one batch, Lingua::Stem::Snowball
79if ($snowball) {
80    eval {
81        my $start_time = [gettimeofday];
82        my @results = Lingua::Stem::Snowball::stem('en',\@big_word_list);
83        my $n_words_returned = @results;
84        if ($n_words_returned <= 1) {
85            die(sprintf("Lingua::Stem::Snowball, one batch, no caching:                   failed\n", $n_words));
86        }
87        my $elapsed = tv_interval($start_time);
88        printf  "Lingua::Stem::Snowball, one batch, no caching:          %8s words/second\n", int($n/$elapsed);
89    };
90    if ($@) {
91        print "$@";
92    }
93}
94
95# Word by word, Lingua::Stem
96{
97    my $start_time = [gettimeofday];
98    for (my $i = 0; $i < $loops; $i++) {
99        foreach my $w (@word_list) {
100            my ($result) = Lingua::Stem::stem($w);
101        }
102    }
103    my $elapsed = tv_interval($start_time);
104    printf  "Lingua::Stem, one word at a time, no caching:           %8s words/second\n", int($n/$elapsed);
105}
106
107
108# Processed in batches, Lingua::Stem
109{
110    my $start_time = [gettimeofday];
111    for (my $i = 0; $i < $loops; $i++) {
112        my ($result) = Lingua::Stem::stem(@word_list);
113    }
114    my $elapsed = tv_interval($start_time);
115    printf  "Lingua::Stem, %6s word batches, no caching:          %8s words/second\n", $n_words, int($n/$elapsed);
116}
117
118# Processed in one batch, Lingua::Stem
119{
120    my $start_time = [gettimeofday];
121    my ($result) = Lingua::Stem::stem(@big_word_list);
122    my $elapsed = tv_interval($start_time);
123    printf  "Lingua::Stem, one batch, no caching:                    %8s words/second\n", int($n/$elapsed);
124}
125
126# Word by word, Lingua::Stem, with caching
127{
128    Lingua::Stem::stem_caching({ -level => 2});
129    my $start_time = [gettimeofday];
130    for (my $i = 0; $i < $loops; $i++) {
131        foreach my $w (@word_list) {
132            my ($result) = Lingua::Stem::stem($w);
133        }
134    }
135    my $elapsed = tv_interval($start_time);
136    printf  "Lingua::Stem, one word at a time, cache level 2:        %8s words/second\n", int($n/$elapsed);
137}
138
139# Processed in batches with caching, Lingua::Stem
140{
141    Lingua::Stem::stem_caching({ -level => 2});
142    my $start_time = [gettimeofday];
143    for (my $i = 0; $i < $loops; $i++) {
144        my ($result) = Lingua::Stem::stem(@word_list);
145    }
146    my $elapsed = tv_interval($start_time);
147    printf  "Lingua::Stem, %6s word batches, cache level 2:       %8s words/second\n", $n_words, int($n/$elapsed);
148}
149# Processed in one batch with caching, Lingua::Stem
150{
151    Lingua::Stem::stem_caching({ -level => 2});
152    my $start_time = [gettimeofday];
153    my ($result) = Lingua::Stem::stem(@big_word_list);
154    my $elapsed = tv_interval($start_time);
155    printf  "Lingua::Stem, one batch, cache level 2:                 %8s words/second\n", int($n/$elapsed);
156}
157
158# Word by word, Lingua::Stem::En, with caching
159{
160    Lingua::Stem::En::stem_caching({ -level => 2});
161    my $start_time = [gettimeofday];
162    for (my $i = 0; $i < $loops; $i++) {
163        foreach my $w (@word_list) {
164            my ($result) = Lingua::Stem::En::stem({ -words => [$w] });
165        }
166    }
167    my $elapsed = tv_interval($start_time);
168    printf  "Lingua::Stem::En, one word at a time, cache level 2:        %8s words/second\n", int($n/$elapsed);
169}
170
171# Processed in batches with caching, Lingua::Stem:En
172{
173    Lingua::Stem::En::stem_caching({ -level => 2});
174    my $start_time = [gettimeofday];
175    for (my $i = 0; $i < $loops; $i++) {
176        my ($result) = Lingua::Stem::En::stem({ -words => \@word_list });
177    }
178    my $elapsed = tv_interval($start_time);
179    printf  "Lingua::Stem::En, %6s word batches, cache level 2:       %8s words/second\n", $n_words, int($n/$elapsed);
180}
181# Processed in one batch with caching, Lingua::Stem::En
182{
183    Lingua::Stem::En::stem_caching({ -level => 2});
184    my $start_time = [gettimeofday];
185    my ($result) = Lingua::Stem::En::stem({ -words => \@big_word_list });
186    my $elapsed = tv_interval($start_time);
187    printf  "Lingua::Stem::En, one batch, cache level 2:                 %8s words/second\n", int($n/$elapsed);
188}
189