1#!/usr/bin/perl 2 3######################### 4# Usage: ./benchmark-stemmers.pl voc.txt 5# 6# Runs speed (words/second) tests using randomly selected 7# words from a word list. 8# 9 10use strict; 11use warnings; 12 13use lib qw(../lib); 14 15use Lingua::Stem qw (); 16use Lingua::Stem::En qw (); 17use Time::HiRes qw(gettimeofday tv_interval); 18 19my $snowball = eval { require Lingua::Stem::Snowball; }; 20if ($@) { 21 $snowball = 0; 22} 23 24my $loops = 100; 25my $n_words = 3000; 26 27my @word = grep chomp, <ARGV>; 28 29################################################# 30# Preload word list so we have identical runs 31my @word_list = (); 32my $s = $n_words; 33print "Generating base word list...\n"; 34for (1..$s) { 35 my $result; 36 my $w = @word[rand(scalar(@word))]; 37 push (@word_list,$w); 38} 39print "Generating long word list...\n"; 40my $n = $n_words * $loops; 41my @big_word_list = (); 42foreach my $count (1..$loops) { 43 push (@big_word_list, @word_list); 44} 45print "$n_words words repeated $loops times\n\n"; 46# Word by word, Lingua::Stem::Snowball 47if ($snowball) { 48 my $start_time = [gettimeofday]; 49 for (my $i = 0; $i < $loops; $i++) { 50 foreach my $w (@word_list) { 51 my ($result) = Lingua::Stem::Snowball::stem('en', $w); 52 } 53 } 54 my $elapsed = tv_interval($start_time); 55 printf "Lingua::Stem::Snowball, one word at a time, no caching: %8s words/second\n", int($n/$elapsed); 56} 57 58# Processed in batches, Lingua::Stem::Snowball 59if ($snowball) { 60 eval { 61 my $start_time = [gettimeofday]; 62 for (my $i = 0; $i < $loops; $i++) { 63 my @results = Lingua::Stem::Snowball::stem('en',\@word_list); 64 my $n_words_returned = @results; 65 if ($n_words_returned <= 1) { 66 use Data::Dumper; warn(Dumper(\@results)); 67 die(sprintf("Lingua::Stem::Snowball, %6s word batches, no caching: failed\n", $n_words)); 68 } 69 } 70 my $elapsed = tv_interval($start_time); 71 printf "Lingua::Stem::Snowball, %6s word batches, no caching:%8s words/second\n", $n_words, int($n/$elapsed); 72 }; 73 if ($@) { 74 print "$@"; 75 } 76 } 77 78# Processed in one batch, Lingua::Stem::Snowball 79if ($snowball) { 80 eval { 81 my $start_time = [gettimeofday]; 82 my @results = Lingua::Stem::Snowball::stem('en',\@big_word_list); 83 my $n_words_returned = @results; 84 if ($n_words_returned <= 1) { 85 die(sprintf("Lingua::Stem::Snowball, one batch, no caching: failed\n", $n_words)); 86 } 87 my $elapsed = tv_interval($start_time); 88 printf "Lingua::Stem::Snowball, one batch, no caching: %8s words/second\n", int($n/$elapsed); 89 }; 90 if ($@) { 91 print "$@"; 92 } 93} 94 95# Word by word, Lingua::Stem 96{ 97 my $start_time = [gettimeofday]; 98 for (my $i = 0; $i < $loops; $i++) { 99 foreach my $w (@word_list) { 100 my ($result) = Lingua::Stem::stem($w); 101 } 102 } 103 my $elapsed = tv_interval($start_time); 104 printf "Lingua::Stem, one word at a time, no caching: %8s words/second\n", int($n/$elapsed); 105} 106 107 108# Processed in batches, Lingua::Stem 109{ 110 my $start_time = [gettimeofday]; 111 for (my $i = 0; $i < $loops; $i++) { 112 my ($result) = Lingua::Stem::stem(@word_list); 113 } 114 my $elapsed = tv_interval($start_time); 115 printf "Lingua::Stem, %6s word batches, no caching: %8s words/second\n", $n_words, int($n/$elapsed); 116} 117 118# Processed in one batch, Lingua::Stem 119{ 120 my $start_time = [gettimeofday]; 121 my ($result) = Lingua::Stem::stem(@big_word_list); 122 my $elapsed = tv_interval($start_time); 123 printf "Lingua::Stem, one batch, no caching: %8s words/second\n", int($n/$elapsed); 124} 125 126# Word by word, Lingua::Stem, with caching 127{ 128 Lingua::Stem::stem_caching({ -level => 2}); 129 my $start_time = [gettimeofday]; 130 for (my $i = 0; $i < $loops; $i++) { 131 foreach my $w (@word_list) { 132 my ($result) = Lingua::Stem::stem($w); 133 } 134 } 135 my $elapsed = tv_interval($start_time); 136 printf "Lingua::Stem, one word at a time, cache level 2: %8s words/second\n", int($n/$elapsed); 137} 138 139# Processed in batches with caching, Lingua::Stem 140{ 141 Lingua::Stem::stem_caching({ -level => 2}); 142 my $start_time = [gettimeofday]; 143 for (my $i = 0; $i < $loops; $i++) { 144 my ($result) = Lingua::Stem::stem(@word_list); 145 } 146 my $elapsed = tv_interval($start_time); 147 printf "Lingua::Stem, %6s word batches, cache level 2: %8s words/second\n", $n_words, int($n/$elapsed); 148} 149# Processed in one batch with caching, Lingua::Stem 150{ 151 Lingua::Stem::stem_caching({ -level => 2}); 152 my $start_time = [gettimeofday]; 153 my ($result) = Lingua::Stem::stem(@big_word_list); 154 my $elapsed = tv_interval($start_time); 155 printf "Lingua::Stem, one batch, cache level 2: %8s words/second\n", int($n/$elapsed); 156} 157 158# Word by word, Lingua::Stem::En, with caching 159{ 160 Lingua::Stem::En::stem_caching({ -level => 2}); 161 my $start_time = [gettimeofday]; 162 for (my $i = 0; $i < $loops; $i++) { 163 foreach my $w (@word_list) { 164 my ($result) = Lingua::Stem::En::stem({ -words => [$w] }); 165 } 166 } 167 my $elapsed = tv_interval($start_time); 168 printf "Lingua::Stem::En, one word at a time, cache level 2: %8s words/second\n", int($n/$elapsed); 169} 170 171# Processed in batches with caching, Lingua::Stem:En 172{ 173 Lingua::Stem::En::stem_caching({ -level => 2}); 174 my $start_time = [gettimeofday]; 175 for (my $i = 0; $i < $loops; $i++) { 176 my ($result) = Lingua::Stem::En::stem({ -words => \@word_list }); 177 } 178 my $elapsed = tv_interval($start_time); 179 printf "Lingua::Stem::En, %6s word batches, cache level 2: %8s words/second\n", $n_words, int($n/$elapsed); 180} 181# Processed in one batch with caching, Lingua::Stem::En 182{ 183 Lingua::Stem::En::stem_caching({ -level => 2}); 184 my $start_time = [gettimeofday]; 185 my ($result) = Lingua::Stem::En::stem({ -words => \@big_word_list }); 186 my $elapsed = tv_interval($start_time); 187 printf "Lingua::Stem::En, one batch, cache level 2: %8s words/second\n", int($n/$elapsed); 188} 189