1 /** @file
2  * @brief Test stemming algorithms
3  */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2007,2008,2009,2012,2015 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <cstdlib>
27 
28 #include <string>
29 #include <fstream>
30 #include <iostream>
31 
32 #include <xapian.h>
33 #include "testsuite.h"
34 
35 using namespace std;
36 
/// Number of random characters generated by each of the random-input tests
/// (test_stemrandom() and test_stemjunk()).
static const int JUNKSIZE = 2 * 1048576;

/// Name of the language currently under test; assigned in main() before each
/// run of the test list.
static string language;

/// Stemmer for the language currently under test; reassigned in main() for
/// each language.
static Xapian::Stem stemmer;

/// Directory returned by test_driver::get_srcdir(); test_stemdict() builds the
/// paths of its data files relative to it.
static string srcdir;

/// Value passed to srand() by the random-input tests.  main() sets it to 42
/// unless a "seed" option was given on the command line.
static int seed;
46 
47 // run stemmers on random text
48 static void
test_stemrandom()49 test_stemrandom()
50 {
51     static const char wordchars[] =
52 	"abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz0123456789^\0";
53 
54     tout << "Stemming random text... (seed " << seed << ")" << endl;
55     srand(seed);
56 
57     string word;
58     int stemmed_size = 0;
59     for (int c = JUNKSIZE; c; --c) {
60 	char ch = wordchars[(rand() >> 8) % sizeof wordchars];
61 	if (ch) {
62 	    word += ch;
63 	    continue;
64 	}
65 	stemmed_size += stemmer(word).length();
66 	word.resize(0);
67     }
68     stemmed_size += stemmer(word).length();
69     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
70 	 << endl;
71 
72     if (stemmed_size > JUNKSIZE * 101 / 100) {
73 	FAIL_TEST("Stemmed data is significantly bigger than input: "
74 		  << stemmed_size << " vs. " << JUNKSIZE);
75     }
76     if (stemmed_size < JUNKSIZE / 2) {
77 	FAIL_TEST("Stemmed data is significantly smaller than input: "
78 		  << stemmed_size << " vs. " << JUNKSIZE);
79     }
80 }
81 
82 // run stemmers on random junk
83 static void
test_stemjunk()84 test_stemjunk()
85 {
86     tout << "Stemming random junk... (seed " << seed << ")" << endl;
87     srand(seed);
88 
89     string word;
90     int stemmed_size = 0;
91     for (int c = JUNKSIZE; c; --c) {
92 	char ch = char(rand() >> 8);
93 	if (ch) {
94 	    word += ch;
95 	    continue;
96 	}
97 	stemmed_size += stemmer(word).length();
98 	word.resize(0);
99     }
100     stemmed_size += stemmer(word).length();
101     tout << "Input size " << JUNKSIZE << ", stemmed size " << stemmed_size
102 	 << endl;
103 
104     if (stemmed_size > JUNKSIZE * 101 / 100) {
105 	FAIL_TEST("Stemmed data is significantly bigger than input ("
106 		  << stemmed_size << " vs. " << JUNKSIZE);
107     }
108     if (stemmed_size < JUNKSIZE / 2) {
109 	FAIL_TEST("Stemmed data is significantly smaller than input ("
110 		  << stemmed_size << " vs. " << JUNKSIZE);
111     }
112 }
113 
114 static void
test_stemdict()115 test_stemdict()
116 {
117     string dir = srcdir + "/../../xapian-data/stemming/";
118 
119     ifstream voc((dir + language + "/voc.txt").c_str());
120     if (!voc.is_open()) {
121 	SKIP_TEST(language << "/voc.txt not found");
122     }
123 
124     ifstream st((dir + language + "/output.txt").c_str());
125     if (!st.is_open()) {
126 	voc.close();
127 	FAIL_TEST(language << "/output.txt not found");
128     }
129 
130     tout << "Testing " << language << " with Snowball dictionary..." << endl;
131 
132     int pass = 1;
133     while (true) {
134 	string word, stem, expect;
135 	while (!voc.eof() && !st.eof()) {
136 	    getline(voc, word);
137 	    getline(st, expect);
138 
139 	    stem = stemmer(word);
140 
141 	    TEST_EQUAL(stem, expect);
142 	}
143 	voc.close();
144 	st.close();
145 
146 	if (pass == 2) break;
147 
148 	voc.open((dir + language + "/voc2.txt").c_str());
149 	if (!voc.is_open()) break;
150 
151 	st.open((dir + language + "/output2.txt").c_str());
152 	if (!st.is_open()) {
153 	    voc.close();
154 	    FAIL_TEST(language << "/output2.txt not found");
155 	}
156 	tout << "Testing " << language << " with supplemental dictionary..."
157 	     << endl;
158 	++pass;
159     }
160 }
161 
162 // ##################################################################
163 // # End of actual tests                                            #
164 // ##################################################################
165 
/// The list of tests to perform.
///
/// main() passes this array to test_driver::run() once for each language.
/// The final {0, 0} entry presumably marks the end of the list (test_desc is
/// declared outside this file - confirm against testsuite.h).
static const test_desc tests[] = {
    {"stemrandom",		test_stemrandom},
    {"stemjunk",		test_stemjunk},
    {"stemdict",		test_stemdict},
    {0, 0}
};
173 
main(int argc,char ** argv)174 int main(int argc, char **argv)
175 try {
176     string langs = Xapian::Stem::get_available_languages();
177     test_driver::add_command_line_option("languages", 'l', &langs);
178 
179     seed = 42;
180     string seed_str;
181     test_driver::add_command_line_option("seed", 's', &seed_str);
182 
183     test_driver::parse_command_line(argc, argv);
184     srcdir = test_driver::get_srcdir();
185     int result = 0;
186 
187     if (!seed_str.empty()) seed = atoi(seed_str.c_str());
188     cout << "The random seed is " << seed << endl;
189     cout << "Please report the seed when reporting a test failure." << endl;
190 
191     string::size_type b = 0;
192     while (b != langs.size()) {
193 	string::size_type a = b;
194 	while (b < langs.size() && langs[b] != ' ') ++b;
195 	language.assign(langs, a, b - a);
196 	while (b < langs.size() && langs[b] == ' ') ++b;
197 	cout << "Running tests with " << language << " stemmer..." << endl;
198 	stemmer = Xapian::Stem(language);
199 	result = max(result, test_driver::run(tests));
200     }
201     return result;
202 } catch (const char * e) {
203     cout << e << endl;
204     return 1;
205 }
206