1 /////////////////////////////////////////////////////////////////////////////
2 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
5 /////////////////////////////////////////////////////////////////////////////
6
7 #include "TestInc.h"
8 #include "BaseTokenStreamFixture.h"
9 #include "PersianAnalyzer.h"
10
11 using namespace Lucene;
12
/// Fixture name used by every TEST_F in this file; it is a plain alias of
/// BaseTokenStreamFixture, so the tests call that fixture's checkAnalyzesTo helper.
typedef BaseTokenStreamFixture PersianAnalyzerTest;
14
15 /// These tests show how the combination of tokenization (breaking on zero-width
16 /// non-joiner), normalization (such as treating arabic YEH and farsi YEH the
17 /// same), and stopwords creates a light-stemming effect for verbs.
18 ///
19 /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
20
21 /// active present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs1) {
    // Input bytes: a prefix written with Farsi yeh (U+06CC), then U+200C (ZWNJ) and the stem.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // Bytes of the single expected token (identical to the stem above).
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
28
29 /// active preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs2) {
    // Input and expected token are the same byte sequence: one word, no separators.
    const uint8_t inputUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
36
37 /// active imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs3) {
    // Input bytes: prefix with Farsi yeh (U+06CC), U+200C (ZWNJ), then the stem.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // Only the stem is listed as the expected token.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
44
45 /// active future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs4) {
    // Input bytes: two space-separated words; the row for the second word starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the second word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
54
55 /// active present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs5) {
    // Input bytes, one segment per row: word, space + prefix (Farsi yeh), ZWNJ + stem.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the final stem segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
64
65 /// active preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs6) {
    // Input bytes, one segment per row: word, space + prefix (Farsi yeh), ZWNJ + stem.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the final stem segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
74
75 /// active perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs7) {
    // Input bytes: participle, then U+200C (ZWNJ) followed by a second segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the first segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
84
85 /// active imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs8) {
    // Input bytes: three segments joined by U+200C (ZWNJ); the prefix uses Farsi yeh.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the middle segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
94
95 /// active pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs9) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
104
105 /// active imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs10) {
    // Input bytes: prefix (Farsi yeh), ZWNJ + participle, space + trailing word.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the participle segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
114
115 /// active preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs11) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
124
125 /// active imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs12) {
    // Input bytes: prefix (Farsi yeh), ZWNJ + participle, space + trailing word.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the participle segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
134
135 /// active pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs13) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
144
145 /// active imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs14) {
    // Input bytes: prefix (Farsi yeh), ZWNJ + participle, then two more space-separated words.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the participle segment.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
155
156 /// passive present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs15) {
    // Input bytes: participle, space + prefix (Farsi yeh), ZWNJ + final segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
165
166 /// passive preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs16) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
173
174 /// passive imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs17) {
    // Input bytes: participle, space + prefix (Farsi yeh), ZWNJ + final segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
183
184 /// passive perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs18) {
    // Input bytes: participle, space + second word, ZWNJ + final segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
193
194 /// passive imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs19) {
    // Input bytes: participle, space + prefix (Farsi yeh), then two ZWNJ-joined segments.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0xe2, 0x80, 0x8c, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
204
205 /// passive pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs20) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
214
215 /// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs21) {
    // Input bytes: participle, space + prefix (Farsi yeh), ZWNJ + segment, space + last word.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
225
226 /// passive future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs22) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
235
236 /// passive present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs23) {
    // Input bytes: word, space + participle, space + prefix (Farsi yeh), ZWNJ + final segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the participle (second row without its leading space).
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
246
247 /// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbs24) {
    // Input bytes: word, space + participle, space + prefix (Farsi yeh), ZWNJ + final segment.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the participle (second row without its leading space).
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
257
258 /// passive present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs25) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
267
268 /// passive preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs26) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
277
278 /// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs27) {
    // Input bytes: participle, space + prefix (Farsi yeh), ZWNJ + segment, space + last word.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
288
289 /// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs28) {
    // Input bytes: four space-separated words, one per row (rows 2-4 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
299
300 /// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs29) {
    // Input bytes: participle, space + prefix (Farsi yeh), ZWNJ + segment, then two more words.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xdb, 0x8c,
        0xe2, 0x80, 0x8c, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
310
311 /// active present subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbs30) {
    // Input and expected token are the same byte sequence: one word, no separators.
    const uint8_t inputUtf8[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
318
319 /// This test shows how the combination of tokenization and stopwords creates a
320 /// light-stemming effect for verbs.
321 ///
322 /// In this case, these forms are presented with alternative orthography, using
323 /// arabic yeh and whitespace. This yeh phenomenon is common for legacy text
324 /// due to some previous bugs in Microsoft Windows.
325 ///
326 /// These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
327
/// active present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective1) {
    // Input bytes: prefix written with Arabic yeh (U+064A), then a space and the stem.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the stem.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
335
336 /// active preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective2) {
    // Input and expected token are the same byte sequence: one word, no separators.
    const uint8_t inputUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
343
344 /// active imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective3) {
    // Input bytes: prefix with Arabic yeh (U+064A), then a space and the stem.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // Only the stem is listed as the expected token.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
351
352 /// active future indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective4) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the second word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
361
362 /// active present progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective5) {
    // Input bytes: word, space + prefix (Arabic yeh, U+064A), space + stem.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the final stem.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
371
372 /// active preterite progressive indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective6) {
    // Input bytes: word, space + prefix (Arabic yeh, U+064A), space + stem.
    const uint8_t inputUtf8[] = {
        0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf
    };
    // The expected token equals the final stem.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
381
382 /// active perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective7) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
391
392 /// active imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective8) {
    // Input bytes: prefix (Arabic yeh, U+064A), space + participle, space + last word.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
401
402 /// active pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective9) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
411
412 /// active imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective10) {
    // Input bytes: prefix (Arabic yeh, U+064A), space + participle, space + last word.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
421
422 /// active preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective11) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
431
432 /// active imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective12) {
    // Input bytes: prefix (Arabic yeh, U+064A), space + participle, space + last word.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
441
442 /// active pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective13) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
451
452 /// active imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective14) {
    // Input bytes: prefix (Arabic yeh, U+064A), then three more space-separated words.
    const uint8_t inputUtf8[] = {
        0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the participle (second row without its leading space).
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
462
463 /// passive present indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective15) {
    // Input bytes: participle, space + prefix (Arabic yeh, U+064A), space + last word.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
472
473 /// passive preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective16) {
    // Input bytes: two space-separated words; the second row starts with 0x20.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
480
481 /// passive imperfective preterite indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective17) {
    // Input bytes: participle, space + prefix (Arabic yeh, U+064A), space + last word.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
490
491 /// passive perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective18) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
500
501 /// passive imperfective perfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective19) {
    // Input bytes: participle, space + prefix (Arabic yeh, U+064A), then two more words.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa7, 0xd8, 0xb3, 0xd8, 0xaa
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
510
511 /// passive pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective20) {
    // Input bytes: three space-separated words, one per row (rows 2-3 start with 0x20).
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the first word.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
520
521 /// passive imperfective pluperfect indicative
TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective21) {
    // Input bytes: participle, space + prefix (Arabic yeh, U+064A), then two more words.
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd9, 0x85, 0xd9, 0x8a,
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf
    };
    // The expected token equals the leading participle.
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    const String text = UTF8_TO_STRING(inputUtf8);
    const String token = UTF8_TO_STRING(expectedUtf8);
    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    checkAnalyzesTo(analyzer, text, newCollection<String>(token));
}
530
531 /// passive future indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective22)532 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective22) {
533 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
534 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xae, 0xd9, 0x88,
535 0xd8, 0xa7, 0xd9, 0x87, 0xd8, 0xaf, 0x20, 0xd8, 0xb4, 0xd8, 0xaf
536 };
537 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
538 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
539 }
540
541 /// passive present progressive indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective23)542 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective23) {
543 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
544 const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb1, 0xd8, 0xaf, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1,
545 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd9, 0x88, 0xd8,
546 0xaf
547 };
548 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
549 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
550 }
551
552 /// passive preterite progressive indicative
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective24)553 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective24) {
554 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
555 const uint8_t first[] = {0xd8, 0xaf, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaa, 0x20, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1,
556 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a, 0x20, 0xd8, 0xb4, 0xd8, 0xaf
557 };
558 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
559 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
560 }
561
562 /// passive present subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective25)563 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective25) {
564 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
565 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd9, 0x88,
566 0xd8, 0xaf
567 };
568 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
569 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
570 }
571
572 /// passive preterite subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective26)573 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective26) {
574 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
575 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
576 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
577 };
578 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
579 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
580 }
581
582 /// passive imperfective preterite subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective27)583 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective27) {
584 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
585 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
586 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8,
587 0xaf
588 };
589 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
590 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
591 }
592
593 /// passive pluperfect subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective28)594 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective28) {
595 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
596 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xb4, 0xd8, 0xaf,
597 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd8,
598 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
599 };
600 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
601 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
602 }
603
604 /// passive imperfective pluperfect subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective29)605 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective29) {
606 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
607 const uint8_t first[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd9, 0x85, 0xd9, 0x8a,
608 0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87, 0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9,
609 0x87, 0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf
610 };
611 const uint8_t second[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};
612 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
613 }
614
615 /// active present subjunctive
TEST_F(PersianAnalyzerTest,testBehaviorVerbsDefective30)616 TEST_F(PersianAnalyzerTest, testBehaviorVerbsDefective30) {
617 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
618 const uint8_t first[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
619 const uint8_t second[] = {0xd8, 0xa8, 0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf};
620 checkAnalyzesTo(a, UTF8_TO_STRING(first), newCollection<String>(UTF8_TO_STRING(second)));
621 }
622
623 /// These tests show how the combination of tokenization (breaking on zero-width
624 /// non-joiner or space) and stopwords creates a light-stemming effect for
625 /// nouns, removing the plural -ha.
626
/// Noun followed by a space (0x20) and a two-letter suffix word: only the noun
/// itself is expected to be returned.
TEST_F(PersianAnalyzerTest, testBehaviorNouns1) {
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, // noun (expected token)
        0x20,                               // ASCII space separator
        0xd9, 0x87, 0xd8, 0xa7              // trailing suffix word
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String inputText = UTF8_TO_STRING(inputUtf8);
    Collection<String> expectedTokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesTo(analyzer, inputText, expectedTokens);
}
633
/// Same noun and suffix as the space-separated case, but joined by U+200C
/// (zero-width non-joiner, UTF-8 e2 80 8c): only the noun is expected back.
TEST_F(PersianAnalyzerTest, testBehaviorNouns2) {
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, // noun (expected token)
        0xe2, 0x80, 0x8c,                   // U+200C zero-width non-joiner
        0xd9, 0x87, 0xd8, 0xa7              // trailing suffix word
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String inputText = UTF8_TO_STRING(inputUtf8);
    Collection<String> expectedTokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesTo(analyzer, inputText, expectedTokens);
}
640
641 /// Test showing that non-Persian text is treated very much like SimpleAnalyzer (lowercased, etc)
TEST_F(PersianAnalyzerTest,testBehaviorNonPersian)642 TEST_F(PersianAnalyzerTest, testBehaviorNonPersian) {
643 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
644 checkAnalyzesTo(a, L"English test.", newCollection<String>(L"english", L"test"));
645 }
646
/// Five-word verb phrase checked through checkAnalyzesToReuse instead of
/// checkAnalyzesTo; only the first word (identical to expectedUtf8) is
/// expected to be returned.
TEST_F(PersianAnalyzerTest, testReusableTokenStream1) {
    const uint8_t inputUtf8[] = {
        0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87, // word 1 (expected token)
        0x20, 0xd9, 0x85, 0xd9, 0x8a,                               // space + word 2
        0x20, 0xd8, 0xb4, 0xd8, 0xaf, 0xd9, 0x87,                   // space + word 3
        0x20, 0xd8, 0xa8, 0xd9, 0x88, 0xd8, 0xaf, 0xd9, 0x87,       // space + word 4
        0x20, 0xd8, 0xa8, 0xd8, 0xa7, 0xd8, 0xb4, 0xd8, 0xaf        // space + word 5
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xae, 0xd9, 0x88, 0xd8, 0xb1, 0xd8, 0xaf, 0xd9, 0x87};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String inputText = UTF8_TO_STRING(inputUtf8);
    Collection<String> expectedTokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesToReuse(analyzer, inputText, expectedTokens);
}
656
/// Noun + U+200C (zero-width non-joiner) + suffix, checked through
/// checkAnalyzesToReuse; only the noun is expected to be returned.
TEST_F(PersianAnalyzerTest, testReusableTokenStream2) {
    const uint8_t inputUtf8[] = {
        0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf, // noun (expected token)
        0xe2, 0x80, 0x8c,                   // U+200C zero-width non-joiner
        0xd9, 0x87, 0xd8, 0xa7              // trailing suffix word
    };
    const uint8_t expectedUtf8[] = {0xd8, 0xa8, 0xd8, 0xb1, 0xda, 0xaf};

    PersianAnalyzerPtr analyzer = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT);
    const String inputText = UTF8_TO_STRING(inputUtf8);
    Collection<String> expectedTokens = newCollection<String>(UTF8_TO_STRING(expectedUtf8));
    checkAnalyzesToReuse(analyzer, inputText, expectedTokens);
}
663
664 /// Test that custom stopwords work, and are not case-sensitive.
TEST_F(PersianAnalyzerTest,testCustomStopwords)665 TEST_F(PersianAnalyzerTest, testCustomStopwords) {
666 Collection<String> stopWords = newCollection<String>(L"the", L"and", L"a");
667 PersianAnalyzerPtr a = newLucene<PersianAnalyzer>(LuceneVersion::LUCENE_CURRENT, HashSet<String>::newInstance(stopWords.begin(), stopWords.end()));
668 checkAnalyzesTo(a, L"The quick brown fox.", newCollection<String>(L"quick", L"brown", L"fox"));
669 }
670